use crate::{
    arch::asm,
    core_arch::{simd::*, simd_llvm::*, x86::*},
    mem::{self, transmute},
    ptr,
};

// x86-32 wants to use a 32-bit address size, but asm! defaults to using the full
// register name (e.g. rax). We have to explicitly override the placeholder to
// use the 32-bit register name in that case.

/// Builds an `asm!` template string by appending a memory operand to
/// `$mnemonic`: the result is `$mnemonic`, followed by `, [{p:e}]`.
///
/// This is the 32-bit-pointer variant; the `:e` modifier makes the `p`
/// placeholder expand to the 32-bit register name.
#[cfg(target_pointer_width = "32")]
macro_rules! vpl {
    ($mnemonic:expr) => {
        concat!($mnemonic, ", [{p:e}]")
    };
}

/// Builds an `asm!` template string by appending a memory operand to
/// `$mnemonic`: the result is `$mnemonic`, followed by `, [{p}]`.
///
/// This is the 64-bit-pointer variant; the `p` placeholder is used without
/// any register-size modifier.
#[cfg(target_pointer_width = "64")]
macro_rules! vpl {
    ($mnemonic:expr) => {
        concat!($mnemonic, ", [{p}]")
    };
}
/// Builds an `asm!` template string with the memory operand in the middle:
/// the result is `$head`, then ` [{p:e}]`, then `$tail`.
///
/// This is the 32-bit-pointer variant; the `:e` modifier makes the `p`
/// placeholder expand to the 32-bit register name.
#[cfg(target_pointer_width = "32")]
macro_rules! vps {
    ($head:expr, $tail:expr) => {
        concat!($head, " [{p:e}]", $tail)
    };
}

/// Builds an `asm!` template string with the memory operand in the middle:
/// the result is `$head`, then ` [{p}]`, then `$tail`.
///
/// This is the 64-bit-pointer variant; the `p` placeholder is used without
/// any register-size modifier.
#[cfg(target_pointer_width = "64")]
macro_rules! vps {
    ($head:expr, $tail:expr) => {
        concat!($head, " [{p}]", $tail)
    };
}

pub(crate) use {vpl, vps};

#[cfg(test)]
use stdarch_test::assert_instr;

/// Returns the lane-wise absolute value of the packed 32-bit integers in `a`.
///
/// Lanes that compare greater than zero are kept unchanged; every other lane
/// is replaced by `0 - lane`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_abs_epi32&expand=39)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpabsd))]
pub unsafe fn _mm512_abs_epi32(a: __m512i) -> __m512i {
    let lanes = a.as_i32x16();
    // all-0 is a properly initialized i32x16
    let zeros: i32x16 = mem::zeroed();
    let is_positive: i32x16 = simd_gt(lanes, zeros);
    let negated = simd_sub(zeros, lanes);
    transmute(simd_select(is_positive, lanes, negated))
}

/// Takes the absolute value of each packed 32-bit integer in `a` under
/// writemask `k`: lanes whose mask bit is set receive the absolute value,
/// the remaining lanes are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_abs_epi32&expand=40)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpabsd))]
pub unsafe fn _mm512_mask_abs_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
    let fallback = src.as_i32x16();
    let magnitude = _mm512_abs_epi32(a).as_i32x16();
    transmute(simd_select_bitmask(k, magnitude, fallback))
}

/// Takes the absolute value of each packed 32-bit integer in `a` under
/// zeromask `k`: lanes whose mask bit is set receive the absolute value,
/// the remaining lanes are set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_abs_epi32&expand=41)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpabsd))]
pub unsafe fn _mm512_maskz_abs_epi32(k: __mmask16, a: __m512i) -> __m512i {
    let cleared = _mm512_setzero_si512().as_i32x16();
    let magnitude = _mm512_abs_epi32(a).as_i32x16();
    transmute(simd_select_bitmask(k, magnitude, cleared))
}

/// Takes the absolute value of each packed signed 32-bit integer in `a`
/// under writemask `k`: lanes whose mask bit is set receive the absolute
/// value, the remaining lanes are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_abs_epi32&expand=37)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpabsd))]
pub unsafe fn _mm256_mask_abs_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
    let fallback = src.as_i32x8();
    let magnitude = _mm256_abs_epi32(a).as_i32x8();
    transmute(simd_select_bitmask(k, magnitude, fallback))
}

/// Takes the absolute value of each packed signed 32-bit integer in `a`
/// under zeromask `k`: lanes whose mask bit is set receive the absolute
/// value, the remaining lanes are set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_abs_epi32&expand=38)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpabsd))]
pub unsafe fn _mm256_maskz_abs_epi32(k: __mmask8, a: __m256i) -> __m256i {
    let cleared = _mm256_setzero_si256().as_i32x8();
    let magnitude = _mm256_abs_epi32(a).as_i32x8();
    transmute(simd_select_bitmask(k, magnitude, cleared))
}

/// Takes the absolute value of each packed signed 32-bit integer in `a`
/// under writemask `k`: lanes whose mask bit is set receive the absolute
/// value, the remaining lanes are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_abs_epi32&expand=34)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpabsd))]
pub unsafe fn _mm_mask_abs_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let fallback = src.as_i32x4();
    let magnitude = _mm_abs_epi32(a).as_i32x4();
    transmute(simd_select_bitmask(k, magnitude, fallback))
}

/// Takes the absolute value of each packed signed 32-bit integer in `a`
/// under zeromask `k`: lanes whose mask bit is set receive the absolute
/// value, the remaining lanes are set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_abs_epi32&expand=35)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpabsd))]
pub unsafe fn _mm_maskz_abs_epi32(k: __mmask8, a: __m128i) -> __m128i {
    let cleared = _mm_setzero_si128().as_i32x4();
    let magnitude = _mm_abs_epi32(a).as_i32x4();
    transmute(simd_select_bitmask(k, magnitude, cleared))
}

/// Returns the lane-wise absolute value of the packed signed 64-bit integers
/// in `a`.
///
/// Lanes that compare greater than zero are kept unchanged; every other lane
/// is replaced by `0 - lane`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_abs_epi64&expand=48)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpabsq))]
pub unsafe fn _mm512_abs_epi64(a: __m512i) -> __m512i {
    let lanes = a.as_i64x8();
    // all-0 is a properly initialized i64x8
    let zeros: i64x8 = mem::zeroed();
    let is_positive: i64x8 = simd_gt(lanes, zeros);
    let negated = simd_sub(zeros, lanes);
    transmute(simd_select(is_positive, lanes, negated))
}

/// Takes the absolute value of each packed signed 64-bit integer in `a`
/// under writemask `k`: lanes whose mask bit is set receive the absolute
/// value, the remaining lanes are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_abs_epi64&expand=49)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpabsq))]
pub unsafe fn _mm512_mask_abs_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
    let fallback = src.as_i64x8();
    let magnitude = _mm512_abs_epi64(a).as_i64x8();
    transmute(simd_select_bitmask(k, magnitude, fallback))
}

/// Takes the absolute value of each packed signed 64-bit integer in `a`
/// under zeromask `k`: lanes whose mask bit is set receive the absolute
/// value, the remaining lanes are set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_abs_epi64&expand=50)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpabsq))]
pub unsafe fn _mm512_maskz_abs_epi64(k: __mmask8, a: __m512i) -> __m512i {
    let cleared = _mm512_setzero_si512().as_i64x8();
    let magnitude = _mm512_abs_epi64(a).as_i64x8();
    transmute(simd_select_bitmask(k, magnitude, cleared))
}

/// Returns the lane-wise absolute value of the packed signed 64-bit integers
/// in `a`.
///
/// Lanes that compare greater than zero are kept unchanged; every other lane
/// is replaced by `0 - lane`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_abs_epi64&expand=45)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpabsq))]
pub unsafe fn _mm256_abs_epi64(a: __m256i) -> __m256i {
    let lanes = a.as_i64x4();
    // all-0 is a properly initialized i64x4
    let zeros: i64x4 = mem::zeroed();
    let is_positive: i64x4 = simd_gt(lanes, zeros);
    let negated = simd_sub(zeros, lanes);
    transmute(simd_select(is_positive, lanes, negated))
}

/// Takes the absolute value of each packed signed 64-bit integer in `a`
/// under writemask `k`: lanes whose mask bit is set receive the absolute
/// value, the remaining lanes are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_abs_epi64&expand=46)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpabsq))]
pub unsafe fn _mm256_mask_abs_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
    let fallback = src.as_i64x4();
    let magnitude = _mm256_abs_epi64(a).as_i64x4();
    transmute(simd_select_bitmask(k, magnitude, fallback))
}

/// Takes the absolute value of each packed signed 64-bit integer in `a`
/// under zeromask `k`: lanes whose mask bit is set receive the absolute
/// value, the remaining lanes are set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_abs_epi64&expand=47)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpabsq))]
pub unsafe fn _mm256_maskz_abs_epi64(k: __mmask8, a: __m256i) -> __m256i {
    let abs = _mm256_abs_epi64(a).as_i64x4();
    let zero = _mm256_setzero_si256().as_i64x4();
    // Mask bit set -> absolute value; mask bit clear -> zero.
    transmute(simd_select_bitmask(k, abs, zero))
}

/// Computes the absolute value of every packed single-precision (32-bit)
/// floating-point element in `v2`.
///
/// The input is reinterpreted as integer lanes and ANDed with `0x7FFFFFFF`,
/// which clears the top bit of each 32-bit lane.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_abs_ps&expand=65)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpandq))]
pub unsafe fn _mm512_abs_ps(v2: __m512) -> __m512 {
    let raw_bits = transmute::<f32x16, __m512i>(v2.as_f32x16());
    let keep_all_but_top_bit = _mm512_set1_epi32(0x7FFFFFFF); // from LLVM code
    transmute(_mm512_and_epi32(keep_all_but_top_bit, raw_bits))
}

/// Computes the absolute value of every packed single-precision (32-bit)
/// floating-point element in `v2` under writemask `k`: lanes whose mask bit
/// is set receive the absolute value, the remaining lanes are copied from
/// `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_abs_ps&expand=66)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpandd))]
pub unsafe fn _mm512_mask_abs_ps(src: __m512, k: __mmask16, v2: __m512) -> __m512 {
    let fallback = src.as_f32x16();
    let magnitude = _mm512_abs_ps(v2).as_f32x16();
    transmute(simd_select_bitmask(k, magnitude, fallback))
}

/// Computes the absolute value of every packed double-precision (64-bit)
/// floating-point element in `v2`.
///
/// The input is reinterpreted as integer lanes and ANDed with
/// `0x7FFFFFFFFFFFFFFF`, which clears the top bit of each 64-bit lane.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_abs_pd&expand=60)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpandq))]
pub unsafe fn _mm512_abs_pd(v2: __m512d) -> __m512d {
    let raw_bits = transmute::<f64x8, __m512i>(v2.as_f64x8());
    let keep_all_but_top_bit = _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF); // from LLVM code
    transmute(_mm512_and_epi64(keep_all_but_top_bit, raw_bits))
}

/// Computes the absolute value of every packed double-precision (64-bit)
/// floating-point element in `v2` under writemask `k`: lanes whose mask bit
/// is set receive the absolute value, the remaining lanes are copied from
/// `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_abs_pd&expand=61)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpandq))]
pub unsafe fn _mm512_mask_abs_pd(src: __m512d, k: __mmask8, v2: __m512d) -> __m512d {
    let fallback = src.as_f64x8();
    let magnitude = _mm512_abs_pd(v2).as_f64x8();
    transmute(simd_select_bitmask(k, magnitude, fallback))
}

/// Blends packed 32-bit integers under writemask `k`: each lane is taken
/// from `a` when its mask bit is set and from `src` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_epi32&expand=3801)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovdqa32))]
pub unsafe fn _mm512_mask_mov_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
    let fallback = src.as_i32x16();
    let incoming = a.as_i32x16();
    transmute(simd_select_bitmask(k, incoming, fallback))
}

/// Copies packed 32-bit integers from `a` under zeromask `k`: each lane is
/// taken from `a` when its mask bit is set and is zero otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_epi32&expand=3802)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovdqa32))]
pub unsafe fn _mm512_maskz_mov_epi32(k: __mmask16, a: __m512i) -> __m512i {
    let cleared = _mm512_setzero_si512().as_i32x16();
    let incoming = a.as_i32x16();
    transmute(simd_select_bitmask(k, incoming, cleared))
}

/// Blends packed 32-bit integers under writemask `k`: each lane is taken
/// from `a` when its mask bit is set and from `src` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mov_epi32&expand=3799)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovdqa32))]
pub unsafe fn _mm256_mask_mov_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
    let fallback = src.as_i32x8();
    let incoming = a.as_i32x8();
    transmute(simd_select_bitmask(k, incoming, fallback))
}

/// Copies packed 32-bit integers from `a` under zeromask `k`: each lane is
/// taken from `a` when its mask bit is set and is zero otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mov_epi32&expand=3800)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovdqa32))]
pub unsafe fn _mm256_maskz_mov_epi32(k: __mmask8, a: __m256i) -> __m256i {
    let cleared = _mm256_setzero_si256().as_i32x8();
    let incoming = a.as_i32x8();
    transmute(simd_select_bitmask(k, incoming, cleared))
}

/// Blends packed 32-bit integers under writemask `k`: each lane is taken
/// from `a` when its mask bit is set and from `src` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mov_epi32&expand=3797)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovdqa32))]
pub unsafe fn _mm_mask_mov_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let fallback = src.as_i32x4();
    let incoming = a.as_i32x4();
    transmute(simd_select_bitmask(k, incoming, fallback))
}

/// Copies packed 32-bit integers from `a` under zeromask `k`: each lane is
/// taken from `a` when its mask bit is set and is zero otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mov_epi32&expand=3798)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovdqa32))]
pub unsafe fn _mm_maskz_mov_epi32(k: __mmask8, a: __m128i) -> __m128i {
    let cleared = _mm_setzero_si128().as_i32x4();
    let incoming = a.as_i32x4();
    transmute(simd_select_bitmask(k, incoming, cleared))
}

/// Blends packed 64-bit integers under writemask `k`: each lane is taken
/// from `a` when its mask bit is set and from `src` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_epi64&expand=3807)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovdqa64))]
pub unsafe fn _mm512_mask_mov_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
    let fallback = src.as_i64x8();
    let incoming = a.as_i64x8();
    transmute(simd_select_bitmask(k, incoming, fallback))
}

/// Copies packed 64-bit integers from `a` under zeromask `k`: each lane is
/// taken from `a` when its mask bit is set and is zero otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_epi64&expand=3808)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovdqa64))]
pub unsafe fn _mm512_maskz_mov_epi64(k: __mmask8, a: __m512i) -> __m512i {
    let cleared = _mm512_setzero_si512().as_i64x8();
    let incoming = a.as_i64x8();
    transmute(simd_select_bitmask(k, incoming, cleared))
}

/// Blends packed 64-bit integers under writemask `k`: each lane is taken
/// from `a` when its mask bit is set and from `src` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mov_epi64&expand=3805)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovdqa64))]
pub unsafe fn _mm256_mask_mov_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
    let fallback = src.as_i64x4();
    let incoming = a.as_i64x4();
    transmute(simd_select_bitmask(k, incoming, fallback))
}

/// Copies packed 64-bit integers from `a` under zeromask `k`: each lane is
/// taken from `a` when its mask bit is set and is zero otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mov_epi64&expand=3806)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovdqa64))]
pub unsafe fn _mm256_maskz_mov_epi64(k: __mmask8, a: __m256i) -> __m256i {
    let cleared = _mm256_setzero_si256().as_i64x4();
    let incoming = a.as_i64x4();
    transmute(simd_select_bitmask(k, incoming, cleared))
}

/// Blends packed 64-bit integers under writemask `k`: each lane is taken
/// from `a` when its mask bit is set and from `src` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mov_epi64&expand=3803)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovdqa64))]
pub unsafe fn _mm_mask_mov_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let fallback = src.as_i64x2();
    let incoming = a.as_i64x2();
    transmute(simd_select_bitmask(k, incoming, fallback))
}

/// Copies packed 64-bit integers from `a` under zeromask `k`: each lane is
/// taken from `a` when its mask bit is set and is zero otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mov_epi64&expand=3804)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovdqa64))]
pub unsafe fn _mm_maskz_mov_epi64(k: __mmask8, a: __m128i) -> __m128i {
    let cleared = _mm_setzero_si128().as_i64x2();
    let incoming = a.as_i64x2();
    transmute(simd_select_bitmask(k, incoming, cleared))
}

/// Blends packed single-precision (32-bit) floating-point elements under
/// writemask `k`: each lane is taken from `a` when its mask bit is set and
/// from `src` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_ps&expand=3825)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovaps))]
pub unsafe fn _mm512_mask_mov_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
    let fallback = src.as_f32x16();
    let incoming = a.as_f32x16();
    transmute(simd_select_bitmask(k, incoming, fallback))
}

/// Copies packed single-precision (32-bit) floating-point elements from `a`
/// under zeromask `k`: each lane is taken from `a` when its mask bit is set
/// and is zero otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_ps&expand=3826)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovaps))]
pub unsafe fn _mm512_maskz_mov_ps(k: __mmask16, a: __m512) -> __m512 {
    let cleared = _mm512_setzero_ps().as_f32x16();
    let incoming = a.as_f32x16();
    transmute(simd_select_bitmask(k, incoming, cleared))
}

/// Blends packed single-precision (32-bit) floating-point elements under
/// writemask `k`: each lane is taken from `a` when its mask bit is set and
/// from `src` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mov_ps&expand=3823)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovaps))]
pub unsafe fn _mm256_mask_mov_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 {
    let fallback = src.as_f32x8();
    let incoming = a.as_f32x8();
    transmute(simd_select_bitmask(k, incoming, fallback))
}

/// Copies packed single-precision (32-bit) floating-point elements from `a`
/// under zeromask `k`: each lane is taken from `a` when its mask bit is set
/// and is zero otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mov_ps&expand=3824)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovaps))]
pub unsafe fn _mm256_maskz_mov_ps(k: __mmask8, a: __m256) -> __m256 {
    let cleared = _mm256_setzero_ps().as_f32x8();
    let incoming = a.as_f32x8();
    transmute(simd_select_bitmask(k, incoming, cleared))
}

/// Blends packed single-precision (32-bit) floating-point elements under
/// writemask `k`: each lane is taken from `a` when its mask bit is set and
/// from `src` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mov_ps&expand=3821)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovaps))]
pub unsafe fn _mm_mask_mov_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 {
    let fallback = src.as_f32x4();
    let incoming = a.as_f32x4();
    transmute(simd_select_bitmask(k, incoming, fallback))
}

/// Copies packed single-precision (32-bit) floating-point elements from `a`
/// under zeromask `k`: each lane is taken from `a` when its mask bit is set
/// and is zero otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mov_ps&expand=3822)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovaps))]
pub unsafe fn _mm_maskz_mov_ps(k: __mmask8, a: __m128) -> __m128 {
    let cleared = _mm_setzero_ps().as_f32x4();
    let incoming = a.as_f32x4();
    transmute(simd_select_bitmask(k, incoming, cleared))
}

/// Blends packed double-precision (64-bit) floating-point elements under
/// writemask `k`: each lane is taken from `a` when its mask bit is set and
/// from `src` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mov_pd&expand=3819)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovapd))]
pub unsafe fn _mm512_mask_mov_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
    let fallback = src.as_f64x8();
    let incoming = a.as_f64x8();
    transmute(simd_select_bitmask(k, incoming, fallback))
}

/// Copies packed double-precision (64-bit) floating-point elements from `a`
/// under zeromask `k`: each lane is taken from `a` when its mask bit is set
/// and is zero otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mov_pd&expand=3820)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovapd))]
pub unsafe fn _mm512_maskz_mov_pd(k: __mmask8, a: __m512d) -> __m512d {
    let cleared = _mm512_setzero_pd().as_f64x8();
    let incoming = a.as_f64x8();
    transmute(simd_select_bitmask(k, incoming, cleared))
}

/// Blends packed double-precision (64-bit) floating-point elements under
/// writemask `k`: each lane is taken from `a` when its mask bit is set and
/// from `src` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mov_pd&expand=3817)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovapd))]
pub unsafe fn _mm256_mask_mov_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d {
    let fallback = src.as_f64x4();
    let incoming = a.as_f64x4();
    transmute(simd_select_bitmask(k, incoming, fallback))
}

/// Copies packed double-precision (64-bit) floating-point elements from `a`
/// under zeromask `k`: each lane is taken from `a` when its mask bit is set
/// and is zero otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mov_pd&expand=3818)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovapd))]
pub unsafe fn _mm256_maskz_mov_pd(k: __mmask8, a: __m256d) -> __m256d {
    let cleared = _mm256_setzero_pd().as_f64x4();
    let incoming = a.as_f64x4();
    transmute(simd_select_bitmask(k, incoming, cleared))
}

/// Blends packed double-precision (64-bit) floating-point elements under
/// writemask `k`: each lane is taken from `a` when its mask bit is set and
/// from `src` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mov_pd&expand=3815)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovapd))]
pub unsafe fn _mm_mask_mov_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d {
    let fallback = src.as_f64x2();
    let incoming = a.as_f64x2();
    transmute(simd_select_bitmask(k, incoming, fallback))
}

/// Copies packed double-precision (64-bit) floating-point elements from `a`
/// under zeromask `k`: each lane is taken from `a` when its mask bit is set
/// and is zero otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mov_pd&expand=3816)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovapd))]
pub unsafe fn _mm_maskz_mov_pd(k: __mmask8, a: __m128d) -> __m128d {
    let cleared = _mm_setzero_pd().as_f64x2();
    let incoming = a.as_f64x2();
    transmute(simd_select_bitmask(k, incoming, cleared))
}

/// Returns the lane-wise sum of the packed 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_epi32&expand=100)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpaddd))]
pub unsafe fn _mm512_add_epi32(a: __m512i, b: __m512i) -> __m512i {
    let lhs = a.as_i32x16();
    let rhs = b.as_i32x16();
    let total = simd_add(lhs, rhs);
    transmute(total)
}

/// Adds the packed 32-bit integers in `a` and `b` under writemask `k`:
/// lanes whose mask bit is set receive the sum, the remaining lanes are
/// copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_epi32&expand=101)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpaddd))]
pub unsafe fn _mm512_mask_add_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    let fallback = src.as_i32x16();
    let total = _mm512_add_epi32(a, b).as_i32x16();
    transmute(simd_select_bitmask(k, total, fallback))
}

/// Adds the packed 32-bit integers in `a` and `b` under zeromask `k`:
/// lanes whose mask bit is set receive the sum, the remaining lanes are
/// set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_epi32&expand=102)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpaddd))]
pub unsafe fn _mm512_maskz_add_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    let cleared = _mm512_setzero_si512().as_i32x16();
    let total = _mm512_add_epi32(a, b).as_i32x16();
    transmute(simd_select_bitmask(k, total, cleared))
}

/// Adds the packed 32-bit integers in `a` and `b` under writemask `k`:
/// lanes whose mask bit is set receive the sum, the remaining lanes are
/// copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_add_epi32&expand=98)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpaddd))]
pub unsafe fn _mm256_mask_add_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    let fallback = src.as_i32x8();
    let total = _mm256_add_epi32(a, b).as_i32x8();
    transmute(simd_select_bitmask(k, total, fallback))
}

/// Adds the packed 32-bit integers in `a` and `b` under zeromask `k`:
/// lanes whose mask bit is set receive the sum, the remaining lanes are
/// set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_add_epi32&expand=99)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpaddd))]
pub unsafe fn _mm256_maskz_add_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    let cleared = _mm256_setzero_si256().as_i32x8();
    let total = _mm256_add_epi32(a, b).as_i32x8();
    transmute(simd_select_bitmask(k, total, cleared))
}

/// Adds the packed 32-bit integers in `a` and `b` under writemask `k`:
/// lanes whose mask bit is set receive the sum, the remaining lanes are
/// copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_epi32&expand=95)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpaddd))]
pub unsafe fn _mm_mask_add_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let fallback = src.as_i32x4();
    let total = _mm_add_epi32(a, b).as_i32x4();
    transmute(simd_select_bitmask(k, total, fallback))
}

/// Adds the packed 32-bit integers in `a` and `b` under zeromask `k`:
/// lanes whose mask bit is set receive the sum, the remaining lanes are
/// set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_epi32&expand=96)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpaddd))]
pub unsafe fn _mm_maskz_add_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let cleared = _mm_setzero_si128().as_i32x4();
    let total = _mm_add_epi32(a, b).as_i32x4();
    transmute(simd_select_bitmask(k, total, cleared))
}

/// Returns the lane-wise sum of the packed 64-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_epi64&expand=109)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpaddq))]
pub unsafe fn _mm512_add_epi64(a: __m512i, b: __m512i) -> __m512i {
    let lhs = a.as_i64x8();
    let rhs = b.as_i64x8();
    let total = simd_add(lhs, rhs);
    transmute(total)
}

/// Adds the packed 64-bit integers in `a` and `b` under writemask `k`:
/// lanes whose mask bit is set receive the sum, the remaining lanes are
/// copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_epi64&expand=110)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpaddq))]
pub unsafe fn _mm512_mask_add_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    let fallback = src.as_i64x8();
    let total = _mm512_add_epi64(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, total, fallback))
}

/// Adds the packed 64-bit integers in `a` and `b` under zeromask `k`:
/// lanes whose mask bit is set receive the sum, the remaining lanes are
/// set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_epi64&expand=111)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpaddq))]
pub unsafe fn _mm512_maskz_add_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    let cleared = _mm512_setzero_si512().as_i64x8();
    let total = _mm512_add_epi64(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, total, cleared))
}

/// Adds the packed 64-bit integers in `a` and `b` under writemask `k`:
/// lanes whose mask bit is set receive the sum, the remaining lanes are
/// copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_add_epi64&expand=107)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpaddq))]
pub unsafe fn _mm256_mask_add_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    let fallback = src.as_i64x4();
    let total = _mm256_add_epi64(a, b).as_i64x4();
    transmute(simd_select_bitmask(k, total, fallback))
}

/// Adds the packed 64-bit integers in `a` and `b` under zeromask `k`:
/// lanes whose mask bit is set receive the sum, the remaining lanes are
/// set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_add_epi64&expand=108)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpaddq))]
pub unsafe fn _mm256_maskz_add_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    let cleared = _mm256_setzero_si256().as_i64x4();
    let total = _mm256_add_epi64(a, b).as_i64x4();
    transmute(simd_select_bitmask(k, total, cleared))
}

/// Adds packed 64-bit integers in `a` and `b`, storing the sums in `dst` under writemask `k`:
/// an element is copied from `src` when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_epi64&expand=104)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpaddq))]
pub unsafe fn _mm_mask_add_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let fallback = src.as_i64x2();
    let sum = _mm_add_epi64(a, b).as_i64x2();
    transmute(simd_select_bitmask(k, sum, fallback))
}

/// Adds packed 64-bit integers in `a` and `b`, storing the sums in `dst` under zeromask `k`:
/// an element is zeroed out when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_epi64&expand=105)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpaddq))]
pub unsafe fn _mm_maskz_add_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_i64x2();
    let sum = _mm_add_epi64(a, b).as_i64x2();
    transmute(simd_select_bitmask(k, sum, zeroed))
}

/// Adds packed single-precision (32-bit) floating-point elements in `a` and `b`, storing the
/// sums in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_ps&expand=139)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddps))]
pub unsafe fn _mm512_add_ps(a: __m512, b: __m512) -> __m512 {
    let lhs = a.as_f32x16();
    let rhs = b.as_f32x16();
    transmute(simd_add(lhs, rhs))
}

/// Adds packed single-precision (32-bit) floating-point elements in `a` and `b`, storing the
/// sums in `dst` under writemask `k`: an element is copied from `src` when its mask bit is not
/// set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_ps&expand=140)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddps))]
pub unsafe fn _mm512_mask_add_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let fallback = src.as_f32x16();
    let sum = _mm512_add_ps(a, b).as_f32x16();
    transmute(simd_select_bitmask(k, sum, fallback))
}

/// Adds packed single-precision (32-bit) floating-point elements in `a` and `b`, storing the
/// sums in `dst` under zeromask `k`: an element is zeroed out when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_ps&expand=141)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddps))]
pub unsafe fn _mm512_maskz_add_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let zeroed = _mm512_setzero_ps().as_f32x16();
    let sum = _mm512_add_ps(a, b).as_f32x16();
    transmute(simd_select_bitmask(k, sum, zeroed))
}

/// Adds packed single-precision (32-bit) floating-point elements in `a` and `b`, storing the
/// sums in `dst` under writemask `k`: an element is copied from `src` when its mask bit is not
/// set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_add_ps&expand=137)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vaddps))]
pub unsafe fn _mm256_mask_add_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 {
    let fallback = src.as_f32x8();
    let sum = _mm256_add_ps(a, b).as_f32x8();
    transmute(simd_select_bitmask(k, sum, fallback))
}

/// Adds packed single-precision (32-bit) floating-point elements in `a` and `b`, storing the
/// sums in `dst` under zeromask `k`: an element is zeroed out when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_add_ps&expand=138)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vaddps))]
pub unsafe fn _mm256_maskz_add_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
    let zeroed = _mm256_setzero_ps().as_f32x8();
    let sum = _mm256_add_ps(a, b).as_f32x8();
    transmute(simd_select_bitmask(k, sum, zeroed))
}

/// Adds packed single-precision (32-bit) floating-point elements in `a` and `b`, storing the
/// sums in `dst` under writemask `k`: an element is copied from `src` when its mask bit is not
/// set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_ps&expand=134)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vaddps))]
pub unsafe fn _mm_mask_add_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let fallback = src.as_f32x4();
    let sum = _mm_add_ps(a, b).as_f32x4();
    transmute(simd_select_bitmask(k, sum, fallback))
}

/// Adds packed single-precision (32-bit) floating-point elements in `a` and `b`, storing the
/// sums in `dst` under zeromask `k`: an element is zeroed out when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_ps&expand=135)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vaddps))]
pub unsafe fn _mm_maskz_add_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let zeroed = _mm_setzero_ps().as_f32x4();
    let sum = _mm_add_ps(a, b).as_f32x4();
    transmute(simd_select_bitmask(k, sum, zeroed))
}

/// Adds packed double-precision (64-bit) floating-point elements in `a` and `b`, storing the
/// sums in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_pd&expand=127)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddpd))]
pub unsafe fn _mm512_add_pd(a: __m512d, b: __m512d) -> __m512d {
    let lhs = a.as_f64x8();
    let rhs = b.as_f64x8();
    transmute(simd_add(lhs, rhs))
}

/// Adds packed double-precision (64-bit) floating-point elements in `a` and `b`, storing the
/// sums in `dst` under writemask `k`: an element is copied from `src` when its mask bit is not
/// set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_pd&expand=128)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddpd))]
pub unsafe fn _mm512_mask_add_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let fallback = src.as_f64x8();
    let sum = _mm512_add_pd(a, b).as_f64x8();
    transmute(simd_select_bitmask(k, sum, fallback))
}

/// Adds packed double-precision (64-bit) floating-point elements in `a` and `b`, storing the
/// sums in `dst` under zeromask `k`: an element is zeroed out when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_pd&expand=129)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddpd))]
pub unsafe fn _mm512_maskz_add_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let zeroed = _mm512_setzero_pd().as_f64x8();
    let sum = _mm512_add_pd(a, b).as_f64x8();
    transmute(simd_select_bitmask(k, sum, zeroed))
}

/// Adds packed double-precision (64-bit) floating-point elements in `a` and `b`, storing the
/// sums in `dst` under writemask `k`: an element is copied from `src` when its mask bit is not
/// set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_add_pd&expand=125)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vaddpd))]
pub unsafe fn _mm256_mask_add_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
    let fallback = src.as_f64x4();
    let sum = _mm256_add_pd(a, b).as_f64x4();
    transmute(simd_select_bitmask(k, sum, fallback))
}

/// Adds packed double-precision (64-bit) floating-point elements in `a` and `b`, storing the
/// sums in `dst` under zeromask `k`: an element is zeroed out when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_add_pd&expand=126)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vaddpd))]
pub unsafe fn _mm256_maskz_add_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
    let zeroed = _mm256_setzero_pd().as_f64x4();
    let sum = _mm256_add_pd(a, b).as_f64x4();
    transmute(simd_select_bitmask(k, sum, zeroed))
}

/// Adds packed double-precision (64-bit) floating-point elements in `a` and `b`, storing the
/// sums in `dst` under writemask `k`: an element is copied from `src` when its mask bit is not
/// set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_add_pd&expand=122)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vaddpd))]
pub unsafe fn _mm_mask_add_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let fallback = src.as_f64x2();
    let sum = _mm_add_pd(a, b).as_f64x2();
    transmute(simd_select_bitmask(k, sum, fallback))
}

/// Adds packed double-precision (64-bit) floating-point elements in `a` and `b`, storing the
/// sums in `dst` under zeromask `k`: an element is zeroed out when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_pd&expand=123)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vaddpd))]
pub unsafe fn _mm_maskz_add_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let zeroed = _mm_setzero_pd().as_f64x2();
    let sum = _mm_add_pd(a, b).as_f64x2();
    transmute(simd_select_bitmask(k, sum, zeroed))
}

/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`, storing the
/// differences in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_epi32&expand=5694)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsubd))]
pub unsafe fn _mm512_sub_epi32(a: __m512i, b: __m512i) -> __m512i {
    let minuend = a.as_i32x16();
    let subtrahend = b.as_i32x16();
    transmute(simd_sub(minuend, subtrahend))
}

/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`, storing the
/// differences in `dst` under writemask `k`: an element is copied from `src` when its mask bit
/// is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_epi32&expand=5692)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsubd))]
pub unsafe fn _mm512_mask_sub_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    let fallback = src.as_i32x16();
    let diff = _mm512_sub_epi32(a, b).as_i32x16();
    transmute(simd_select_bitmask(k, diff, fallback))
}

/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`, storing the
/// differences in `dst` under zeromask `k`: an element is zeroed out when its mask bit is not
/// set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_epi32&expand=5693)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsubd))]
pub unsafe fn _mm512_maskz_sub_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    let zeroed = _mm512_setzero_si512().as_i32x16();
    let diff = _mm512_sub_epi32(a, b).as_i32x16();
    transmute(simd_select_bitmask(k, diff, zeroed))
}

/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`, storing the
/// differences in `dst` under writemask `k`: an element is copied from `src` when its mask bit
/// is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sub_epi32&expand=5689)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsubd))]
pub unsafe fn _mm256_mask_sub_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    let fallback = src.as_i32x8();
    let diff = _mm256_sub_epi32(a, b).as_i32x8();
    transmute(simd_select_bitmask(k, diff, fallback))
}

/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`, storing the
/// differences in `dst` under zeromask `k`: an element is zeroed out when its mask bit is not
/// set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sub_epi32&expand=5690)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsubd))]
pub unsafe fn _mm256_maskz_sub_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    let zeroed = _mm256_setzero_si256().as_i32x8();
    let diff = _mm256_sub_epi32(a, b).as_i32x8();
    transmute(simd_select_bitmask(k, diff, zeroed))
}

/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`, storing the
/// differences in `dst` under writemask `k`: an element is copied from `src` when its mask bit
/// is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_epi32&expand=5686)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsubd))]
pub unsafe fn _mm_mask_sub_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let fallback = src.as_i32x4();
    let diff = _mm_sub_epi32(a, b).as_i32x4();
    transmute(simd_select_bitmask(k, diff, fallback))
}

/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`, storing the
/// differences in `dst` under zeromask `k`: an element is zeroed out when its mask bit is not
/// set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_epi32&expand=5687)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsubd))]
pub unsafe fn _mm_maskz_sub_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_i32x4();
    let diff = _mm_sub_epi32(a, b).as_i32x4();
    transmute(simd_select_bitmask(k, diff, zeroed))
}

/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`, storing the
/// differences in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_epi64&expand=5703)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsubq))]
pub unsafe fn _mm512_sub_epi64(a: __m512i, b: __m512i) -> __m512i {
    let minuend = a.as_i64x8();
    let subtrahend = b.as_i64x8();
    transmute(simd_sub(minuend, subtrahend))
}

/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`, storing the
/// differences in `dst` under writemask `k`: an element is copied from `src` when its mask bit
/// is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_epi64&expand=5701)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsubq))]
pub unsafe fn _mm512_mask_sub_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    let fallback = src.as_i64x8();
    let diff = _mm512_sub_epi64(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, diff, fallback))
}

/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`, storing the
/// differences in `dst` under zeromask `k`: an element is zeroed out when its mask bit is not
/// set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_epi64&expand=5702)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsubq))]
pub unsafe fn _mm512_maskz_sub_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    let zeroed = _mm512_setzero_si512().as_i64x8();
    let diff = _mm512_sub_epi64(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, diff, zeroed))
}

/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`, storing the
/// differences in `dst` under writemask `k`: an element is copied from `src` when its mask bit
/// is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sub_epi64&expand=5698)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsubq))]
pub unsafe fn _mm256_mask_sub_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    let fallback = src.as_i64x4();
    let diff = _mm256_sub_epi64(a, b).as_i64x4();
    transmute(simd_select_bitmask(k, diff, fallback))
}

/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`, storing the
/// differences in `dst` under zeromask `k`: an element is zeroed out when its mask bit is not
/// set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sub_epi64&expand=5699)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsubq))]
pub unsafe fn _mm256_maskz_sub_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    let zeroed = _mm256_setzero_si256().as_i64x4();
    let diff = _mm256_sub_epi64(a, b).as_i64x4();
    transmute(simd_select_bitmask(k, diff, zeroed))
}

/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`, storing the
/// differences in `dst` under writemask `k`: an element is copied from `src` when its mask bit
/// is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_epi64&expand=5695)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsubq))]
pub unsafe fn _mm_mask_sub_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let fallback = src.as_i64x2();
    let diff = _mm_sub_epi64(a, b).as_i64x2();
    transmute(simd_select_bitmask(k, diff, fallback))
}

/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`, storing the
/// differences in `dst` under zeromask `k`: an element is zeroed out when its mask bit is not
/// set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_epi64&expand=5696)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsubq))]
pub unsafe fn _mm_maskz_sub_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_i64x2();
    let diff = _mm_sub_epi64(a, b).as_i64x2();
    transmute(simd_select_bitmask(k, diff, zeroed))
}

/// Subtracts packed single-precision (32-bit) floating-point elements in `b` from packed
/// single-precision (32-bit) floating-point elements in `a`, storing the differences in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_ps&expand=5733)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubps))]
pub unsafe fn _mm512_sub_ps(a: __m512, b: __m512) -> __m512 {
    let minuend = a.as_f32x16();
    let subtrahend = b.as_f32x16();
    transmute(simd_sub(minuend, subtrahend))
}

/// Subtracts packed single-precision (32-bit) floating-point elements in `b` from packed
/// single-precision (32-bit) floating-point elements in `a`, storing the differences in `dst`
/// under writemask `k`: an element is copied from `src` when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_ps&expand=5731)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubps))]
pub unsafe fn _mm512_mask_sub_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let fallback = src.as_f32x16();
    let diff = _mm512_sub_ps(a, b).as_f32x16();
    transmute(simd_select_bitmask(k, diff, fallback))
}

/// Subtracts packed single-precision (32-bit) floating-point elements in `b` from packed
/// single-precision (32-bit) floating-point elements in `a`, storing the differences in `dst`
/// under zeromask `k`: an element is zeroed out when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_ps&expand=5732)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubps))]
pub unsafe fn _mm512_maskz_sub_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let zeroed = _mm512_setzero_ps().as_f32x16();
    let diff = _mm512_sub_ps(a, b).as_f32x16();
    transmute(simd_select_bitmask(k, diff, zeroed))
}

/// Subtracts packed single-precision (32-bit) floating-point elements in `b` from packed
/// single-precision (32-bit) floating-point elements in `a`, storing the differences in `dst`
/// under writemask `k`: an element is copied from `src` when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sub_ps&expand=5728)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vsubps))]
pub unsafe fn _mm256_mask_sub_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 {
    let fallback = src.as_f32x8();
    let diff = _mm256_sub_ps(a, b).as_f32x8();
    transmute(simd_select_bitmask(k, diff, fallback))
}

/// Subtracts packed single-precision (32-bit) floating-point elements in `b` from packed
/// single-precision (32-bit) floating-point elements in `a`, storing the differences in `dst`
/// under zeromask `k`: an element is zeroed out when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sub_ps&expand=5729)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vsubps))]
pub unsafe fn _mm256_maskz_sub_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
    let zeroed = _mm256_setzero_ps().as_f32x8();
    let diff = _mm256_sub_ps(a, b).as_f32x8();
    transmute(simd_select_bitmask(k, diff, zeroed))
}

/// Subtracts packed single-precision (32-bit) floating-point elements in `b` from packed
/// single-precision (32-bit) floating-point elements in `a`, storing the differences in `dst`
/// under writemask `k`: an element is copied from `src` when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_ps&expand=5725)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vsubps))]
pub unsafe fn _mm_mask_sub_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let fallback = src.as_f32x4();
    let diff = _mm_sub_ps(a, b).as_f32x4();
    transmute(simd_select_bitmask(k, diff, fallback))
}

/// Subtracts packed single-precision (32-bit) floating-point elements in `b` from packed
/// single-precision (32-bit) floating-point elements in `a`, storing the differences in `dst`
/// under zeromask `k`: an element is zeroed out when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_ps&expand=5726)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vsubps))]
pub unsafe fn _mm_maskz_sub_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let zeroed = _mm_setzero_ps().as_f32x4();
    let diff = _mm_sub_ps(a, b).as_f32x4();
    transmute(simd_select_bitmask(k, diff, zeroed))
}

/// Subtracts packed double-precision (64-bit) floating-point elements in `b` from packed
/// double-precision (64-bit) floating-point elements in `a`, storing the differences in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_pd&expand=5721)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubpd))]
pub unsafe fn _mm512_sub_pd(a: __m512d, b: __m512d) -> __m512d {
    let minuend = a.as_f64x8();
    let subtrahend = b.as_f64x8();
    transmute(simd_sub(minuend, subtrahend))
}

/// Subtracts packed double-precision (64-bit) floating-point elements in `b` from packed
/// double-precision (64-bit) floating-point elements in `a`, storing the differences in `dst`
/// under writemask `k`: an element is copied from `src` when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_pd&expand=5719)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubpd))]
pub unsafe fn _mm512_mask_sub_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let fallback = src.as_f64x8();
    let diff = _mm512_sub_pd(a, b).as_f64x8();
    transmute(simd_select_bitmask(k, diff, fallback))
}

/// Subtracts packed double-precision (64-bit) floating-point elements in `b` from packed
/// double-precision (64-bit) floating-point elements in `a`, storing the differences in `dst`
/// under zeromask `k`: an element is zeroed out when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_pd&expand=5720)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubpd))]
pub unsafe fn _mm512_maskz_sub_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let zeroed = _mm512_setzero_pd().as_f64x8();
    let diff = _mm512_sub_pd(a, b).as_f64x8();
    transmute(simd_select_bitmask(k, diff, zeroed))
}

/// Subtracts packed double-precision (64-bit) floating-point elements in `b` from packed
/// double-precision (64-bit) floating-point elements in `a`, storing the differences in `dst`
/// under writemask `k`: an element is copied from `src` when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sub_pd&expand=5716)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vsubpd))]
pub unsafe fn _mm256_mask_sub_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
    let fallback = src.as_f64x4();
    let diff = _mm256_sub_pd(a, b).as_f64x4();
    transmute(simd_select_bitmask(k, diff, fallback))
}

/// Subtracts packed double-precision (64-bit) floating-point elements in `b` from packed
/// double-precision (64-bit) floating-point elements in `a`, storing the differences in `dst`
/// under zeromask `k`: an element is zeroed out when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sub_pd&expand=5717)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vsubpd))]
pub unsafe fn _mm256_maskz_sub_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
    let zeroed = _mm256_setzero_pd().as_f64x4();
    let diff = _mm256_sub_pd(a, b).as_f64x4();
    transmute(simd_select_bitmask(k, diff, zeroed))
}

/// Subtracts packed double-precision (64-bit) floating-point elements in `b` from packed
/// double-precision (64-bit) floating-point elements in `a`, storing the differences in `dst`
/// under writemask `k`: an element is copied from `src` when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sub_pd&expand=5713)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vsubpd))]
pub unsafe fn _mm_mask_sub_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let fallback = src.as_f64x2();
    let diff = _mm_sub_pd(a, b).as_f64x2();
    transmute(simd_select_bitmask(k, diff, fallback))
}

/// Subtracts packed double-precision (64-bit) floating-point elements in `b` from packed
/// double-precision (64-bit) floating-point elements in `a`, storing the differences in `dst`
/// under zeromask `k`: an element is zeroed out when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sub_pd&expand=5714)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vsubpd))]
pub unsafe fn _mm_maskz_sub_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let zeroed = _mm_setzero_pd().as_f64x2();
    let diff = _mm_sub_pd(a, b).as_f64x2();
    transmute(simd_select_bitmask(k, diff, zeroed))
}

/// Multiplies the low signed 32-bit integers from each packed 64-bit element in `a` and `b`,
/// storing the signed 64-bit products in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_epi32&expand=3907)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmuldq))]
pub unsafe fn _mm512_mul_epi32(a: __m512i, b: __m512i) -> __m512i {
    let lhs = a.as_i32x16();
    let rhs = b.as_i32x16();
    transmute(vpmuldq(lhs, rhs))
}

/// Multiplies the low signed 32-bit integers from each packed 64-bit element in `a` and `b`,
/// storing the signed 64-bit products in `dst` under writemask `k`: an element is copied from
/// `src` when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_epi32&expand=3905)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmuldq))]
pub unsafe fn _mm512_mask_mul_epi32(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    let fallback = src.as_i64x8();
    let product = _mm512_mul_epi32(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, product, fallback))
}

/// Multiplies the low signed 32-bit integers from each packed 64-bit element in `a` and `b`,
/// storing the signed 64-bit products in `dst` under zeromask `k`: an element is zeroed out
/// when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_epi32&expand=3906)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmuldq))]
pub unsafe fn _mm512_maskz_mul_epi32(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    let zeroed = _mm512_setzero_si512().as_i64x8();
    let product = _mm512_mul_epi32(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, product, zeroed))
}

/// Multiplies the low signed 32-bit integers from each packed 64-bit element in `a` and `b`,
/// storing the signed 64-bit products in `dst` under writemask `k`: an element is copied from
/// `src` when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_epi32&expand=3902)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmuldq))]
pub unsafe fn _mm256_mask_mul_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    let fallback = src.as_i64x4();
    let product = _mm256_mul_epi32(a, b).as_i64x4();
    transmute(simd_select_bitmask(k, product, fallback))
}

/// Multiplies the low signed 32-bit integers from each packed 64-bit element in `a` and `b`,
/// storing the signed 64-bit products in `dst` under zeromask `k`: an element is zeroed out
/// when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_epi32&expand=3903)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmuldq))]
pub unsafe fn _mm256_maskz_mul_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    let zeroed = _mm256_setzero_si256().as_i64x4();
    let product = _mm256_mul_epi32(a, b).as_i64x4();
    transmute(simd_select_bitmask(k, product, zeroed))
}

/// Multiplies the low signed 32-bit integers from each packed 64-bit element in `a` and `b`,
/// storing the signed 64-bit products in `dst` under writemask `k`: an element is copied from
/// `src` when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_epi32&expand=3899)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmuldq))]
pub unsafe fn _mm_mask_mul_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let fallback = src.as_i64x2();
    let product = _mm_mul_epi32(a, b).as_i64x2();
    transmute(simd_select_bitmask(k, product, fallback))
}

/// Multiplies the low signed 32-bit integers from each packed 64-bit element in `a` and `b`,
/// storing the signed 64-bit products in `dst` under zeromask `k`: an element is zeroed out
/// when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_epi32&expand=3900)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmuldq))]
pub unsafe fn _mm_maskz_mul_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_i64x2();
    let product = _mm_mul_epi32(a, b).as_i64x2();
    transmute(simd_select_bitmask(k, product, zeroed))
}

/// Multiplies the packed 32-bit integers in `a` and `b`, and stores the low 32 bits of each intermediate 64-bit product in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mullo_epi&expand=4005)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmulld))]
pub unsafe fn _mm512_mullo_epi32(a: __m512i, b: __m512i) -> __m512i {
    // Reinterpret both operands as sixteen 32-bit lanes before the lane-wise multiply.
    let lhs = a.as_i32x16();
    let rhs = b.as_i32x16();
    transmute(simd_mul(lhs, rhs))
}

/// Multiplies the packed 32-bit integers in `a` and `b`, and stores the low 32 bits of each intermediate 64-bit product in `dst` under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mullo_epi32&expand=4003)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmulld))]
pub unsafe fn _mm512_mask_mullo_epi32(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    // Lanes whose bit in `k` is clear keep the corresponding lane of `src`.
    let fallback = src.as_i32x16();
    let product = _mm512_mullo_epi32(a, b).as_i32x16();
    transmute(simd_select_bitmask(k, product, fallback))
}

/// Multiplies the packed 32-bit integers in `a` and `b`, and stores the low 32 bits of each intermediate 64-bit product in `dst` under zeromask `k` (an element is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mullo_epi32&expand=4004)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmulld))]
pub unsafe fn _mm512_maskz_mullo_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    let product = _mm512_mullo_epi32(a, b).as_i32x16();
    // Unselected lanes come from an all-zero vector.
    transmute(simd_select_bitmask(k, product, _mm512_setzero_si512().as_i32x16()))
}

/// Multiplies the packed 32-bit integers in `a` and `b`, and stores the low 32 bits of each intermediate 64-bit product in `dst` under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mullo_epi32&expand=4000)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmulld))]
pub unsafe fn _mm256_mask_mullo_epi32(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    // Lanes whose bit in `k` is clear keep the corresponding lane of `src`.
    let fallback = src.as_i32x8();
    let product = _mm256_mullo_epi32(a, b).as_i32x8();
    transmute(simd_select_bitmask(k, product, fallback))
}

/// Multiplies the packed 32-bit integers in `a` and `b`, and stores the low 32 bits of each intermediate 64-bit product in `dst` under zeromask `k` (an element is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mullo_epi32&expand=4001)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmulld))]
pub unsafe fn _mm256_maskz_mullo_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    let product = _mm256_mullo_epi32(a, b).as_i32x8();
    // Unselected lanes come from an all-zero vector.
    transmute(simd_select_bitmask(k, product, _mm256_setzero_si256().as_i32x8()))
}

/// Multiplies the packed 32-bit integers in `a` and `b`, and stores the low 32 bits of each intermediate 64-bit product in `dst` under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mullo_epi32&expand=3997)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmulld))]
pub unsafe fn _mm_mask_mullo_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Lanes whose bit in `k` is clear keep the corresponding lane of `src`.
    let fallback = src.as_i32x4();
    let product = _mm_mullo_epi32(a, b).as_i32x4();
    transmute(simd_select_bitmask(k, product, fallback))
}

/// Multiplies the packed 32-bit integers in `a` and `b`, and stores the low 32 bits of each intermediate 64-bit product in `dst` under zeromask `k` (an element is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mullo_epi32&expand=3998)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmulld))]
pub unsafe fn _mm_maskz_mullo_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let product = _mm_mullo_epi32(a, b).as_i32x4();
    // Unselected lanes come from an all-zero vector.
    transmute(simd_select_bitmask(k, product, _mm_setzero_si128().as_i32x4()))
}

/// Multiplies the packed 64-bit integer elements of `a` and `b`, and stores the lower 64 bits of each product in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mullox_epi64&expand=4017)
///
/// This intrinsic generates a sequence of instructions, which may perform worse than a native instruction. Consider the performance impact of this intrinsic.
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mullox_epi64(a: __m512i, b: __m512i) -> __m512i {
    // Reinterpret both operands as eight 64-bit lanes before the lane-wise multiply.
    let lhs = a.as_i64x8();
    let rhs = b.as_i64x8();
    transmute(simd_mul(lhs, rhs))
}

/// Multiplies the packed 64-bit integer elements of `a` and `b`, and stores the lower 64 bits of each product in `dst` under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mullox&expand=4016)
///
/// This intrinsic generates a sequence of instructions, which may perform worse than a native instruction. Consider the performance impact of this intrinsic.
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_mullox_epi64(
    src: __m512i,
    k: __mmask8,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    // Lanes whose bit in `k` is clear keep the corresponding lane of `src`.
    let fallback = src.as_i64x8();
    let product = _mm512_mullox_epi64(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, product, fallback))
}

/// Multiplies the low unsigned 32-bit integer of each packed 64-bit element in `a` and `b`, and stores the unsigned 64-bit products in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mul_epu32&expand=3916)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmuludq))]
pub unsafe fn _mm512_mul_epu32(a: __m512i, b: __m512i) -> __m512i {
    // The intrinsic takes its operands as sixteen unsigned 32-bit lanes.
    let lhs = a.as_u32x16();
    let rhs = b.as_u32x16();
    let product = vpmuludq(lhs, rhs);
    transmute(product)
}

/// Multiplies the low unsigned 32-bit integer of each packed 64-bit element in `a` and `b`, and writes the unsigned 64-bit products to `dst` under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_mul_epu32&expand=3914)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmuludq))]
pub unsafe fn _mm512_mask_mul_epu32(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    // Lanes whose bit in `k` is clear keep the corresponding lane of `src`.
    let fallback = src.as_u64x8();
    let product = _mm512_mul_epu32(a, b).as_u64x8();
    transmute(simd_select_bitmask(k, product, fallback))
}

/// Multiplies the low unsigned 32-bit integer of each packed 64-bit element in `a` and `b`, and writes the unsigned 64-bit products to `dst` under zeromask `k` (an element is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_mul_epu32&expand=3915)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmuludq))]
pub unsafe fn _mm512_maskz_mul_epu32(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    let product = _mm512_mul_epu32(a, b).as_u64x8();
    // Unselected lanes come from an all-zero vector.
    transmute(simd_select_bitmask(k, product, _mm512_setzero_si512().as_u64x8()))
}

/// Multiplies the low unsigned 32-bit integer of each packed 64-bit element in `a` and `b`, and writes the unsigned 64-bit products to `dst` under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_epu32&expand=3911)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmuludq))]
pub unsafe fn _mm256_mask_mul_epu32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Lanes whose bit in `k` is clear keep the corresponding lane of `src`.
    let fallback = src.as_u64x4();
    let product = _mm256_mul_epu32(a, b).as_u64x4();
    transmute(simd_select_bitmask(k, product, fallback))
}

/// Multiplies the low unsigned 32-bit integer of each packed 64-bit element in `a` and `b`, and writes the unsigned 64-bit products to `dst` under zeromask `k` (an element is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_epu32&expand=3912)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmuludq))]
pub unsafe fn _mm256_maskz_mul_epu32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    let product = _mm256_mul_epu32(a, b).as_u64x4();
    // Unselected lanes come from an all-zero vector.
    transmute(simd_select_bitmask(k, product, _mm256_setzero_si256().as_u64x4()))
}

/// Multiplies the low unsigned 32-bit integer of each packed 64-bit element in `a` and `b`, and writes the unsigned 64-bit products to `dst` under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_epu32&expand=3908)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmuludq))]
pub unsafe fn _mm_mask_mul_epu32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Lanes whose bit in `k` is clear keep the corresponding lane of `src`.
    let fallback = src.as_u64x2();
    let product = _mm_mul_epu32(a, b).as_u64x2();
    transmute(simd_select_bitmask(k, product, fallback))
}

/// Multiplies the low unsigned 32-bit integer of each packed 64-bit element in `a` and `b`, and writes the unsigned 64-bit products to `dst` under zeromask `k` (an element is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_epu32&expand=3909)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmuludq))]
pub unsafe fn _mm_maskz_mul_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let product = _mm_mul_epu32(a, b).as_u64x2();
    // Unselected lanes come from an all-zero vector.
    transmute(simd_select_bitmask(k, product, _mm_setzero_si128().as_u64x2()))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, and stores the products in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_ps&expand=3934)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulps))]
pub unsafe fn _mm512_mul_ps(a: __m512, b: __m512) -> __m512 {
    // Reinterpret both operands as sixteen `f32` lanes before the lane-wise multiply.
    let lhs = a.as_f32x16();
    let rhs = b.as_f32x16();
    transmute(simd_mul(lhs, rhs))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, and writes the products to `dst` under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_ps&expand=3932)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulps))]
pub unsafe fn _mm512_mask_mul_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
    // Lanes whose bit in `k` is clear keep the corresponding lane of `src`.
    let fallback = src.as_f32x16();
    let product = _mm512_mul_ps(a, b).as_f32x16();
    transmute(simd_select_bitmask(k, product, fallback))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, and writes the products to `dst` under zeromask `k` (an element is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_ps&expand=3933)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulps))]
pub unsafe fn _mm512_maskz_mul_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let product = _mm512_mul_ps(a, b).as_f32x16();
    // Unselected lanes come from an all-zero vector.
    transmute(simd_select_bitmask(k, product, _mm512_setzero_ps().as_f32x16()))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, and writes the products to `dst` under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_ps&expand=3929)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmulps))]
pub unsafe fn _mm256_mask_mul_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 {
    // Lanes whose bit in `k` is clear keep the corresponding lane of `src`.
    let fallback = src.as_f32x8();
    let product = _mm256_mul_ps(a, b).as_f32x8();
    transmute(simd_select_bitmask(k, product, fallback))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, and writes the products to `dst` under zeromask `k` (an element is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_ps&expand=3930)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmulps))]
pub unsafe fn _mm256_maskz_mul_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
    let product = _mm256_mul_ps(a, b).as_f32x8();
    // Unselected lanes come from an all-zero vector.
    transmute(simd_select_bitmask(k, product, _mm256_setzero_ps().as_f32x8()))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, and writes the products to `dst` under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_ps&expand=3926)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmulps))]
pub unsafe fn _mm_mask_mul_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
    // Lanes whose bit in `k` is clear keep the corresponding lane of `src`.
    let fallback = src.as_f32x4();
    let product = _mm_mul_ps(a, b).as_f32x4();
    transmute(simd_select_bitmask(k, product, fallback))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, and writes the products to `dst` under zeromask `k` (an element is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_ps&expand=3927)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmulps))]
pub unsafe fn _mm_maskz_mul_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let product = _mm_mul_ps(a, b).as_f32x4();
    // Unselected lanes come from an all-zero vector.
    transmute(simd_select_bitmask(k, product, _mm_setzero_ps().as_f32x4()))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, and stores the products in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_pd&expand=3925)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulpd))]
pub unsafe fn _mm512_mul_pd(a: __m512d, b: __m512d) -> __m512d {
    // Reinterpret both operands as eight `f64` lanes before the lane-wise multiply.
    let lhs = a.as_f64x8();
    let rhs = b.as_f64x8();
    transmute(simd_mul(lhs, rhs))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, and writes the products to `dst` under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_pd&expand=3923)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulpd))]
pub unsafe fn _mm512_mask_mul_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    // Lanes whose bit in `k` is clear keep the corresponding lane of `src`.
    let fallback = src.as_f64x8();
    let product = _mm512_mul_pd(a, b).as_f64x8();
    transmute(simd_select_bitmask(k, product, fallback))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, and writes the products to `dst` under zeromask `k` (an element is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_pd&expand=3924)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulpd))]
pub unsafe fn _mm512_maskz_mul_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let product = _mm512_mul_pd(a, b).as_f64x8();
    // Unselected lanes come from an all-zero vector.
    transmute(simd_select_bitmask(k, product, _mm512_setzero_pd().as_f64x8()))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, and writes the products to `dst` under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_mul_pd&expand=3920)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmulpd))]
pub unsafe fn _mm256_mask_mul_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
    // Lanes whose bit in `k` is clear keep the corresponding lane of `src`.
    let fallback = src.as_f64x4();
    let product = _mm256_mul_pd(a, b).as_f64x4();
    transmute(simd_select_bitmask(k, product, fallback))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, and writes the products to `dst` under zeromask `k` (an element is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_mul_pd&expand=3921)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmulpd))]
pub unsafe fn _mm256_maskz_mul_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
    let product = _mm256_mul_pd(a, b).as_f64x4();
    // Unselected lanes come from an all-zero vector.
    transmute(simd_select_bitmask(k, product, _mm256_setzero_pd().as_f64x4()))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, and writes the products to `dst` under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_mul_pd&expand=3917)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmulpd))]
pub unsafe fn _mm_mask_mul_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    // Lanes whose bit in `k` is clear keep the corresponding lane of `src`.
    let fallback = src.as_f64x2();
    let product = _mm_mul_pd(a, b).as_f64x2();
    transmute(simd_select_bitmask(k, product, fallback))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, and writes the products to `dst` under zeromask `k` (an element is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_mul_pd&expand=3918)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmulpd))]
pub unsafe fn _mm_maskz_mul_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let product = _mm_mul_pd(a, b).as_f64x2();
    // Unselected lanes come from an all-zero vector.
    transmute(simd_select_bitmask(k, product, _mm_setzero_pd().as_f64x2()))
}

/// Divides the packed single-precision (32-bit) floating-point elements of `a` by the corresponding elements of `b`, and stores the quotients in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_ps&expand=2162)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivps))]
pub unsafe fn _mm512_div_ps(a: __m512, b: __m512) -> __m512 {
    // `a` supplies the dividends, `b` the divisors, as sixteen `f32` lanes each.
    let dividend = a.as_f32x16();
    let divisor = b.as_f32x16();
    transmute(simd_div(dividend, divisor))
}

/// Divides the packed single-precision (32-bit) floating-point elements of `a` by the corresponding elements of `b`, and writes the quotients to `dst` under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_ps&expand=2163)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivps))]
pub unsafe fn _mm512_mask_div_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
    // Lanes whose bit in `k` is clear keep the corresponding lane of `src`.
    let fallback = src.as_f32x16();
    let quotient = _mm512_div_ps(a, b).as_f32x16();
    transmute(simd_select_bitmask(k, quotient, fallback))
}

/// Divides the packed single-precision (32-bit) floating-point elements of `a` by the corresponding elements of `b`, and writes the quotients to `dst` under zeromask `k` (an element is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_ps&expand=2164)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivps))]
pub unsafe fn _mm512_maskz_div_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let quotient = _mm512_div_ps(a, b).as_f32x16();
    // Unselected lanes come from an all-zero vector.
    transmute(simd_select_bitmask(k, quotient, _mm512_setzero_ps().as_f32x16()))
}

/// Divides the packed single-precision (32-bit) floating-point elements of `a` by the corresponding elements of `b`, and writes the quotients to `dst` under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_div_ps&expand=2160)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vdivps))]
pub unsafe fn _mm256_mask_div_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 {
    // Lanes whose bit in `k` is clear keep the corresponding lane of `src`.
    let fallback = src.as_f32x8();
    let quotient = _mm256_div_ps(a, b).as_f32x8();
    transmute(simd_select_bitmask(k, quotient, fallback))
}

/// Divides the packed single-precision (32-bit) floating-point elements of `a` by the corresponding elements of `b`, and writes the quotients to `dst` under zeromask `k` (an element is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_div_ps&expand=2161)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vdivps))]
pub unsafe fn _mm256_maskz_div_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
    let quotient = _mm256_div_ps(a, b).as_f32x8();
    // Unselected lanes come from an all-zero vector.
    transmute(simd_select_bitmask(k, quotient, _mm256_setzero_ps().as_f32x8()))
}

/// Divides the packed single-precision (32-bit) floating-point elements of `a` by the corresponding elements of `b`, and writes the quotients to `dst` under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_ps&expand=2157)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vdivps))]
pub unsafe fn _mm_mask_div_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
    // Lanes whose bit in `k` is clear keep the corresponding lane of `src`.
    let fallback = src.as_f32x4();
    let quotient = _mm_div_ps(a, b).as_f32x4();
    transmute(simd_select_bitmask(k, quotient, fallback))
}

/// Divides the packed single-precision (32-bit) floating-point elements of `a` by the corresponding elements of `b`, and writes the quotients to `dst` under zeromask `k` (an element is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_ps&expand=2158)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vdivps))]
pub unsafe fn _mm_maskz_div_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let quotient = _mm_div_ps(a, b).as_f32x4();
    // Unselected lanes come from an all-zero vector.
    transmute(simd_select_bitmask(k, quotient, _mm_setzero_ps().as_f32x4()))
}

/// Divides the packed double-precision (64-bit) floating-point elements of `a` by the corresponding elements of `b`, and stores the quotients in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_div_pd&expand=2153)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivpd))]
pub unsafe fn _mm512_div_pd(a: __m512d, b: __m512d) -> __m512d {
    // `a` supplies the dividends, `b` the divisors, as eight `f64` lanes each.
    let dividend = a.as_f64x8();
    let divisor = b.as_f64x8();
    transmute(simd_div(dividend, divisor))
}

/// Divides the packed double-precision (64-bit) floating-point elements of `a` by the corresponding elements of `b`, and writes the quotients to `dst` under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_pd&expand=2154)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivpd))]
pub unsafe fn _mm512_mask_div_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    // Lanes whose bit in `k` is clear keep the corresponding lane of `src`.
    let fallback = src.as_f64x8();
    let quotient = _mm512_div_pd(a, b).as_f64x8();
    transmute(simd_select_bitmask(k, quotient, fallback))
}

/// Divides the packed double-precision (64-bit) floating-point elements of `a` by the corresponding elements of `b`, and writes the quotients to `dst` under zeromask `k` (an element is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_pd&expand=2155)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivpd))]
pub unsafe fn _mm512_maskz_div_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let quotient = _mm512_div_pd(a, b).as_f64x8();
    // Unselected lanes come from an all-zero vector.
    transmute(simd_select_bitmask(k, quotient, _mm512_setzero_pd().as_f64x8()))
}

/// Divides the packed double-precision (64-bit) floating-point elements of `a` by the corresponding elements of `b`, and writes the quotients to `dst` under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_div_pd&expand=2151)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vdivpd))]
pub unsafe fn _mm256_mask_div_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
    // Lanes whose bit in `k` is clear keep the corresponding lane of `src`.
    let fallback = src.as_f64x4();
    let quotient = _mm256_div_pd(a, b).as_f64x4();
    transmute(simd_select_bitmask(k, quotient, fallback))
}

/// Divides the packed double-precision (64-bit) floating-point elements of `a` by the corresponding elements of `b`, and writes the quotients to `dst` under zeromask `k` (an element is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_div_pd&expand=2152)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vdivpd))]
pub unsafe fn _mm256_maskz_div_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
    let quotient = _mm256_div_pd(a, b).as_f64x4();
    // Unselected lanes come from an all-zero vector.
    transmute(simd_select_bitmask(k, quotient, _mm256_setzero_pd().as_f64x4()))
}

/// Divides the packed double-precision (64-bit) floating-point elements of `a` by the corresponding elements of `b`, and writes the quotients to `dst` under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_div_pd&expand=2148)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vdivpd))]
pub unsafe fn _mm_mask_div_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    // Lanes whose bit in `k` is clear keep the corresponding lane of `src`.
    let fallback = src.as_f64x2();
    let quotient = _mm_div_pd(a, b).as_f64x2();
    transmute(simd_select_bitmask(k, quotient, fallback))
}

/// Divides the packed double-precision (64-bit) floating-point elements of `a` by the corresponding elements of `b`, and writes the quotients to `dst` under zeromask `k` (an element is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_div_pd&expand=2149)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vdivpd))]
pub unsafe fn _mm_maskz_div_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let quotient = _mm_div_pd(a, b).as_f64x2();
    // Unselected lanes come from an all-zero vector.
    transmute(simd_select_bitmask(k, quotient, _mm_setzero_pd().as_f64x2()))
}

/// Compares the packed signed 32-bit integers in `a` and `b`, and stores the packed maximum values in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_epi32&expand=3582)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmaxsd))]
pub unsafe fn _mm512_max_epi32(a: __m512i, b: __m512i) -> __m512i {
    // The intrinsic takes its operands as sixteen signed 32-bit lanes.
    let lhs = a.as_i32x16();
    let rhs = b.as_i32x16();
    let largest = vpmaxsd(lhs, rhs);
    transmute(largest)
}

/// Compares the packed signed 32-bit integers in `a` and `b`, and writes the packed maximum values to `dst` under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_epi32&expand=3580)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmaxsd))]
pub unsafe fn _mm512_mask_max_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    // Lanes whose bit in `k` is clear keep the corresponding lane of `src`.
    let fallback = src.as_i32x16();
    let largest = _mm512_max_epi32(a, b).as_i32x16();
    transmute(simd_select_bitmask(k, largest, fallback))
}

/// Compares the packed signed 32-bit integers in `a` and `b`, and writes the packed maximum values to `dst` under zeromask `k` (an element is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_epi32&expand=3581)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmaxsd))]
pub unsafe fn _mm512_maskz_max_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    let largest = _mm512_max_epi32(a, b).as_i32x16();
    // Unselected lanes come from an all-zero vector.
    transmute(simd_select_bitmask(k, largest, _mm512_setzero_si512().as_i32x16()))
}

/// Compares the packed signed 32-bit integers in `a` and `b`, and writes the packed maximum values to `dst` under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_epi32&expand=3577)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmaxsd))]
pub unsafe fn _mm256_mask_max_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Lanes whose bit in `k` is clear keep the corresponding lane of `src`.
    let fallback = src.as_i32x8();
    let largest = _mm256_max_epi32(a, b).as_i32x8();
    transmute(simd_select_bitmask(k, largest, fallback))
}

/// Compares the packed signed 32-bit integers in `a` and `b`, and writes the packed maximum values to `dst` under zeromask `k` (an element is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_epi32&expand=3578)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmaxsd))]
pub unsafe fn _mm256_maskz_max_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    let largest = _mm256_max_epi32(a, b).as_i32x8();
    // Unselected lanes come from an all-zero vector.
    transmute(simd_select_bitmask(k, largest, _mm256_setzero_si256().as_i32x8()))
}

/// Compares the packed signed 32-bit integers in `a` and `b`, and writes the packed maximum values to `dst` under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_epi32&expand=3574)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmaxsd))]
pub unsafe fn _mm_mask_max_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Lanes whose bit in `k` is clear keep the corresponding lane of `src`.
    let fallback = src.as_i32x4();
    let largest = _mm_max_epi32(a, b).as_i32x4();
    transmute(simd_select_bitmask(k, largest, fallback))
}

/// Compares packed signed 32-bit integers in `a` and `b`, and stores the packed maximum values in `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_epi32&expand=3575)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmaxsd))]
pub unsafe fn _mm_maskz_max_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Lanes taken when the matching bit of `k` is clear.
    let zeroed = _mm_setzero_si128().as_i32x4();
    let computed = _mm_max_epi32(a, b).as_i32x4();
    let blended = simd_select_bitmask(k, computed, zeroed);
    transmute(blended)
}

/// Compares packed signed 64-bit integers in `a` and `b`, and stores the packed maximum values in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_epi64&expand=3591)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmaxsq))]
pub unsafe fn _mm512_max_epi64(a: __m512i, b: __m512i) -> __m512i {
    let lhs = a.as_i64x8();
    let rhs = b.as_i64x8();
    transmute(vpmaxsq(lhs, rhs))
}

/// Compares packed signed 64-bit integers in `a` and `b`, and stores the packed maximum values in `dst` using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_epi64&expand=3589)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmaxsq))]
pub unsafe fn _mm512_mask_max_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    // Lanes taken when the matching bit of `k` is clear.
    let fallback = src.as_i64x8();
    let computed = _mm512_max_epi64(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Compares packed signed 64-bit integers in `a` and `b`, and stores the packed maximum values in `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_epi64&expand=3590)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmaxsq))]
pub unsafe fn _mm512_maskz_max_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    // Lanes taken when the matching bit of `k` is clear.
    let zeroed = _mm512_setzero_si512().as_i64x8();
    let computed = _mm512_max_epi64(a, b).as_i64x8();
    let blended = simd_select_bitmask(k, computed, zeroed);
    transmute(blended)
}

/// Compares packed signed 64-bit integers in `a` and `b`, and stores the packed maximum values in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epi64&expand=3588)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmaxsq))]
pub unsafe fn _mm256_max_epi64(a: __m256i, b: __m256i) -> __m256i {
    let lhs = a.as_i64x4();
    let rhs = b.as_i64x4();
    transmute(vpmaxsq256(lhs, rhs))
}

/// Compares packed signed 64-bit integers in `a` and `b`, and stores the packed maximum values in `dst` using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_epi64&expand=3586)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmaxsq))]
pub unsafe fn _mm256_mask_max_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Lanes taken when the matching bit of `k` is clear.
    let fallback = src.as_i64x4();
    let computed = _mm256_max_epi64(a, b).as_i64x4();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Compares packed signed 64-bit integers in `a` and `b`, and stores the packed maximum values in `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_epi64&expand=3587)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmaxsq))]
pub unsafe fn _mm256_maskz_max_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Lanes taken when the matching bit of `k` is clear.
    let zeroed = _mm256_setzero_si256().as_i64x4();
    let computed = _mm256_max_epi64(a, b).as_i64x4();
    let blended = simd_select_bitmask(k, computed, zeroed);
    transmute(blended)
}

/// Compares packed signed 64-bit integers in `a` and `b`, and stores the packed maximum values in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi64&expand=3585)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmaxsq))]
pub unsafe fn _mm_max_epi64(a: __m128i, b: __m128i) -> __m128i {
    let lhs = a.as_i64x2();
    let rhs = b.as_i64x2();
    transmute(vpmaxsq128(lhs, rhs))
}

/// Compares packed signed 64-bit integers in `a` and `b`, and stores the packed maximum values in `dst` using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_epi64&expand=3583)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmaxsq))]
pub unsafe fn _mm_mask_max_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Lanes taken when the matching bit of `k` is clear.
    let fallback = src.as_i64x2();
    let computed = _mm_max_epi64(a, b).as_i64x2();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Compares packed signed 64-bit integers in `a` and `b`, and stores the packed maximum values in `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_epi64&expand=3584)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmaxsq))]
pub unsafe fn _mm_maskz_max_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Lanes taken when the matching bit of `k` is clear.
    let zeroed = _mm_setzero_si128().as_i64x2();
    let computed = _mm_max_epi64(a, b).as_i64x2();
    let blended = simd_select_bitmask(k, computed, zeroed);
    transmute(blended)
}

/// Compares packed single-precision (32-bit) floating-point elements in `a` and `b`, and stores the packed maximum values in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_ps&expand=3655)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxps))]
pub unsafe fn _mm512_max_ps(a: __m512, b: __m512) -> __m512 {
    let lhs = a.as_f32x16();
    let rhs = b.as_f32x16();
    transmute(vmaxps(lhs, rhs, _MM_FROUND_CUR_DIRECTION))
}

/// Compares packed single-precision (32-bit) floating-point elements in `a` and `b`, and stores the packed maximum values in `dst` using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_ps&expand=3653)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxps))]
pub unsafe fn _mm512_mask_max_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
    // Lanes taken when the matching bit of `k` is clear.
    let fallback = src.as_f32x16();
    let computed = _mm512_max_ps(a, b).as_f32x16();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Compares packed single-precision (32-bit) floating-point elements in `a` and `b`, and stores the packed maximum values in `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_ps&expand=3654)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxps))]
pub unsafe fn _mm512_maskz_max_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
    // Lanes taken when the matching bit of `k` is clear.
    let zeroed = _mm512_setzero_ps().as_f32x16();
    let computed = _mm512_max_ps(a, b).as_f32x16();
    let blended = simd_select_bitmask(k, computed, zeroed);
    transmute(blended)
}

/// Compares packed single-precision (32-bit) floating-point elements in `a` and `b`, and stores the packed maximum values in `dst` using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_ps&expand=3650)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxps))]
pub unsafe fn _mm256_mask_max_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 {
    // Lanes taken when the matching bit of `k` is clear.
    let fallback = src.as_f32x8();
    let computed = _mm256_max_ps(a, b).as_f32x8();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Compares packed single-precision (32-bit) floating-point elements in `a` and `b`, and stores the packed maximum values in `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_ps&expand=3651)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxps))]
pub unsafe fn _mm256_maskz_max_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
    // Lanes taken when the matching bit of `k` is clear.
    let zeroed = _mm256_setzero_ps().as_f32x8();
    let computed = _mm256_max_ps(a, b).as_f32x8();
    let blended = simd_select_bitmask(k, computed, zeroed);
    transmute(blended)
}

/// Compares packed single-precision (32-bit) floating-point elements in `a` and `b`, and stores the packed maximum values in `dst` using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_ps&expand=3647)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxps))]
pub unsafe fn _mm_mask_max_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
    // Lanes taken when the matching bit of `k` is clear.
    let fallback = src.as_f32x4();
    let computed = _mm_max_ps(a, b).as_f32x4();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Compares packed single-precision (32-bit) floating-point elements in `a` and `b`, and stores the packed maximum values in `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_ps&expand=3648)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxps))]
pub unsafe fn _mm_maskz_max_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    // Lanes taken when the matching bit of `k` is clear.
    let zeroed = _mm_setzero_ps().as_f32x4();
    let computed = _mm_max_ps(a, b).as_f32x4();
    let blended = simd_select_bitmask(k, computed, zeroed);
    transmute(blended)
}

/// Compares packed double-precision (64-bit) floating-point elements in `a` and `b`, and stores the packed maximum values in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_pd&expand=3645)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxpd))]
pub unsafe fn _mm512_max_pd(a: __m512d, b: __m512d) -> __m512d {
    let lhs = a.as_f64x8();
    let rhs = b.as_f64x8();
    transmute(vmaxpd(lhs, rhs, _MM_FROUND_CUR_DIRECTION))
}

/// Compares packed double-precision (64-bit) floating-point elements in `a` and `b`, and stores the packed maximum values in `dst` using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_pd&expand=3643)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxpd))]
pub unsafe fn _mm512_mask_max_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    // Lanes taken when the matching bit of `k` is clear.
    let fallback = src.as_f64x8();
    let computed = _mm512_max_pd(a, b).as_f64x8();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Compares packed double-precision (64-bit) floating-point elements in `a` and `b`, and stores the packed maximum values in `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_pd&expand=3644)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxpd))]
pub unsafe fn _mm512_maskz_max_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    // Lanes taken when the matching bit of `k` is clear.
    let zeroed = _mm512_setzero_pd().as_f64x8();
    let computed = _mm512_max_pd(a, b).as_f64x8();
    let blended = simd_select_bitmask(k, computed, zeroed);
    transmute(blended)
}

/// Compares packed double-precision (64-bit) floating-point elements in `a` and `b`, and stores the packed maximum values in `dst` using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_pd&expand=3640)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxpd))]
pub unsafe fn _mm256_mask_max_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
    // Lanes taken when the matching bit of `k` is clear.
    let fallback = src.as_f64x4();
    let computed = _mm256_max_pd(a, b).as_f64x4();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Compares packed double-precision (64-bit) floating-point elements in `a` and `b`, and stores the packed maximum values in `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_pd&expand=3641)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxpd))]
pub unsafe fn _mm256_maskz_max_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
    // Lanes taken when the matching bit of `k` is clear.
    let zeroed = _mm256_setzero_pd().as_f64x4();
    let computed = _mm256_max_pd(a, b).as_f64x4();
    let blended = simd_select_bitmask(k, computed, zeroed);
    transmute(blended)
}

/// Compares packed double-precision (64-bit) floating-point elements in `a` and `b`, and stores the packed maximum values in `dst` using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_pd&expand=3637)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxpd))]
pub unsafe fn _mm_mask_max_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    // Lanes taken when the matching bit of `k` is clear.
    let fallback = src.as_f64x2();
    let computed = _mm_max_pd(a, b).as_f64x2();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Compares packed double-precision (64-bit) floating-point elements in `a` and `b`, and stores the packed maximum values in `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_pd&expand=3638)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmaxpd))]
pub unsafe fn _mm_maskz_max_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    // Lanes taken when the matching bit of `k` is clear.
    let zeroed = _mm_setzero_pd().as_f64x2();
    let computed = _mm_max_pd(a, b).as_f64x2();
    let blended = simd_select_bitmask(k, computed, zeroed);
    transmute(blended)
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and stores the packed maximum values in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_epu32&expand=3618)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmaxud))]
pub unsafe fn _mm512_max_epu32(a: __m512i, b: __m512i) -> __m512i {
    let lhs = a.as_u32x16();
    let rhs = b.as_u32x16();
    transmute(vpmaxud(lhs, rhs))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and stores the packed maximum values in `dst` using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_epu32&expand=3616)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmaxud))]
pub unsafe fn _mm512_mask_max_epu32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    // Lanes taken when the matching bit of `k` is clear.
    let fallback = src.as_u32x16();
    let computed = _mm512_max_epu32(a, b).as_u32x16();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and stores the packed maximum values in `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_epu32&expand=3617)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmaxud))]
pub unsafe fn _mm512_maskz_max_epu32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    // Lanes taken when the matching bit of `k` is clear.
    let zeroed = _mm512_setzero_si512().as_u32x16();
    let computed = _mm512_max_epu32(a, b).as_u32x16();
    let blended = simd_select_bitmask(k, computed, zeroed);
    transmute(blended)
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and stores the packed maximum values in `dst` using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_epu32&expand=3613)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmaxud))]
pub unsafe fn _mm256_mask_max_epu32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Lanes taken when the matching bit of `k` is clear.
    let fallback = src.as_u32x8();
    let computed = _mm256_max_epu32(a, b).as_u32x8();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and stores the packed maximum values in `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_epu32&expand=3614)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmaxud))]
pub unsafe fn _mm256_maskz_max_epu32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Lanes taken when the matching bit of `k` is clear.
    let zeroed = _mm256_setzero_si256().as_u32x8();
    let computed = _mm256_max_epu32(a, b).as_u32x8();
    let blended = simd_select_bitmask(k, computed, zeroed);
    transmute(blended)
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and stores the packed maximum values in `dst` using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_epu32&expand=3610)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmaxud))]
pub unsafe fn _mm_mask_max_epu32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Lanes taken when the matching bit of `k` is clear.
    let fallback = src.as_u32x4();
    let computed = _mm_max_epu32(a, b).as_u32x4();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Compares packed unsigned 32-bit integers in `a` and `b`, and stores the packed maximum values in `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_epu32&expand=3611)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmaxud))]
pub unsafe fn _mm_maskz_max_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Lanes taken when the matching bit of `k` is clear.
    let zeroed = _mm_setzero_si128().as_u32x4();
    let computed = _mm_max_epu32(a, b).as_u32x4();
    let blended = simd_select_bitmask(k, computed, zeroed);
    transmute(blended)
}

/// Compares packed unsigned 64-bit integers in `a` and `b`, and stores the packed maximum values in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_epu64&expand=3627)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmaxuq))]
pub unsafe fn _mm512_max_epu64(a: __m512i, b: __m512i) -> __m512i {
    let lhs = a.as_u64x8();
    let rhs = b.as_u64x8();
    transmute(vpmaxuq(lhs, rhs))
}

/// Compares packed unsigned 64-bit integers in `a` and `b`, and stores the packed maximum values in `dst` using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_epu64&expand=3625)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmaxuq))]
pub unsafe fn _mm512_mask_max_epu64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    // Lanes taken when the matching bit of `k` is clear.
    let fallback = src.as_u64x8();
    let computed = _mm512_max_epu64(a, b).as_u64x8();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Compares packed unsigned 64-bit integers in `a` and `b`, and stores the packed maximum values in `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_epu64&expand=3626)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmaxuq))]
pub unsafe fn _mm512_maskz_max_epu64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    // Unmasked result for every lane; the mask is applied afterwards.
    let max = _mm512_max_epu64(a, b).as_u64x8();
    // Lanes taken when the matching bit of `k` is clear.
    let zero = _mm512_setzero_si512().as_u64x8();
    transmute(simd_select_bitmask(k, max, zero))
}

/// Compares packed unsigned 64-bit integers in `a` and `b`, and stores the packed maximum values in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_max_epu64&expand=3624)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmaxuq))]
pub unsafe fn _mm256_max_epu64(a: __m256i, b: __m256i) -> __m256i {
    let lhs = a.as_u64x4();
    let rhs = b.as_u64x4();
    transmute(vpmaxuq256(lhs, rhs))
}

/// Compares packed unsigned 64-bit integers in `a` and `b`, and stores the packed maximum values in `dst` using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_max_epu64&expand=3622)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmaxuq))]
pub unsafe fn _mm256_mask_max_epu64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Lanes taken when the matching bit of `k` is clear.
    let fallback = src.as_u64x4();
    let computed = _mm256_max_epu64(a, b).as_u64x4();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Compares packed unsigned 64-bit integers in `a` and `b`, and stores the packed maximum values in `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_max_epu64&expand=3623)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmaxuq))]
pub unsafe fn _mm256_maskz_max_epu64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Lanes taken when the matching bit of `k` is clear.
    let zeroed = _mm256_setzero_si256().as_u64x4();
    let computed = _mm256_max_epu64(a, b).as_u64x4();
    let blended = simd_select_bitmask(k, computed, zeroed);
    transmute(blended)
}

/// Compares packed unsigned 64-bit integers in `a` and `b`, and stores the packed maximum values in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu64&expand=3621)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmaxuq))]
pub unsafe fn _mm_max_epu64(a: __m128i, b: __m128i) -> __m128i {
    let lhs = a.as_u64x2();
    let rhs = b.as_u64x2();
    transmute(vpmaxuq128(lhs, rhs))
}

/// Compares packed unsigned 64-bit integers in `a` and `b`, and stores the packed maximum values in `dst` using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_max_epu64&expand=3619)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmaxuq))]
pub unsafe fn _mm_mask_max_epu64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Lanes taken when the matching bit of `k` is clear.
    let fallback = src.as_u64x2();
    let computed = _mm_max_epu64(a, b).as_u64x2();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Compares packed unsigned 64-bit integers in `a` and `b`, and stores the packed maximum values in `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_max_epu64&expand=3620)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmaxuq))]
pub unsafe fn _mm_maskz_max_epu64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Lanes taken when the matching bit of `k` is clear.
    let zeroed = _mm_setzero_si128().as_u64x2();
    let computed = _mm_max_epu64(a, b).as_u64x2();
    let blended = simd_select_bitmask(k, computed, zeroed);
    transmute(blended)
}

/// Compares packed signed 32-bit integers in `a` and `b`, and stores the packed minimum values in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_epi32&expand=3696)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpminsd))]
pub unsafe fn _mm512_min_epi32(a: __m512i, b: __m512i) -> __m512i {
    let lhs = a.as_i32x16();
    let rhs = b.as_i32x16();
    transmute(vpminsd(lhs, rhs))
}

/// Compares packed signed 32-bit integers in `a` and `b`, and stores the packed minimum values in `dst` using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_epi32&expand=3694)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpminsd))]
pub unsafe fn _mm512_mask_min_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    // Lanes taken when the matching bit of `k` is clear.
    let fallback = src.as_i32x16();
    let computed = _mm512_min_epi32(a, b).as_i32x16();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Compares packed signed 32-bit integers in `a` and `b`, and stores the packed minimum values in `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_epi32&expand=3695)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpminsd))]
pub unsafe fn _mm512_maskz_min_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    // Lanes taken when the matching bit of `k` is clear.
    let zeroed = _mm512_setzero_si512().as_i32x16();
    let computed = _mm512_min_epi32(a, b).as_i32x16();
    let blended = simd_select_bitmask(k, computed, zeroed);
    transmute(blended)
}

/// Compares packed signed 32-bit integers in `a` and `b`, and stores the packed minimum values in `dst` using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_epi32&expand=3691)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpminsd))]
pub unsafe fn _mm256_mask_min_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Lanes taken when the matching bit of `k` is clear.
    let fallback = src.as_i32x8();
    let computed = _mm256_min_epi32(a, b).as_i32x8();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Compares packed signed 32-bit integers in `a` and `b`, and stores the packed minimum values in `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_epi32&expand=3692)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpminsd))]
pub unsafe fn _mm256_maskz_min_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Lanes taken when the matching bit of `k` is clear.
    let zeroed = _mm256_setzero_si256().as_i32x8();
    let computed = _mm256_min_epi32(a, b).as_i32x8();
    let blended = simd_select_bitmask(k, computed, zeroed);
    transmute(blended)
}

/// Compares packed signed 32-bit integers in `a` and `b`, and stores the packed minimum values in `dst` using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_epi32&expand=3688)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpminsd))]
pub unsafe fn _mm_mask_min_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Lanes taken when the matching bit of `k` is clear.
    let fallback = src.as_i32x4();
    let computed = _mm_min_epi32(a, b).as_i32x4();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Compares packed signed 32-bit integers in `a` and `b`, and stores the packed minimum values in `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_epi32&expand=3689)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpminsd))]
pub unsafe fn _mm_maskz_min_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Lanes taken when the matching bit of `k` is clear.
    let zeroed = _mm_setzero_si128().as_i32x4();
    let computed = _mm_min_epi32(a, b).as_i32x4();
    let blended = simd_select_bitmask(k, computed, zeroed);
    transmute(blended)
}

/// Compares packed signed 64-bit integers in `a` and `b`, and stores the packed minimum values in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_epi64&expand=3705)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpminsq))]
pub unsafe fn _mm512_min_epi64(a: __m512i, b: __m512i) -> __m512i {
    let lhs = a.as_i64x8();
    let rhs = b.as_i64x8();
    transmute(vpminsq(lhs, rhs))
}

/// Compares packed signed 64-bit integers in `a` and `b`, and stores the packed minimum values in `dst` using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_epi64&expand=3703)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpminsq))]
pub unsafe fn _mm512_mask_min_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    // Lanes taken when the matching bit of `k` is clear.
    let fallback = src.as_i64x8();
    let computed = _mm512_min_epi64(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Compares packed signed 64-bit integers in `a` and `b`, and stores the packed minimum values in `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_epi64&expand=3704)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpminsq))]
pub unsafe fn _mm512_maskz_min_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    // Unmasked result for every lane; the mask is applied afterwards.
    let min = _mm512_min_epi64(a, b).as_i64x8();
    // Lanes taken when the matching bit of `k` is clear.
    let zero = _mm512_setzero_si512().as_i64x8();
    transmute(simd_select_bitmask(k, min, zero))
}

/// Compares packed signed 64-bit integers in `a` and `b`, and stores the packed minimum values in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epi64&expand=3702)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpminsq))]
pub unsafe fn _mm256_min_epi64(a: __m256i, b: __m256i) -> __m256i {
    let lhs = a.as_i64x4();
    let rhs = b.as_i64x4();
    transmute(vpminsq256(lhs, rhs))
}

/// Compares packed signed 64-bit integers in `a` and `b`, and stores the packed minimum values in `dst` using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_epi64&expand=3700)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpminsq))]
pub unsafe fn _mm256_mask_min_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Lanes taken when the matching bit of `k` is clear.
    let fallback = src.as_i64x4();
    let computed = _mm256_min_epi64(a, b).as_i64x4();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Compares packed signed 64-bit integers in `a` and `b`, and stores the packed minimum values in `dst` using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_epi64&expand=3701)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpminsq))]
pub unsafe fn _mm256_maskz_min_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Lanes taken when the matching bit of `k` is clear.
    let zeroed = _mm256_setzero_si256().as_i64x4();
    let computed = _mm256_min_epi64(a, b).as_i64x4();
    let blended = simd_select_bitmask(k, computed, zeroed);
    transmute(blended)
}

/// Compares packed single-precision (32-bit) floating-point elements in `a` and `b`, and stores the packed minimum values in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_ps&expand=3769)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vminps))]
pub unsafe fn _mm512_min_ps(a: __m512, b: __m512) -> __m512 {
    let lhs = a.as_f32x16();
    let rhs = b.as_f32x16();
    transmute(vminps(lhs, rhs, _MM_FROUND_CUR_DIRECTION))
}

/// Takes the lane-wise minimum of the packed single-precision (32-bit)
/// floating-point elements in `a` and `b`, then merges it with `src` under
/// writemask `k`: a lane whose mask bit is not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_ps&expand=3767)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vminps))]
pub unsafe fn _mm512_mask_min_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
    // Select per lane between the unmasked minimum and the pass-through `src`.
    let merged = simd_select_bitmask(k, _mm512_min_ps(a, b).as_f32x16(), src.as_f32x16());
    transmute(merged)
}

/// Takes the lane-wise minimum of the packed single-precision (32-bit)
/// floating-point elements in `a` and `b` under zeromask `k`: a lane whose mask
/// bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_ps&expand=3768)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vminps))]
pub unsafe fn _mm512_maskz_min_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
    // All-zero vector supplying the masked-off lanes.
    let zeroed = _mm512_setzero_ps().as_f32x16();
    let computed = _mm512_min_ps(a, b).as_f32x16();
    transmute(simd_select_bitmask(k, computed, zeroed))
}

/// Takes the lane-wise minimum of the packed single-precision (32-bit)
/// floating-point elements in `a` and `b`, then merges it with `src` under
/// writemask `k`: a lane whose mask bit is not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_ps&expand=3764)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vminps))]
pub unsafe fn _mm256_mask_min_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 {
    // Lanes used wherever the corresponding bit of `k` is clear.
    let passthrough = src.as_f32x8();
    let computed = _mm256_min_ps(a, b).as_f32x8();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Takes the lane-wise minimum of the packed single-precision (32-bit)
/// floating-point elements in `a` and `b` under zeromask `k`: a lane whose mask
/// bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_ps&expand=3765)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vminps))]
pub unsafe fn _mm256_maskz_min_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
    let computed = _mm256_min_ps(a, b).as_f32x8();
    // Masked-off lanes fall back to an all-zero vector.
    let merged = simd_select_bitmask(k, computed, _mm256_setzero_ps().as_f32x8());
    transmute(merged)
}

/// Takes the lane-wise minimum of the packed single-precision (32-bit)
/// floating-point elements in `a` and `b`, then merges it with `src` under
/// writemask `k`: a lane whose mask bit is not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_ps&expand=3761)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vminps))]
pub unsafe fn _mm_mask_min_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
    // Select per lane between the unmasked minimum and the pass-through `src`.
    let merged = simd_select_bitmask(k, _mm_min_ps(a, b).as_f32x4(), src.as_f32x4());
    transmute(merged)
}

/// Takes the lane-wise minimum of the packed single-precision (32-bit)
/// floating-point elements in `a` and `b` under zeromask `k`: a lane whose mask
/// bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_ps&expand=3762)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vminps))]
pub unsafe fn _mm_maskz_min_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    // All-zero vector supplying the masked-off lanes.
    let zeroed = _mm_setzero_ps().as_f32x4();
    let computed = _mm_min_ps(a, b).as_f32x4();
    transmute(simd_select_bitmask(k, computed, zeroed))
}

/// Returns the lane-wise minimum of the packed double-precision (64-bit)
/// floating-point elements in `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_pd&expand=3759)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vminpd))]
pub unsafe fn _mm512_min_pd(a: __m512d, b: __m512d) -> __m512d {
    let (lhs, rhs) = (a.as_f64x8(), b.as_f64x8());
    // The trailing argument of `vminpd` is always `_MM_FROUND_CUR_DIRECTION` here.
    let result = vminpd(lhs, rhs, _MM_FROUND_CUR_DIRECTION);
    transmute(result)
}

/// Takes the lane-wise minimum of the packed double-precision (64-bit)
/// floating-point elements in `a` and `b`, then merges it with `src` under
/// writemask `k`: a lane whose mask bit is not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_pd&expand=3757)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vminpd))]
pub unsafe fn _mm512_mask_min_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    // Lanes used wherever the corresponding bit of `k` is clear.
    let passthrough = src.as_f64x8();
    let computed = _mm512_min_pd(a, b).as_f64x8();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Takes the lane-wise minimum of the packed double-precision (64-bit)
/// floating-point elements in `a` and `b` under zeromask `k`: a lane whose mask
/// bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_pd&expand=3758)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vminpd))]
pub unsafe fn _mm512_maskz_min_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let computed = _mm512_min_pd(a, b).as_f64x8();
    // Masked-off lanes fall back to an all-zero vector.
    let merged = simd_select_bitmask(k, computed, _mm512_setzero_pd().as_f64x8());
    transmute(merged)
}

/// Takes the lane-wise minimum of the packed double-precision (64-bit)
/// floating-point elements in `a` and `b`, then merges it with `src` under
/// writemask `k`: a lane whose mask bit is not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_pd&expand=3754)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vminpd))]
pub unsafe fn _mm256_mask_min_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
    // Select per lane between the unmasked minimum and the pass-through `src`.
    let merged = simd_select_bitmask(k, _mm256_min_pd(a, b).as_f64x4(), src.as_f64x4());
    transmute(merged)
}

/// Takes the lane-wise minimum of the packed double-precision (64-bit)
/// floating-point elements in `a` and `b` under zeromask `k`: a lane whose mask
/// bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_pd&expand=3755)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vminpd))]
pub unsafe fn _mm256_maskz_min_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
    // All-zero vector supplying the masked-off lanes.
    let zeroed = _mm256_setzero_pd().as_f64x4();
    let computed = _mm256_min_pd(a, b).as_f64x4();
    transmute(simd_select_bitmask(k, computed, zeroed))
}

/// Takes the lane-wise minimum of the packed double-precision (64-bit)
/// floating-point elements in `a` and `b`, then merges it with `src` under
/// writemask `k`: a lane whose mask bit is not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_pd&expand=3751)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vminpd))]
pub unsafe fn _mm_mask_min_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    // Lanes used wherever the corresponding bit of `k` is clear.
    let passthrough = src.as_f64x2();
    let computed = _mm_min_pd(a, b).as_f64x2();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Takes the lane-wise minimum of the packed double-precision (64-bit)
/// floating-point elements in `a` and `b` under zeromask `k`: a lane whose mask
/// bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_pd&expand=3752)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vminpd))]
pub unsafe fn _mm_maskz_min_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let computed = _mm_min_pd(a, b).as_f64x2();
    // Masked-off lanes fall back to an all-zero vector.
    let merged = simd_select_bitmask(k, computed, _mm_setzero_pd().as_f64x2());
    transmute(merged)
}

/// Returns the lane-wise minimum of the packed unsigned 32-bit integers in `a`
/// and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_epu32&expand=3732)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpminud))]
pub unsafe fn _mm512_min_epu32(a: __m512i, b: __m512i) -> __m512i {
    // Reinterpret both operands as sixteen unsigned 32-bit lanes before comparing.
    let (lhs, rhs) = (a.as_u32x16(), b.as_u32x16());
    transmute(vpminud(lhs, rhs))
}

/// Takes the lane-wise minimum of the packed unsigned 32-bit integers in `a`
/// and `b`, then merges it with `src` under writemask `k`: a lane whose mask
/// bit is not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_epu32&expand=3730)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpminud))]
pub unsafe fn _mm512_mask_min_epu32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    // Lanes used wherever the corresponding bit of `k` is clear.
    let passthrough = src.as_u32x16();
    let computed = _mm512_min_epu32(a, b).as_u32x16();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Takes the lane-wise minimum of the packed unsigned 32-bit integers in `a`
/// and `b` under zeromask `k`: a lane whose mask bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_epu32&expand=3731)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpminud))]
pub unsafe fn _mm512_maskz_min_epu32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    // All-zero vector supplying the masked-off lanes.
    let zeroed = _mm512_setzero_si512().as_u32x16();
    let computed = _mm512_min_epu32(a, b).as_u32x16();
    transmute(simd_select_bitmask(k, computed, zeroed))
}

/// Takes the lane-wise minimum of the packed unsigned 32-bit integers in `a`
/// and `b`, then merges it with `src` under writemask `k`: a lane whose mask
/// bit is not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_epu32&expand=3727)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpminud))]
pub unsafe fn _mm256_mask_min_epu32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Select per lane between the unmasked minimum and the pass-through `src`.
    let merged = simd_select_bitmask(k, _mm256_min_epu32(a, b).as_u32x8(), src.as_u32x8());
    transmute(merged)
}

/// Takes the lane-wise minimum of the packed unsigned 32-bit integers in `a`
/// and `b` under zeromask `k`: a lane whose mask bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_epu32&expand=3728)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpminud))]
pub unsafe fn _mm256_maskz_min_epu32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    let computed = _mm256_min_epu32(a, b).as_u32x8();
    // Masked-off lanes fall back to an all-zero vector.
    let merged = simd_select_bitmask(k, computed, _mm256_setzero_si256().as_u32x8());
    transmute(merged)
}

/// Takes the lane-wise minimum of the packed unsigned 32-bit integers in `a`
/// and `b`, then merges it with `src` under writemask `k`: a lane whose mask
/// bit is not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_epu32&expand=3724)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpminud))]
pub unsafe fn _mm_mask_min_epu32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Lanes used wherever the corresponding bit of `k` is clear.
    let passthrough = src.as_u32x4();
    let computed = _mm_min_epu32(a, b).as_u32x4();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Takes the lane-wise minimum of the packed unsigned 32-bit integers in `a`
/// and `b` under zeromask `k`: a lane whose mask bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_epu32&expand=3725)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpminud))]
pub unsafe fn _mm_maskz_min_epu32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // All-zero vector supplying the masked-off lanes.
    let zeroed = _mm_setzero_si128().as_u32x4();
    let computed = _mm_min_epu32(a, b).as_u32x4();
    transmute(simd_select_bitmask(k, computed, zeroed))
}

/// Returns the lane-wise minimum of the packed unsigned 64-bit integers in `a`
/// and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_epu64&expand=3741)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpminuq))]
pub unsafe fn _mm512_min_epu64(a: __m512i, b: __m512i) -> __m512i {
    // Reinterpret both operands as eight unsigned 64-bit lanes before comparing.
    let result = vpminuq(a.as_u64x8(), b.as_u64x8());
    transmute(result)
}

/// Takes the lane-wise minimum of the packed unsigned 64-bit integers in `a`
/// and `b`, then merges it with `src` under writemask `k`: a lane whose mask
/// bit is not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_epu64&expand=3739)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpminuq))]
pub unsafe fn _mm512_mask_min_epu64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    // Select per lane between the unmasked minimum and the pass-through `src`.
    let merged = simd_select_bitmask(k, _mm512_min_epu64(a, b).as_u64x8(), src.as_u64x8());
    transmute(merged)
}

/// Takes the lane-wise minimum of the packed unsigned 64-bit integers in `a`
/// and `b` under zeromask `k`: a lane whose mask bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_epu64&expand=3740)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpminuq))]
pub unsafe fn _mm512_maskz_min_epu64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    let computed = _mm512_min_epu64(a, b).as_u64x8();
    // Masked-off lanes fall back to an all-zero vector.
    let merged = simd_select_bitmask(k, computed, _mm512_setzero_si512().as_u64x8());
    transmute(merged)
}

/// Returns the lane-wise minimum of the packed unsigned 64-bit integers in `a`
/// and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_min_epu64&expand=3738)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpminuq))]
pub unsafe fn _mm256_min_epu64(a: __m256i, b: __m256i) -> __m256i {
    // Reinterpret both operands as four unsigned 64-bit lanes before comparing.
    let (lhs, rhs) = (a.as_u64x4(), b.as_u64x4());
    transmute(vpminuq256(lhs, rhs))
}

/// Takes the lane-wise minimum of the packed unsigned 64-bit integers in `a`
/// and `b`, then merges it with `src` under writemask `k`: a lane whose mask
/// bit is not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_min_epu64&expand=3736)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpminuq))]
pub unsafe fn _mm256_mask_min_epu64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Lanes used wherever the corresponding bit of `k` is clear.
    let passthrough = src.as_u64x4();
    let computed = _mm256_min_epu64(a, b).as_u64x4();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Takes the lane-wise minimum of the packed unsigned 64-bit integers in `a`
/// and `b` under zeromask `k`: a lane whose mask bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_min_epu64&expand=3737)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpminuq))]
pub unsafe fn _mm256_maskz_min_epu64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // All-zero vector supplying the masked-off lanes.
    let zeroed = _mm256_setzero_si256().as_u64x4();
    let computed = _mm256_min_epu64(a, b).as_u64x4();
    transmute(simd_select_bitmask(k, computed, zeroed))
}

/// Returns the lane-wise minimum of the packed unsigned 64-bit integers in `a`
/// and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu64&expand=3735)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpminuq))]
pub unsafe fn _mm_min_epu64(a: __m128i, b: __m128i) -> __m128i {
    // Reinterpret both operands as two unsigned 64-bit lanes before comparing.
    let result = vpminuq128(a.as_u64x2(), b.as_u64x2());
    transmute(result)
}

/// Takes the lane-wise minimum of the packed unsigned 64-bit integers in `a`
/// and `b`, then merges it with `src` under writemask `k`: a lane whose mask
/// bit is not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_min_epu64&expand=3733)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpminuq))]
pub unsafe fn _mm_mask_min_epu64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Select per lane between the unmasked minimum and the pass-through `src`.
    let merged = simd_select_bitmask(k, _mm_min_epu64(a, b).as_u64x2(), src.as_u64x2());
    transmute(merged)
}

/// Takes the lane-wise minimum of the packed unsigned 64-bit integers in `a`
/// and `b` under zeromask `k`: a lane whose mask bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_min_epu64&expand=3734)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpminuq))]
pub unsafe fn _mm_maskz_min_epu64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let computed = _mm_min_epu64(a, b).as_u64x2();
    // Masked-off lanes fall back to an all-zero vector.
    let merged = simd_select_bitmask(k, computed, _mm_setzero_si128().as_u64x2());
    transmute(merged)
}

/// Returns the square root of each packed single-precision (32-bit)
/// floating-point element in `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_ps&expand=5371)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsqrtps))]
pub unsafe fn _mm512_sqrt_ps(a: __m512) -> __m512 {
    let lanes = a.as_f32x16();
    // The trailing argument of `vsqrtps` is always `_MM_FROUND_CUR_DIRECTION` here.
    let roots = vsqrtps(lanes, _MM_FROUND_CUR_DIRECTION);
    transmute(roots)
}

/// Takes the square root of each packed single-precision (32-bit)
/// floating-point element in `a`, then merges it with `src` under writemask
/// `k`: a lane whose mask bit is not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_ps&expand=5369)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsqrtps))]
pub unsafe fn _mm512_mask_sqrt_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
    // Lanes used wherever the corresponding bit of `k` is clear.
    let passthrough = src.as_f32x16();
    let roots = _mm512_sqrt_ps(a).as_f32x16();
    transmute(simd_select_bitmask(k, roots, passthrough))
}

/// Takes the square root of each packed single-precision (32-bit)
/// floating-point element in `a` under zeromask `k`: a lane whose mask bit is
/// not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_ps&expand=5370)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsqrtps))]
pub unsafe fn _mm512_maskz_sqrt_ps(k: __mmask16, a: __m512) -> __m512 {
    let roots = _mm512_sqrt_ps(a).as_f32x16();
    // Masked-off lanes fall back to an all-zero vector.
    let merged = simd_select_bitmask(k, roots, _mm512_setzero_ps().as_f32x16());
    transmute(merged)
}

/// Takes the square root of each packed single-precision (32-bit)
/// floating-point element in `a`, then merges it with `src` under writemask
/// `k`: a lane whose mask bit is not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sqrt_ps&expand=5366)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtps))]
pub unsafe fn _mm256_mask_sqrt_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 {
    // Select per lane between the unmasked square root and the pass-through `src`.
    let merged = simd_select_bitmask(k, _mm256_sqrt_ps(a).as_f32x8(), src.as_f32x8());
    transmute(merged)
}

/// Takes the square root of each packed single-precision (32-bit)
/// floating-point element in `a` under zeromask `k`: a lane whose mask bit is
/// not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sqrt_ps&expand=5367)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtps))]
pub unsafe fn _mm256_maskz_sqrt_ps(k: __mmask8, a: __m256) -> __m256 {
    // All-zero vector supplying the masked-off lanes.
    let zeroed = _mm256_setzero_ps().as_f32x8();
    let roots = _mm256_sqrt_ps(a).as_f32x8();
    transmute(simd_select_bitmask(k, roots, zeroed))
}

/// Takes the square root of each packed single-precision (32-bit)
/// floating-point element in `a`, then merges it with `src` under writemask
/// `k`: a lane whose mask bit is not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_ps&expand=5363)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtps))]
pub unsafe fn _mm_mask_sqrt_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 {
    // Lanes used wherever the corresponding bit of `k` is clear.
    let passthrough = src.as_f32x4();
    let roots = _mm_sqrt_ps(a).as_f32x4();
    transmute(simd_select_bitmask(k, roots, passthrough))
}

/// Takes the square root of each packed single-precision (32-bit)
/// floating-point element in `a` under zeromask `k`: a lane whose mask bit is
/// not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_ps&expand=5364)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtps))]
pub unsafe fn _mm_maskz_sqrt_ps(k: __mmask8, a: __m128) -> __m128 {
    let roots = _mm_sqrt_ps(a).as_f32x4();
    // Masked-off lanes fall back to an all-zero vector.
    let merged = simd_select_bitmask(k, roots, _mm_setzero_ps().as_f32x4());
    transmute(merged)
}

/// Returns the square root of each packed double-precision (64-bit)
/// floating-point element in `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_pd&expand=5362)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsqrtpd))]
pub unsafe fn _mm512_sqrt_pd(a: __m512d) -> __m512d {
    let lanes = a.as_f64x8();
    // The trailing argument of `vsqrtpd` is always `_MM_FROUND_CUR_DIRECTION` here.
    let roots = vsqrtpd(lanes, _MM_FROUND_CUR_DIRECTION);
    transmute(roots)
}

/// Takes the square root of each packed double-precision (64-bit)
/// floating-point element in `a`, then merges it with `src` under writemask
/// `k`: a lane whose mask bit is not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_pd&expand=5360)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsqrtpd))]
pub unsafe fn _mm512_mask_sqrt_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
    // Select per lane between the unmasked square root and the pass-through `src`.
    let merged = simd_select_bitmask(k, _mm512_sqrt_pd(a).as_f64x8(), src.as_f64x8());
    transmute(merged)
}

/// Takes the square root of each packed double-precision (64-bit)
/// floating-point element in `a` under zeromask `k`: a lane whose mask bit is
/// not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_pd&expand=5361)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsqrtpd))]
pub unsafe fn _mm512_maskz_sqrt_pd(k: __mmask8, a: __m512d) -> __m512d {
    // All-zero vector supplying the masked-off lanes.
    let zeroed = _mm512_setzero_pd().as_f64x8();
    let roots = _mm512_sqrt_pd(a).as_f64x8();
    transmute(simd_select_bitmask(k, roots, zeroed))
}

/// Takes the square root of each packed double-precision (64-bit)
/// floating-point element in `a`, then merges it with `src` under writemask
/// `k`: a lane whose mask bit is not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sqrt_pd&expand=5357)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtpd))]
pub unsafe fn _mm256_mask_sqrt_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d {
    // Lanes used wherever the corresponding bit of `k` is clear.
    let passthrough = src.as_f64x4();
    let roots = _mm256_sqrt_pd(a).as_f64x4();
    transmute(simd_select_bitmask(k, roots, passthrough))
}

/// Takes the square root of each packed double-precision (64-bit)
/// floating-point element in `a` under zeromask `k`: a lane whose mask bit is
/// not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sqrt_pd&expand=5358)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtpd))]
pub unsafe fn _mm256_maskz_sqrt_pd(k: __mmask8, a: __m256d) -> __m256d {
    let roots = _mm256_sqrt_pd(a).as_f64x4();
    // Masked-off lanes fall back to an all-zero vector.
    let merged = simd_select_bitmask(k, roots, _mm256_setzero_pd().as_f64x4());
    transmute(merged)
}

/// Takes the square root of each packed double-precision (64-bit)
/// floating-point element in `a`, then merges it with `src` under writemask
/// `k`: a lane whose mask bit is not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sqrt_pd&expand=5354)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtpd))]
pub unsafe fn _mm_mask_sqrt_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d {
    // Select per lane between the unmasked square root and the pass-through `src`.
    let merged = simd_select_bitmask(k, _mm_sqrt_pd(a).as_f64x2(), src.as_f64x2());
    transmute(merged)
}

/// Takes the square root of each packed double-precision (64-bit)
/// floating-point element in `a` under zeromask `k`: a lane whose mask bit is
/// not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sqrt_pd&expand=5355)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vsqrtpd))]
pub unsafe fn _mm_maskz_sqrt_pd(k: __mmask8, a: __m128d) -> __m128d {
    // All-zero vector supplying the masked-off lanes.
    let zeroed = _mm_setzero_pd().as_f64x2();
    let roots = _mm_sqrt_pd(a).as_f64x2();
    transmute(simd_select_bitmask(k, roots, zeroed))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements in
/// `a` and `b`, adds the intermediate result to the packed elements in `c`, and
/// returns the results.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_ps&expand=2557)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd))] // matches vfmadd132ps, vfmadd213ps or vfmadd231ps
pub unsafe fn _mm512_fmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 {
    let (factor_a, factor_b) = (a.as_f32x16(), b.as_f32x16());
    let addend = c.as_f32x16();
    transmute(vfmadd132ps(factor_a, factor_b, addend))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements in
/// `a` and `b` and adds the intermediate result to the packed elements in `c`,
/// under writemask `k`: a lane whose mask bit is not set is copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_ps&expand=2558)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd))] // matches vfmadd132ps, vfmadd213ps or vfmadd231ps
pub unsafe fn _mm512_mask_fmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 {
    // The first operand doubles as the pass-through for masked-off lanes.
    let passthrough = a.as_f32x16();
    let computed = _mm512_fmadd_ps(a, b, c).as_f32x16();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements in
/// `a` and `b` and adds the intermediate result to the packed elements in `c`,
/// under zeromask `k`: a lane whose mask bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_ps&expand=2560)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd))] // matches vfmadd132ps, vfmadd213ps or vfmadd231ps
pub unsafe fn _mm512_maskz_fmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 {
    let computed = _mm512_fmadd_ps(a, b, c).as_f32x16();
    // Masked-off lanes fall back to an all-zero vector.
    let merged = simd_select_bitmask(k, computed, _mm512_setzero_ps().as_f32x16());
    transmute(merged)
}

/// Multiplies the packed single-precision (32-bit) floating-point elements in
/// `a` and `b` and adds the intermediate result to the packed elements in `c`,
/// under writemask `k`: a lane whose mask bit is not set is copied from `c`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_ps&expand=2559)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd))] // matches vfmadd132ps, vfmadd213ps or vfmadd231ps
pub unsafe fn _mm512_mask3_fmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 {
    // Here the addend `c` (not `a`) is the pass-through for masked-off lanes.
    let merged = simd_select_bitmask(k, _mm512_fmadd_ps(a, b, c).as_f32x16(), c.as_f32x16());
    transmute(merged)
}

/// Multiplies the packed single-precision (32-bit) floating-point elements in
/// `a` and `b` and adds the intermediate result to the packed elements in `c`,
/// under writemask `k`: a lane whose mask bit is not set is copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_ps&expand=2554)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))] // matches vfmadd132ps, vfmadd213ps or vfmadd231ps
pub unsafe fn _mm256_mask_fmadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 {
    // The first operand doubles as the pass-through for masked-off lanes.
    let merged = simd_select_bitmask(k, _mm256_fmadd_ps(a, b, c).as_f32x8(), a.as_f32x8());
    transmute(merged)
}

/// Multiplies the packed single-precision (32-bit) floating-point elements in
/// `a` and `b` and adds the intermediate result to the packed elements in `c`,
/// under zeromask `k`: a lane whose mask bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_ps&expand=2556)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))] // matches vfmadd132ps, vfmadd213ps or vfmadd231ps
pub unsafe fn _mm256_maskz_fmadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 {
    // All-zero vector supplying the masked-off lanes.
    let zeroed = _mm256_setzero_ps().as_f32x8();
    let computed = _mm256_fmadd_ps(a, b, c).as_f32x8();
    transmute(simd_select_bitmask(k, computed, zeroed))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements in
/// `a` and `b` and adds the intermediate result to the packed elements in `c`,
/// under writemask `k`: a lane whose mask bit is not set is copied from `c`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_ps&expand=2555)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))] // matches vfmadd132ps, vfmadd213ps or vfmadd231ps
pub unsafe fn _mm256_mask3_fmadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 {
    // Here the addend `c` (not `a`) is the pass-through for masked-off lanes.
    let passthrough = c.as_f32x8();
    let computed = _mm256_fmadd_ps(a, b, c).as_f32x8();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements in
/// `a` and `b` and adds the intermediate result to the packed elements in `c`,
/// under writemask `k`: a lane whose mask bit is not set is copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_ps&expand=2550)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))] // matches vfmadd132ps, vfmadd213ps or vfmadd231ps
pub unsafe fn _mm_mask_fmadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
    // The first operand doubles as the pass-through for masked-off lanes.
    let passthrough = a.as_f32x4();
    let computed = _mm_fmadd_ps(a, b, c).as_f32x4();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Multiplies the packed single-precision (32-bit) floating-point lanes of `a` and `b`, adds
/// the matching lane of `c` to each product, and stores the result in `dst` under zeromask `k`
/// (a lane is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_ps&expand=2552)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
pub unsafe fn _mm_maskz_fmadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
    // Compute every lane unmasked, then let `k` pick between that result and zero.
    let product_sum = _mm_fmadd_ps(a, b, c).as_f32x4();
    let cleared = _mm_setzero_ps().as_f32x4();
    let dst = simd_select_bitmask(k, product_sum, cleared);
    transmute(dst)
}

/// Multiplies the packed single-precision (32-bit) floating-point lanes of `a` and `b`, adds
/// the matching lane of `c` to each product, and stores the result in `dst` under writemask `k`
/// (a lane is copied from `c` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_ps&expand=2551)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
pub unsafe fn _mm_mask3_fmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
    // Lanes with a clear mask bit keep the value they had in `c`.
    let passthrough = c.as_f32x4();
    let product_sum = _mm_fmadd_ps(a, b, c).as_f32x4();
    let dst = simd_select_bitmask(k, product_sum, passthrough);
    transmute(dst)
}

/// Multiplies the packed double-precision (64-bit) floating-point lanes of `a` and `b`, adds
/// the matching lane of `c` to each product, and stores the result in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_pd&expand=2545)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
pub unsafe fn _mm512_fmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d {
    // Reinterpret the operands as lane vectors before handing them to the binding.
    let (multiplicand, multiplier, addend) = (a.as_f64x8(), b.as_f64x8(), c.as_f64x8());
    transmute(vfmadd132pd(multiplicand, multiplier, addend))
}

/// Multiplies the packed double-precision (64-bit) floating-point lanes of `a` and `b`, adds
/// the matching lane of `c` to each product, and stores the result in `dst` under writemask `k`
/// (a lane is copied from `a` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_pd&expand=2546)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
pub unsafe fn _mm512_mask_fmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d {
    // Lanes with a clear mask bit keep the value they had in `a`.
    let passthrough = a.as_f64x8();
    let product_sum = _mm512_fmadd_pd(a, b, c).as_f64x8();
    let dst = simd_select_bitmask(k, product_sum, passthrough);
    transmute(dst)
}

/// Multiplies the packed double-precision (64-bit) floating-point lanes of `a` and `b`, adds
/// the matching lane of `c` to each product, and stores the result in `dst` under zeromask `k`
/// (a lane is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_pd&expand=2548)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
pub unsafe fn _mm512_maskz_fmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d {
    // Compute every lane unmasked, then let `k` pick between that result and zero.
    let product_sum = _mm512_fmadd_pd(a, b, c).as_f64x8();
    let cleared = _mm512_setzero_pd().as_f64x8();
    let dst = simd_select_bitmask(k, product_sum, cleared);
    transmute(dst)
}

/// Multiplies the packed double-precision (64-bit) floating-point lanes of `a` and `b`, adds
/// the matching lane of `c` to each product, and stores the result in `dst` under writemask `k`
/// (a lane is copied from `c` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_pd&expand=2547)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
pub unsafe fn _mm512_mask3_fmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d {
    // Lanes with a clear mask bit keep the value they had in `c`.
    let passthrough = c.as_f64x8();
    let product_sum = _mm512_fmadd_pd(a, b, c).as_f64x8();
    let dst = simd_select_bitmask(k, product_sum, passthrough);
    transmute(dst)
}

/// Multiplies the packed double-precision (64-bit) floating-point lanes of `a` and `b`, adds
/// the matching lane of `c` to each product, and stores the result in `dst` under writemask `k`
/// (a lane is copied from `a` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmadd_pd&expand=2542)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
pub unsafe fn _mm256_mask_fmadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d {
    // Lanes with a clear mask bit keep the value they had in `a`.
    let passthrough = a.as_f64x4();
    let product_sum = _mm256_fmadd_pd(a, b, c).as_f64x4();
    let dst = simd_select_bitmask(k, product_sum, passthrough);
    transmute(dst)
}

/// Multiplies the packed double-precision (64-bit) floating-point lanes of `a` and `b`, adds
/// the matching lane of `c` to each product, and stores the result in `dst` under zeromask `k`
/// (a lane is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmadd_pd&expand=2544)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
pub unsafe fn _mm256_maskz_fmadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    // Compute every lane unmasked, then let `k` pick between that result and zero.
    let product_sum = _mm256_fmadd_pd(a, b, c).as_f64x4();
    let cleared = _mm256_setzero_pd().as_f64x4();
    let dst = simd_select_bitmask(k, product_sum, cleared);
    transmute(dst)
}

/// Multiplies the packed double-precision (64-bit) floating-point lanes of `a` and `b`, adds
/// the matching lane of `c` to each product, and stores the result in `dst` under writemask `k`
/// (a lane is copied from `c` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmadd_pd&expand=2543)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
pub unsafe fn _mm256_mask3_fmadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d {
    // Lanes with a clear mask bit keep the value they had in `c`.
    let passthrough = c.as_f64x4();
    let product_sum = _mm256_fmadd_pd(a, b, c).as_f64x4();
    let dst = simd_select_bitmask(k, product_sum, passthrough);
    transmute(dst)
}

/// Multiplies the packed double-precision (64-bit) floating-point lanes of `a` and `b`, adds
/// the matching lane of `c` to each product, and stores the result in `dst` under writemask `k`
/// (a lane is copied from `a` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmadd_pd&expand=2538)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
pub unsafe fn _mm_mask_fmadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
    // Lanes with a clear mask bit keep the value they had in `a`.
    let passthrough = a.as_f64x2();
    let product_sum = _mm_fmadd_pd(a, b, c).as_f64x2();
    let dst = simd_select_bitmask(k, product_sum, passthrough);
    transmute(dst)
}

/// Multiplies the packed double-precision (64-bit) floating-point lanes of `a` and `b`, adds
/// the matching lane of `c` to each product, and stores the result in `dst` under zeromask `k`
/// (a lane is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmadd_pd&expand=2540)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
pub unsafe fn _mm_maskz_fmadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    // Compute every lane unmasked, then let `k` pick between that result and zero.
    let product_sum = _mm_fmadd_pd(a, b, c).as_f64x2();
    let cleared = _mm_setzero_pd().as_f64x2();
    let dst = simd_select_bitmask(k, product_sum, cleared);
    transmute(dst)
}

/// Multiplies the packed double-precision (64-bit) floating-point lanes of `a` and `b`, adds
/// the matching lane of `c` to each product, and stores the result in `dst` under writemask `k`
/// (a lane is copied from `c` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmadd_pd&expand=2539)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
pub unsafe fn _mm_mask3_fmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
    // Lanes with a clear mask bit keep the value they had in `c`.
    let passthrough = c.as_f64x2();
    let product_sum = _mm_fmadd_pd(a, b, c).as_f64x2();
    let dst = simd_select_bitmask(k, product_sum, passthrough);
    transmute(dst)
}

/// Multiplies the packed single-precision (32-bit) floating-point lanes of `a` and `b`,
/// subtracts the matching lane of `c` from each product, and stores the result in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_ps&expand=2643)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub
pub unsafe fn _mm512_fmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 {
    // The subtraction is expressed as a multiply-add of `0.0 - c`; all-zero bits are a valid
    // `f32x16`.
    // NOTE(review): `0.0 - c` yields `+0.0` for `c == +0.0`, whereas a true negation would yield
    // `-0.0` -- confirm this signed-zero difference is acceptable.
    let negated_c = simd_sub(mem::zeroed::<f32x16>(), c.as_f32x16());
    let product_diff = vfmadd132ps(a.as_f32x16(), b.as_f32x16(), negated_c);
    transmute(product_diff)
}

/// Multiplies the packed single-precision (32-bit) floating-point lanes of `a` and `b`,
/// subtracts the matching lane of `c` from each product, and stores the result in `dst` under
/// writemask `k` (a lane is copied from `a` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_ps&expand=2644)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub
pub unsafe fn _mm512_mask_fmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 {
    // Lanes with a clear mask bit keep the value they had in `a`.
    let passthrough = a.as_f32x16();
    let product_diff = _mm512_fmsub_ps(a, b, c).as_f32x16();
    let dst = simd_select_bitmask(k, product_diff, passthrough);
    transmute(dst)
}

/// Multiplies the packed single-precision (32-bit) floating-point lanes of `a` and `b`,
/// subtracts the matching lane of `c` from each product, and stores the result in `dst` under
/// zeromask `k` (a lane is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_ps&expand=2646)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub
pub unsafe fn _mm512_maskz_fmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 {
    // Compute every lane unmasked, then let `k` pick between that result and zero.
    let product_diff = _mm512_fmsub_ps(a, b, c).as_f32x16();
    let cleared = _mm512_setzero_ps().as_f32x16();
    let dst = simd_select_bitmask(k, product_diff, cleared);
    transmute(dst)
}

/// Multiplies the packed single-precision (32-bit) floating-point lanes of `a` and `b`,
/// subtracts the matching lane of `c` from each product, and stores the result in `dst` under
/// writemask `k` (a lane is copied from `c` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_ps&expand=2645)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub
pub unsafe fn _mm512_mask3_fmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 {
    // Lanes with a clear mask bit keep the value they had in `c`.
    let passthrough = c.as_f32x16();
    let product_diff = _mm512_fmsub_ps(a, b, c).as_f32x16();
    let dst = simd_select_bitmask(k, product_diff, passthrough);
    transmute(dst)
}

/// Multiplies the packed single-precision (32-bit) floating-point lanes of `a` and `b`,
/// subtracts the matching lane of `c` from each product, and stores the result in `dst` under
/// writemask `k` (a lane is copied from `a` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsub_ps&expand=2640)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub
pub unsafe fn _mm256_mask_fmsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 {
    // Lanes with a clear mask bit keep the value they had in `a`.
    let passthrough = a.as_f32x8();
    let product_diff = _mm256_fmsub_ps(a, b, c).as_f32x8();
    let dst = simd_select_bitmask(k, product_diff, passthrough);
    transmute(dst)
}

/// Multiplies the packed single-precision (32-bit) floating-point lanes of `a` and `b`,
/// subtracts the matching lane of `c` from each product, and stores the result in `dst` under
/// zeromask `k` (a lane is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsub_ps&expand=2642)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub
pub unsafe fn _mm256_maskz_fmsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 {
    // Compute every lane unmasked, then let `k` pick between that result and zero.
    let product_diff = _mm256_fmsub_ps(a, b, c).as_f32x8();
    let cleared = _mm256_setzero_ps().as_f32x8();
    let dst = simd_select_bitmask(k, product_diff, cleared);
    transmute(dst)
}

/// Multiplies the packed single-precision (32-bit) floating-point lanes of `a` and `b`,
/// subtracts the matching lane of `c` from each product, and stores the result in `dst` under
/// writemask `k` (a lane is copied from `c` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsub_ps&expand=2641)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub
pub unsafe fn _mm256_mask3_fmsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 {
    // Lanes with a clear mask bit keep the value they had in `c`.
    let passthrough = c.as_f32x8();
    let product_diff = _mm256_fmsub_ps(a, b, c).as_f32x8();
    let dst = simd_select_bitmask(k, product_diff, passthrough);
    transmute(dst)
}

/// Multiplies the packed single-precision (32-bit) floating-point lanes of `a` and `b`,
/// subtracts the matching lane of `c` from each product, and stores the result in `dst` under
/// writemask `k` (a lane is copied from `a` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_ps&expand=2636)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub
pub unsafe fn _mm_mask_fmsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
    // Lanes with a clear mask bit keep the value they had in `a`.
    let passthrough = a.as_f32x4();
    let product_diff = _mm_fmsub_ps(a, b, c).as_f32x4();
    let dst = simd_select_bitmask(k, product_diff, passthrough);
    transmute(dst)
}

/// Multiplies the packed single-precision (32-bit) floating-point lanes of `a` and `b`,
/// subtracts the matching lane of `c` from each product, and stores the result in `dst` under
/// zeromask `k` (a lane is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_ps&expand=2638)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub
pub unsafe fn _mm_maskz_fmsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
    // Compute every lane unmasked, then let `k` pick between that result and zero.
    let product_diff = _mm_fmsub_ps(a, b, c).as_f32x4();
    let cleared = _mm_setzero_ps().as_f32x4();
    let dst = simd_select_bitmask(k, product_diff, cleared);
    transmute(dst)
}

/// Multiplies the packed single-precision (32-bit) floating-point lanes of `a` and `b`,
/// subtracts the matching lane of `c` from each product, and stores the result in `dst` under
/// writemask `k` (a lane is copied from `c` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_ps&expand=2637)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generate vfmadd, gcc generate vfmsub
pub unsafe fn _mm_mask3_fmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
    // Lanes with a clear mask bit keep the value they had in `c`.
    let passthrough = c.as_f32x4();
    let product_diff = _mm_fmsub_ps(a, b, c).as_f32x4();
    let dst = simd_select_bitmask(k, product_diff, passthrough);
    transmute(dst)
}

/// Multiplies the packed double-precision (64-bit) floating-point lanes of `a` and `b`,
/// subtracts the matching lane of `c` from each product, and stores the result in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_pd&expand=2631)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub
pub unsafe fn _mm512_fmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d {
    // The subtraction is expressed as a multiply-add of `0.0 - c`; all-zero bits are a valid
    // `f64x8`.
    // NOTE(review): `0.0 - c` yields `+0.0` for `c == +0.0`, whereas a true negation would yield
    // `-0.0` -- confirm this signed-zero difference is acceptable.
    let negated_c = simd_sub(mem::zeroed::<f64x8>(), c.as_f64x8());
    let product_diff = vfmadd132pd(a.as_f64x8(), b.as_f64x8(), negated_c);
    transmute(product_diff)
}

/// Multiplies the packed double-precision (64-bit) floating-point lanes of `a` and `b`,
/// subtracts the matching lane of `c` from each product, and stores the result in `dst` under
/// writemask `k` (a lane is copied from `a` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_pd&expand=2632)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub
pub unsafe fn _mm512_mask_fmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d {
    // Lanes with a clear mask bit keep the value they had in `a`.
    let passthrough = a.as_f64x8();
    let product_diff = _mm512_fmsub_pd(a, b, c).as_f64x8();
    let dst = simd_select_bitmask(k, product_diff, passthrough);
    transmute(dst)
}

/// Multiplies the packed double-precision (64-bit) floating-point lanes of `a` and `b`,
/// subtracts the matching lane of `c` from each product, and stores the result in `dst` under
/// zeromask `k` (a lane is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_pd&expand=2634)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub
pub unsafe fn _mm512_maskz_fmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d {
    // Compute every lane unmasked, then let `k` pick between that result and zero.
    let product_diff = _mm512_fmsub_pd(a, b, c).as_f64x8();
    let cleared = _mm512_setzero_pd().as_f64x8();
    let dst = simd_select_bitmask(k, product_diff, cleared);
    transmute(dst)
}

/// Multiplies the packed double-precision (64-bit) floating-point lanes of `a` and `b`,
/// subtracts the matching lane of `c` from each product, and stores the result in `dst` under
/// writemask `k` (a lane is copied from `c` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_pd&expand=2633)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub
pub unsafe fn _mm512_mask3_fmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d {
    // Lanes with a clear mask bit keep the value they had in `c`.
    let passthrough = c.as_f64x8();
    let product_diff = _mm512_fmsub_pd(a, b, c).as_f64x8();
    let dst = simd_select_bitmask(k, product_diff, passthrough);
    transmute(dst)
}

/// Multiplies the packed double-precision (64-bit) floating-point lanes of `a` and `b`,
/// subtracts the matching lane of `c` from each product, and stores the result in `dst` under
/// writemask `k` (a lane is copied from `a` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsub_pd&expand=2628)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub
pub unsafe fn _mm256_mask_fmsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d {
    // Lanes with a clear mask bit keep the value they had in `a`.
    let passthrough = a.as_f64x4();
    let product_diff = _mm256_fmsub_pd(a, b, c).as_f64x4();
    let dst = simd_select_bitmask(k, product_diff, passthrough);
    transmute(dst)
}

/// Multiplies the packed double-precision (64-bit) floating-point lanes of `a` and `b`,
/// subtracts the matching lane of `c` from each product, and stores the result in `dst` under
/// zeromask `k` (a lane is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsub_pd&expand=2630)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub
pub unsafe fn _mm256_maskz_fmsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    // Compute every lane unmasked, then let `k` pick between that result and zero.
    let product_diff = _mm256_fmsub_pd(a, b, c).as_f64x4();
    let cleared = _mm256_setzero_pd().as_f64x4();
    let dst = simd_select_bitmask(k, product_diff, cleared);
    transmute(dst)
}

/// Multiplies the packed double-precision (64-bit) floating-point lanes of `a` and `b`,
/// subtracts the matching lane of `c` from each product, and stores the result in `dst` under
/// writemask `k` (a lane is copied from `c` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsub_pd&expand=2629)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub
pub unsafe fn _mm256_mask3_fmsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d {
    // Lanes with a clear mask bit keep the value they had in `c`.
    let passthrough = c.as_f64x4();
    let product_diff = _mm256_fmsub_pd(a, b, c).as_f64x4();
    let dst = simd_select_bitmask(k, product_diff, passthrough);
    transmute(dst)
}

/// Multiplies the packed double-precision (64-bit) floating-point lanes of `a` and `b`,
/// subtracts the matching lane of `c` from each product, and stores the result in `dst` under
/// writemask `k` (a lane is copied from `a` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsub_pd&expand=2624)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub
pub unsafe fn _mm_mask_fmsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
    // Lanes with a clear mask bit keep the value they had in `a`.
    let passthrough = a.as_f64x2();
    let product_diff = _mm_fmsub_pd(a, b, c).as_f64x2();
    let dst = simd_select_bitmask(k, product_diff, passthrough);
    transmute(dst)
}

/// Multiplies the packed double-precision (64-bit) floating-point lanes of `a` and `b`,
/// subtracts the matching lane of `c` from each product, and stores the result in `dst` under
/// zeromask `k` (a lane is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsub_pd&expand=2626)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub
pub unsafe fn _mm_maskz_fmsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    // Compute every lane unmasked, then let `k` pick between that result and zero.
    let product_diff = _mm_fmsub_pd(a, b, c).as_f64x2();
    let cleared = _mm_setzero_pd().as_f64x2();
    let dst = simd_select_bitmask(k, product_diff, cleared);
    transmute(dst)
}

/// Multiplies the packed double-precision (64-bit) floating-point lanes of `a` and `b`,
/// subtracts the matching lane of `c` from each product, and stores the result in `dst` under
/// writemask `k` (a lane is copied from `c` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsub_pd&expand=2625)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsub))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang fmadd, gcc fmsub
pub unsafe fn _mm_mask3_fmsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
    // Lanes with a clear mask bit keep the value they had in `c`.
    let passthrough = c.as_f64x2();
    let product_diff = _mm_fmsub_pd(a, b, c).as_f64x2();
    let dst = simd_select_bitmask(k, product_diff, passthrough);
    transmute(dst)
}

/// Multiplies the packed single-precision (32-bit) floating-point lanes of `a` and `b`,
/// alternately adds and subtracts the matching lane of `c` to/from each product, and stores the
/// result in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_ps&expand=2611)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
pub unsafe fn _mm512_fmaddsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 {
    let (multiplicand, multiplier, addend) = (a.as_f32x16(), b.as_f32x16(), c.as_f32x16());
    // NOTE(review): the last argument is presumably the rounding control, with
    // `_MM_FROUND_CUR_DIRECTION` selecting the current rounding mode -- confirm against the
    // binding's declaration.
    let alternating = vfmaddsub213ps(multiplicand, multiplier, addend, _MM_FROUND_CUR_DIRECTION);
    transmute(alternating)
}

/// Multiplies the packed single-precision (32-bit) floating-point lanes of `a` and `b`,
/// alternately adds and subtracts the matching lane of `c` to/from each product, and stores the
/// result in `dst` under writemask `k` (a lane is copied from `a` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_ps&expand=2612)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
pub unsafe fn _mm512_mask_fmaddsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 {
    // Lanes with a clear mask bit keep the value they had in `a`.
    let passthrough = a.as_f32x16();
    let alternating = _mm512_fmaddsub_ps(a, b, c).as_f32x16();
    let dst = simd_select_bitmask(k, alternating, passthrough);
    transmute(dst)
}

/// Multiplies the packed single-precision (32-bit) floating-point lanes of `a` and `b`,
/// alternately adds and subtracts the matching lane of `c` to/from each product, and stores the
/// result in `dst` under zeromask `k` (a lane is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_ps&expand=2614)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
pub unsafe fn _mm512_maskz_fmaddsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 {
    // Compute every lane unmasked, then let `k` pick between that result and zero.
    let alternating = _mm512_fmaddsub_ps(a, b, c).as_f32x16();
    let cleared = _mm512_setzero_ps().as_f32x16();
    let dst = simd_select_bitmask(k, alternating, cleared);
    transmute(dst)
}

/// Multiplies the packed single-precision (32-bit) floating-point lanes of `a` and `b`,
/// alternately adds and subtracts the matching lane of `c` to/from each product, and stores the
/// result in `dst` under writemask `k` (a lane is copied from `c` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_ps&expand=2613)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
pub unsafe fn _mm512_mask3_fmaddsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 {
    // Lanes with a clear mask bit keep the value they had in `c`.
    let passthrough = c.as_f32x16();
    let alternating = _mm512_fmaddsub_ps(a, b, c).as_f32x16();
    let dst = simd_select_bitmask(k, alternating, passthrough);
    transmute(dst)
}

/// Multiplies the packed single-precision (32-bit) floating-point lanes of `a` and `b`,
/// alternately adds and subtracts the matching lane of `c` to/from each product, and stores the
/// result in `dst` under writemask `k` (a lane is copied from `a` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmaddsub_ps&expand=2608)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
pub unsafe fn _mm256_mask_fmaddsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 {
    // Lanes with a clear mask bit keep the value they had in `a`.
    let passthrough = a.as_f32x8();
    let alternating = _mm256_fmaddsub_ps(a, b, c).as_f32x8();
    let dst = simd_select_bitmask(k, alternating, passthrough);
    transmute(dst)
}

/// Multiplies the packed single-precision (32-bit) floating-point lanes of `a` and `b`,
/// alternately adds and subtracts the matching lane of `c` to/from each product, and stores the
/// result in `dst` under zeromask `k` (a lane is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmaddsub_ps&expand=2610)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
pub unsafe fn _mm256_maskz_fmaddsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 {
    // Compute every lane unmasked, then let `k` pick between that result and zero.
    let alternating = _mm256_fmaddsub_ps(a, b, c).as_f32x8();
    let cleared = _mm256_setzero_ps().as_f32x8();
    let dst = simd_select_bitmask(k, alternating, cleared);
    transmute(dst)
}

/// Multiplies the packed single-precision (32-bit) floating-point lanes of `a` and `b`,
/// alternately adds and subtracts the matching lane of `c` to/from each product, and stores the
/// result in `dst` under writemask `k` (a lane is copied from `c` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmaddsub_ps&expand=2609)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
pub unsafe fn _mm256_mask3_fmaddsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 {
    // Lanes with a clear mask bit keep the value they had in `c`.
    let passthrough = c.as_f32x8();
    let alternating = _mm256_fmaddsub_ps(a, b, c).as_f32x8();
    let dst = simd_select_bitmask(k, alternating, passthrough);
    transmute(dst)
}

/// Multiplies the packed single-precision (32-bit) floating-point lanes of `a` and `b`,
/// alternately adds and subtracts the matching lane of `c` to/from each product, and stores the
/// result in `dst` under writemask `k` (a lane is copied from `a` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmaddsub_ps&expand=2604)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
pub unsafe fn _mm_mask_fmaddsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
    // Lanes with a clear mask bit keep the value they had in `a`.
    let passthrough = a.as_f32x4();
    let alternating = _mm_fmaddsub_ps(a, b, c).as_f32x4();
    let dst = simd_select_bitmask(k, alternating, passthrough);
    transmute(dst)
}

/// Multiplies the packed single-precision (32-bit) floating-point lanes of `a` and `b`,
/// alternately adds and subtracts the matching lane of `c` to/from each product, and stores the
/// result in `dst` under zeromask `k` (a lane is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmaddsub_ps&expand=2606)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
pub unsafe fn _mm_maskz_fmaddsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
    // Compute every lane unmasked, then let `k` pick between that result and zero.
    let alternating = _mm_fmaddsub_ps(a, b, c).as_f32x4();
    let cleared = _mm_setzero_ps().as_f32x4();
    let dst = simd_select_bitmask(k, alternating, cleared);
    transmute(dst)
}

/// Multiplies the packed single-precision (32-bit) floating-point lanes of `a` and `b`,
/// alternately adds and subtracts the matching lane of `c` to/from each product, and stores the
/// result in `dst` under writemask `k` (a lane is copied from `c` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmaddsub_ps&expand=2605)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
pub unsafe fn _mm_mask3_fmaddsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
    // Lanes with a clear mask bit keep the value they had in `c`.
    let passthrough = c.as_f32x4();
    let alternating = _mm_fmaddsub_ps(a, b, c).as_f32x4();
    let dst = simd_select_bitmask(k, alternating, passthrough);
    transmute(dst)
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, alternately adds and subtracts the packed elements of `c` to/from the products, and writes the results to `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_pd&expand=2599)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
pub unsafe fn _mm512_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d {
    let (lhs, rhs, addend) = (a.as_f64x8(), b.as_f64x8(), c.as_f64x8());
    // Use whatever rounding mode is currently configured.
    let fused = vfmaddsub213pd(lhs, rhs, addend, _MM_FROUND_CUR_DIRECTION);
    transmute(fused)
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, alternately adds and subtracts the packed elements of `c` to/from the products, and writes the results to `dst` under writemask `k` (a lane is copied from `a` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_pd&expand=2600)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
pub unsafe fn _mm512_mask_fmaddsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d {
    // Lanes with a clear mask bit fall back to the corresponding lane of `a`.
    let passthrough = a.as_f64x8();
    let computed = _mm512_fmaddsub_pd(a, b, c).as_f64x8();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, alternately adds and subtracts the packed elements of `c` to/from the products, and writes the results to `dst` under zeromask `k` (a lane is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_pd&expand=2602)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
pub unsafe fn _mm512_maskz_fmaddsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d {
    let computed = _mm512_fmaddsub_pd(a, b, c).as_f64x8();
    // Lanes with a clear mask bit are taken from an all-zero vector.
    transmute(simd_select_bitmask(
        k,
        computed,
        _mm512_setzero_pd().as_f64x8(),
    ))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_pd&expand=2601)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
pub unsafe fn _mm512_mask3_fmaddsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d {
    let fmaddsub = _mm512_fmaddsub_pd(a, b, c).as_f64x8();
    // Lanes with a clear mask bit keep the corresponding lane of `c`.
    transmute(simd_select_bitmask(k, fmaddsub, c.as_f64x8()))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, alternately adds and subtracts the packed elements of `c` to/from the products, and writes the results to `dst` under writemask `k` (a lane is copied from `a` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmaddsub_pd&expand=2596)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
pub unsafe fn _mm256_mask_fmaddsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d {
    // Lanes with a clear mask bit fall back to the corresponding lane of `a`.
    let passthrough = a.as_f64x4();
    let computed = _mm256_fmaddsub_pd(a, b, c).as_f64x4();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, alternately adds and subtracts the packed elements of `c` to/from the products, and writes the results to `dst` under zeromask `k` (a lane is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmaddsub_pd&expand=2598)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
pub unsafe fn _mm256_maskz_fmaddsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    let computed = _mm256_fmaddsub_pd(a, b, c).as_f64x4();
    // Lanes with a clear mask bit are taken from an all-zero vector.
    transmute(simd_select_bitmask(
        k,
        computed,
        _mm256_setzero_pd().as_f64x4(),
    ))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmaddsub_pd&expand=2597)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
pub unsafe fn _mm256_mask3_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d {
    let fmaddsub = _mm256_fmaddsub_pd(a, b, c).as_f64x4();
    // Lanes with a clear mask bit keep the corresponding lane of `c`.
    transmute(simd_select_bitmask(k, fmaddsub, c.as_f64x4()))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, alternately adds and subtracts the packed elements of `c` to/from the products, and writes the results to `dst` under writemask `k` (a lane is copied from `a` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmaddsub_pd&expand=2592)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
pub unsafe fn _mm_mask_fmaddsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
    // Lanes with a clear mask bit fall back to the corresponding lane of `a`.
    let passthrough = a.as_f64x2();
    let computed = _mm_fmaddsub_pd(a, b, c).as_f64x2();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, alternately adds and subtracts the packed elements of `c` to/from the products, and writes the results to `dst` under zeromask `k` (a lane is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmaddsub_pd&expand=2594)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
pub unsafe fn _mm_maskz_fmaddsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    let computed = _mm_fmaddsub_pd(a, b, c).as_f64x2();
    // Lanes with a clear mask bit are taken from an all-zero vector.
    transmute(simd_select_bitmask(
        k,
        computed,
        _mm_setzero_pd().as_f64x2(),
    ))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmaddsub_pd&expand=2593)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
pub unsafe fn _mm_mask3_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
    let fmaddsub = _mm_fmaddsub_pd(a, b, c).as_f64x2();
    // Lanes with a clear mask bit keep the corresponding lane of `c`.
    transmute(simd_select_bitmask(k, fmaddsub, c.as_f64x2()))
}

/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_ps&expand=2691)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
pub unsafe fn _mm512_fmsubadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 {
    let zero: f32x16 = mem::zeroed();
    let sub = simd_sub(zero, c.as_f32x16());
    transmute(vfmaddsub213ps(
        a.as_f32x16(),
        b.as_f32x16(),
        sub,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_ps&expand=2692)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
pub unsafe fn _mm512_mask_fmsubadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 {
    let fmsubadd = _mm512_fmsubadd_ps(a, b, c).as_f32x16();
    transmute(simd_select_bitmask(k, fmsubadd, a.as_f32x16()))
}

/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_ps&expand=2694)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
pub unsafe fn _mm512_maskz_fmsubadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 {
    let fmsubadd = _mm512_fmsubadd_ps(a, b, c).as_f32x16();
    let zero = _mm512_setzero_ps().as_f32x16();
    transmute(simd_select_bitmask(k, fmsubadd, zero))
}

/// Multiply packed single-precision (32-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_ps&expand=2693)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
pub unsafe fn _mm512_mask3_fmsubadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 {
    let fmsubadd = _mm512_fmsubadd_ps(a, b, c).as_f32x16();
    transmute(simd_select_bitmask(k, fmsubadd, c.as_f32x16()))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, alternately subtracts and adds the packed elements of `c` from/to the products, and writes the results to `dst` under writemask `k` (a lane is copied from `a` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsubadd_ps&expand=2688)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
pub unsafe fn _mm256_mask_fmsubadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 {
    // Lanes with a clear mask bit fall back to the corresponding lane of `a`.
    let passthrough = a.as_f32x8();
    let computed = _mm256_fmsubadd_ps(a, b, c).as_f32x8();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, alternately subtracts and adds the packed elements of `c` from/to the products, and writes the results to `dst` under zeromask `k` (a lane is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsubadd_ps&expand=2690)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
pub unsafe fn _mm256_maskz_fmsubadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 {
    let computed = _mm256_fmsubadd_ps(a, b, c).as_f32x8();
    // Lanes with a clear mask bit are taken from an all-zero vector.
    transmute(simd_select_bitmask(
        k,
        computed,
        _mm256_setzero_ps().as_f32x8(),
    ))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, alternately subtracts and adds the packed elements of `c` from/to the products, and writes the results to `dst` under writemask `k` (a lane is copied from `c` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsubadd_ps&expand=2689)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
pub unsafe fn _mm256_mask3_fmsubadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 {
    // Lanes with a clear mask bit fall back to the corresponding lane of `c`.
    let passthrough = c.as_f32x8();
    let computed = _mm256_fmsubadd_ps(a, b, c).as_f32x8();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, alternately subtracts and adds the packed elements of `c` from/to the products, and writes the results to `dst` under writemask `k` (a lane is copied from `a` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsubadd_ps&expand=2684)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
pub unsafe fn _mm_mask_fmsubadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
    // Lanes with a clear mask bit fall back to the corresponding lane of `a`.
    let passthrough = a.as_f32x4();
    let computed = _mm_fmsubadd_ps(a, b, c).as_f32x4();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, alternately subtracts and adds the packed elements of `c` from/to the products, and writes the results to `dst` under zeromask `k` (a lane is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsubadd_ps&expand=2686)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
pub unsafe fn _mm_maskz_fmsubadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
    let computed = _mm_fmsubadd_ps(a, b, c).as_f32x4();
    // Lanes with a clear mask bit are taken from an all-zero vector.
    transmute(simd_select_bitmask(
        k,
        computed,
        _mm_setzero_ps().as_f32x4(),
    ))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, alternately subtracts and adds the packed elements of `c` from/to the products, and writes the results to `dst` under writemask `k` (a lane is copied from `c` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsubadd_ps&expand=2685)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132ps or vfmsubadd213ps or vfmsubadd231ps
pub unsafe fn _mm_mask3_fmsubadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
    // Lanes with a clear mask bit fall back to the corresponding lane of `c`.
    let passthrough = c.as_f32x4();
    let computed = _mm_fmsubadd_ps(a, b, c).as_f32x4();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_pd&expand=2679)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
pub unsafe fn _mm512_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d {
    let zero: f64x8 = mem::zeroed();
    let sub = simd_sub(zero, c.as_f64x8());
    transmute(vfmaddsub213pd(
        a.as_f64x8(),
        b.as_f64x8(),
        sub,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_pd&expand=2680)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
pub unsafe fn _mm512_mask_fmsubadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d {
    let fmsubadd = _mm512_fmsubadd_pd(a, b, c).as_f64x8();
    transmute(simd_select_bitmask(k, fmsubadd, a.as_f64x8()))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively add and subtract packed elements in c to/from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_pd&expand=2682)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
pub unsafe fn _mm512_maskz_fmsubadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d {
    let fmsubadd = _mm512_fmsubadd_pd(a, b, c).as_f64x8();
    let zero = _mm512_setzero_pd().as_f64x8();
    transmute(simd_select_bitmask(k, fmsubadd, zero))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_pd&expand=2681)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
pub unsafe fn _mm512_mask3_fmsubadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d {
    let fmsubadd = _mm512_fmsubadd_pd(a, b, c).as_f64x8();
    transmute(simd_select_bitmask(k, fmsubadd, c.as_f64x8()))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, alternately subtracts and adds the packed elements of `c` from/to the products, and writes the results to `dst` under writemask `k` (a lane is copied from `a` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fmsubadd_pd&expand=2676)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
pub unsafe fn _mm256_mask_fmsubadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d {
    // Lanes with a clear mask bit fall back to the corresponding lane of `a`.
    let passthrough = a.as_f64x4();
    let computed = _mm256_fmsubadd_pd(a, b, c).as_f64x4();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fmsubadd_pd&expand=2678)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
pub unsafe fn _mm256_maskz_fmsubadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    let fmsubadd = _mm256_fmsubadd_pd(a, b, c).as_f64x4();
    // Lanes with a clear mask bit are replaced by 0.0.
    let zero = _mm256_setzero_pd().as_f64x4();
    transmute(simd_select_bitmask(k, fmsubadd, zero))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, alternately subtracts and adds the packed elements of `c` from/to the products, and writes the results to `dst` under writemask `k` (a lane is copied from `c` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fmsubadd_pd&expand=2677)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
pub unsafe fn _mm256_mask3_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d {
    // Lanes with a clear mask bit fall back to the corresponding lane of `c`.
    let passthrough = c.as_f64x4();
    let computed = _mm256_fmsubadd_pd(a, b, c).as_f64x4();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, alternately subtracts and adds the packed elements of `c` from/to the products, and writes the results to `dst` under writemask `k` (a lane is copied from `a` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fmsubadd_pd&expand=2672)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
pub unsafe fn _mm_mask_fmsubadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
    // Lanes with a clear mask bit fall back to the corresponding lane of `a`.
    let passthrough = a.as_f64x2();
    let computed = _mm_fmsubadd_pd(a, b, c).as_f64x2();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, alternatively subtract and add packed elements in c from/to the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fmsubadd_pd&expand=2674)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
pub unsafe fn _mm_maskz_fmsubadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    let fmsubadd = _mm_fmsubadd_pd(a, b, c).as_f64x2();
    // Lanes with a clear mask bit are replaced by 0.0.
    let zero = _mm_setzero_pd().as_f64x2();
    transmute(simd_select_bitmask(k, fmsubadd, zero))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, alternately subtracts and adds the packed elements of `c` from/to the products, and writes the results to `dst` under writemask `k` (a lane is copied from `c` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fmsubadd_pd&expand=2673)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfmsubadd))] //vfmsubadd132pd or vfmsubadd213pd or vfmsubadd231pd
pub unsafe fn _mm_mask3_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
    // Lanes with a clear mask bit fall back to the corresponding lane of `c`.
    let passthrough = c.as_f64x2();
    let computed = _mm_fmsubadd_pd(a, b, c).as_f64x2();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_ps&expand=2723)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
pub unsafe fn _mm512_fnmadd_ps(a: __m512, b: __m512, c: __m512) -> __m512 {
    let zero: f32x16 = mem::zeroed();
    let sub = simd_sub(zero, a.as_f32x16());
    transmute(vfmadd132ps(sub, b.as_f32x16(), c.as_f32x16()))
}

/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_ps&expand=2724)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
pub unsafe fn _mm512_mask_fnmadd_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 {
    let fnmadd = _mm512_fnmadd_ps(a, b, c).as_f32x16();
    transmute(simd_select_bitmask(k, fnmadd, a.as_f32x16()))
}

/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_ps&expand=2726)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
pub unsafe fn _mm512_maskz_fnmadd_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 {
    let fnmadd = _mm512_fnmadd_ps(a, b, c).as_f32x16();
    let zero = _mm512_setzero_ps().as_f32x16();
    transmute(simd_select_bitmask(k, fnmadd, zero))
}

/// Multiply packed single-precision (32-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_ps&expand=2725)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
pub unsafe fn _mm512_mask3_fnmadd_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 {
    let fnmadd = _mm512_fnmadd_ps(a, b, c).as_f32x16();
    transmute(simd_select_bitmask(k, fnmadd, c.as_f32x16()))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, adds the negated products to the packed elements of `c`, and writes the results to `dst` under writemask `k` (a lane is copied from `a` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmadd_ps&expand=2720)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
pub unsafe fn _mm256_mask_fnmadd_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 {
    // Lanes with a clear mask bit fall back to the corresponding lane of `a`.
    let passthrough = a.as_f32x8();
    let computed = _mm256_fnmadd_ps(a, b, c).as_f32x8();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, adds the negated products to the packed elements of `c`, and writes the results to `dst` under zeromask `k` (a lane is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmadd_ps&expand=2722)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
pub unsafe fn _mm256_maskz_fnmadd_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 {
    let computed = _mm256_fnmadd_ps(a, b, c).as_f32x8();
    // Lanes with a clear mask bit are taken from an all-zero vector.
    transmute(simd_select_bitmask(
        k,
        computed,
        _mm256_setzero_ps().as_f32x8(),
    ))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, adds the negated products to the packed elements of `c`, and writes the results to `dst` under writemask `k` (a lane is copied from `c` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmadd_ps&expand=2721)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
pub unsafe fn _mm256_mask3_fnmadd_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 {
    // Lanes with a clear mask bit fall back to the corresponding lane of `c`.
    let passthrough = c.as_f32x8();
    let computed = _mm256_fnmadd_ps(a, b, c).as_f32x8();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, adds the negated products to the packed elements of `c`, and writes the results to `dst` under writemask `k` (a lane is copied from `a` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_ps&expand=2716)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
pub unsafe fn _mm_mask_fnmadd_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
    // Lanes with a clear mask bit fall back to the corresponding lane of `a`.
    let passthrough = a.as_f32x4();
    let computed = _mm_fnmadd_ps(a, b, c).as_f32x4();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, adds the negated products to the packed elements of `c`, and writes the results to `dst` under zeromask `k` (a lane is zeroed when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_ps&expand=2718)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
pub unsafe fn _mm_maskz_fnmadd_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
    let computed = _mm_fnmadd_ps(a, b, c).as_f32x4();
    // Lanes with a clear mask bit are taken from an all-zero vector.
    transmute(simd_select_bitmask(
        k,
        computed,
        _mm_setzero_ps().as_f32x4(),
    ))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, adds the negated products to the packed elements of `c`, and writes the results to `dst` under writemask `k` (a lane is copied from `c` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_ps&expand=2717)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))] //vfnmadd132ps or vfnmadd213ps or vfnmadd231ps
pub unsafe fn _mm_mask3_fnmadd_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
    // Lanes with a clear mask bit fall back to the corresponding lane of `c`.
    let passthrough = c.as_f32x4();
    let computed = _mm_fnmadd_ps(a, b, c).as_f32x4();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, adds the negated products to the packed elements of `c`, and writes the results to `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_pd&expand=2711)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
pub unsafe fn _mm512_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d {
    // all-0 is a properly initialized f64x8
    let zeros: f64x8 = mem::zeroed();
    // The product is negated by negating `a` up front, as `0.0 - a`.
    // NOTE(review): `0.0 - a` yields +0.0 (not -0.0) for a == +0.0, so this is
    // not an exact negation for signed zeros; `-0.0 - a` would be. Changing it
    // likely alters the emitted instruction, so the assert_instr expectations
    // here and on the masked wrappers would need re-checking — TODO confirm.
    let negated_a = simd_sub(zeros, a.as_f64x8());
    let (rhs, addend) = (b.as_f64x8(), c.as_f64x8());
    transmute(vfmadd132pd(negated_a, rhs, addend))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_pd&expand=2712)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
pub unsafe fn _mm512_mask_fnmadd_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d {
    let fnmadd = _mm512_fnmadd_pd(a, b, c).as_f64x8();
    transmute(simd_select_bitmask(k, fnmadd, a.as_f64x8()))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_pd&expand=2714)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
pub unsafe fn _mm512_maskz_fnmadd_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d {
    let fnmadd = _mm512_fnmadd_pd(a, b, c).as_f64x8();
    let zero = _mm512_setzero_pd().as_f64x8();
    transmute(simd_select_bitmask(k, fnmadd, zero))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, add the negated intermediate result to packed elements in c, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_pd&expand=2713)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd))] //vfnmadd132pd or vfnmadd213pd or vfnmadd231pd
pub unsafe fn _mm512_mask3_fnmadd_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d {
    let fnmadd = _mm512_fnmadd_pd(a, b, c).as_f64x8();
    transmute(simd_select_bitmask(k, fnmadd, c.as_f64x8()))
}

/// Computes `-(a * b) + c` on each packed double-precision (64-bit) floating-point lane and
/// returns the result under writemask `k`: a lane whose mask bit is not set is copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmadd_pd&expand=2708)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))] // vfnmadd132pd, vfnmadd213pd or vfnmadd231pd
pub unsafe fn _mm256_mask_fnmadd_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d {
    // `a` is both a multiplicand and the source for masked-off lanes.
    let fallback = a.as_f64x4();
    let computed = _mm256_fnmadd_pd(a, b, c).as_f64x4();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Computes `-(a * b) + c` on each packed double-precision (64-bit) floating-point lane and
/// returns the result under zeromask `k`: a lane whose mask bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmadd_pd&expand=2710)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))] // vfnmadd132pd, vfnmadd213pd or vfnmadd231pd
pub unsafe fn _mm256_maskz_fnmadd_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    let computed = _mm256_fnmadd_pd(a, b, c).as_f64x4();
    // Lanes whose mask bit is clear are taken from the all-zero vector.
    let blended = simd_select_bitmask(k, computed, _mm256_setzero_pd().as_f64x4());
    transmute(blended)
}

/// Computes `-(a * b) + c` on each packed double-precision (64-bit) floating-point lane and
/// returns the result under writemask `k`: a lane whose mask bit is not set is copied from `c`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmadd_pd&expand=2709)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))] // vfnmadd132pd, vfnmadd213pd or vfnmadd231pd
pub unsafe fn _mm256_mask3_fnmadd_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d {
    // `c` is both the addend and the source for masked-off lanes.
    let fallback = c.as_f64x4();
    let computed = _mm256_fnmadd_pd(a, b, c).as_f64x4();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Computes `-(a * b) + c` on each packed double-precision (64-bit) floating-point lane and
/// returns the result under writemask `k`: a lane whose mask bit is not set is copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmadd_pd&expand=2704)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))] // vfnmadd132pd, vfnmadd213pd or vfnmadd231pd
pub unsafe fn _mm_mask_fnmadd_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
    // `a` is both a multiplicand and the source for masked-off lanes.
    let fallback = a.as_f64x2();
    let computed = _mm_fnmadd_pd(a, b, c).as_f64x2();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Computes `-(a * b) + c` on each packed double-precision (64-bit) floating-point lane and
/// returns the result under zeromask `k`: a lane whose mask bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmadd_pd&expand=2706)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))] // vfnmadd132pd, vfnmadd213pd or vfnmadd231pd
pub unsafe fn _mm_maskz_fnmadd_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    let computed = _mm_fnmadd_pd(a, b, c).as_f64x2();
    // Lanes whose mask bit is clear are taken from the all-zero vector.
    let blended = simd_select_bitmask(k, computed, _mm_setzero_pd().as_f64x2());
    transmute(blended)
}

/// Computes `-(a * b) + c` on each packed double-precision (64-bit) floating-point lane and
/// returns the result under writemask `k`: a lane whose mask bit is not set is copied from `c`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmadd_pd&expand=2705)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmadd))] // vfnmadd132pd, vfnmadd213pd or vfnmadd231pd
pub unsafe fn _mm_mask3_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
    // `c` is both the addend and the source for masked-off lanes.
    let fallback = c.as_f64x2();
    let computed = _mm_fnmadd_pd(a, b, c).as_f64x2();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_ps&expand=2771)
#[inline]
#[target_feature(enable = "avx512f")]
// NOTE(review): with true negations the backend is expected to fold the sign flips into the
// fused instruction, matching the 128/256-bit variants — confirm with the assert_instr run.
#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
pub unsafe fn _mm512_fnmsub_ps(a: __m512, b: __m512, c: __m512) -> __m512 {
    // Negate `a` and `c` by subtracting them from -0.0, not +0.0: `+0.0 - (+0.0)` is `+0.0`,
    // which would leave positive-zero inputs un-negated and give the wrong sign for zero
    // results (e.g. a = +0.0, b > 0, c = +0.0 must yield -0.0). `-0.0 - x` flips zeros too.
    let neg_zero: f32x16 = transmute([-0.0f32; 16]);
    let neg_a = simd_sub(neg_zero, a.as_f32x16());
    let neg_c = simd_sub(neg_zero, c.as_f32x16());
    transmute(vfmadd132ps(neg_a, b.as_f32x16(), neg_c))
}

/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_ps&expand=2772)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
pub unsafe fn _mm512_mask_fnmsub_ps(a: __m512, k: __mmask16, b: __m512, c: __m512) -> __m512 {
    let fnmsub = _mm512_fnmsub_ps(a, b, c).as_f32x16();
    // Lanes whose mask bit is clear keep the value from `a`.
    transmute(simd_select_bitmask(k, fnmsub, a.as_f32x16()))
}

/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_ps&expand=2774)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
pub unsafe fn _mm512_maskz_fnmsub_ps(k: __mmask16, a: __m512, b: __m512, c: __m512) -> __m512 {
    let fnmsub = _mm512_fnmsub_ps(a, b, c).as_f32x16();
    // Lanes whose mask bit is clear are taken from the all-zero vector.
    let zero = _mm512_setzero_ps().as_f32x16();
    transmute(simd_select_bitmask(k, fnmsub, zero))
}

/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_ps&expand=2773)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132ps or vfnmsub213ps or vfnmsub231ps
pub unsafe fn _mm512_mask3_fnmsub_ps(a: __m512, b: __m512, c: __m512, k: __mmask16) -> __m512 {
    let fnmsub = _mm512_fnmsub_ps(a, b, c).as_f32x16();
    // Lanes whose mask bit is clear keep the value from `c`.
    transmute(simd_select_bitmask(k, fnmsub, c.as_f32x16()))
}

/// Computes `-(a * b) - c` on each packed single-precision (32-bit) floating-point lane and
/// returns the result under writemask `k`: a lane whose mask bit is not set is copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmsub_ps&expand=2768)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))] // vfnmsub132ps, vfnmsub213ps or vfnmsub231ps
pub unsafe fn _mm256_mask_fnmsub_ps(a: __m256, k: __mmask8, b: __m256, c: __m256) -> __m256 {
    // `a` is both a multiplicand and the source for masked-off lanes.
    let fallback = a.as_f32x8();
    let computed = _mm256_fnmsub_ps(a, b, c).as_f32x8();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Computes `-(a * b) - c` on each packed single-precision (32-bit) floating-point lane and
/// returns the result under zeromask `k`: a lane whose mask bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmsub_ps&expand=2770)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))] // vfnmsub132ps, vfnmsub213ps or vfnmsub231ps
pub unsafe fn _mm256_maskz_fnmsub_ps(k: __mmask8, a: __m256, b: __m256, c: __m256) -> __m256 {
    let computed = _mm256_fnmsub_ps(a, b, c).as_f32x8();
    // Lanes whose mask bit is clear are taken from the all-zero vector.
    let blended = simd_select_bitmask(k, computed, _mm256_setzero_ps().as_f32x8());
    transmute(blended)
}

/// Computes `-(a * b) - c` on each packed single-precision (32-bit) floating-point lane and
/// returns the result under writemask `k`: a lane whose mask bit is not set is copied from `c`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmsub_ps&expand=2769)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))] // vfnmsub132ps, vfnmsub213ps or vfnmsub231ps
pub unsafe fn _mm256_mask3_fnmsub_ps(a: __m256, b: __m256, c: __m256, k: __mmask8) -> __m256 {
    // `c` is both the subtrahend and the source for masked-off lanes.
    let fallback = c.as_f32x8();
    let computed = _mm256_fnmsub_ps(a, b, c).as_f32x8();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Computes `-(a * b) - c` on each packed single-precision (32-bit) floating-point lane and
/// returns the result under writemask `k`: a lane whose mask bit is not set is copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_ps&expand=2764)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))] // vfnmsub132ps, vfnmsub213ps or vfnmsub231ps
pub unsafe fn _mm_mask_fnmsub_ps(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
    // `a` is both a multiplicand and the source for masked-off lanes.
    let fallback = a.as_f32x4();
    let computed = _mm_fnmsub_ps(a, b, c).as_f32x4();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Computes `-(a * b) - c` on each packed single-precision (32-bit) floating-point lane and
/// returns the result under zeromask `k`: a lane whose mask bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_ps&expand=2766)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))] // vfnmsub132ps, vfnmsub213ps or vfnmsub231ps
pub unsafe fn _mm_maskz_fnmsub_ps(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
    let computed = _mm_fnmsub_ps(a, b, c).as_f32x4();
    // Lanes whose mask bit is clear are taken from the all-zero vector.
    let blended = simd_select_bitmask(k, computed, _mm_setzero_ps().as_f32x4());
    transmute(blended)
}

/// Computes `-(a * b) - c` on each packed single-precision (32-bit) floating-point lane and
/// returns the result under writemask `k`: a lane whose mask bit is not set is copied from `c`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_ps&expand=2765)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))] // vfnmsub132ps, vfnmsub213ps or vfnmsub231ps
pub unsafe fn _mm_mask3_fnmsub_ps(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
    // `c` is both the subtrahend and the source for masked-off lanes.
    let fallback = c.as_f32x4();
    let computed = _mm_fnmsub_ps(a, b, c).as_f32x4();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_pd&expand=2759)
#[inline]
#[target_feature(enable = "avx512f")]
// NOTE(review): with true negations the backend is expected to fold the sign flips into the
// fused instruction, matching the 128/256-bit variants — confirm with the assert_instr run.
#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
pub unsafe fn _mm512_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d) -> __m512d {
    // Negate `a` and `c` by subtracting them from -0.0, not +0.0: `+0.0 - (+0.0)` is `+0.0`,
    // which would leave positive-zero inputs un-negated and give the wrong sign for zero
    // results (e.g. a = +0.0, b > 0, c = +0.0 must yield -0.0). `-0.0 - x` flips zeros too.
    let neg_zero: f64x8 = transmute([-0.0f64; 8]);
    let neg_a = simd_sub(neg_zero, a.as_f64x8());
    let neg_c = simd_sub(neg_zero, c.as_f64x8());
    transmute(vfmadd132pd(neg_a, b.as_f64x8(), neg_c))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_pd&expand=2760)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
pub unsafe fn _mm512_mask_fnmsub_pd(a: __m512d, k: __mmask8, b: __m512d, c: __m512d) -> __m512d {
    let fnmsub = _mm512_fnmsub_pd(a, b, c).as_f64x8();
    // Lanes whose mask bit is clear keep the value from `a`.
    transmute(simd_select_bitmask(k, fnmsub, a.as_f64x8()))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_pd&expand=2762)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
pub unsafe fn _mm512_maskz_fnmsub_pd(k: __mmask8, a: __m512d, b: __m512d, c: __m512d) -> __m512d {
    let fnmsub = _mm512_fnmsub_pd(a, b, c).as_f64x8();
    // Lanes whose mask bit is clear are taken from the all-zero vector.
    let zero = _mm512_setzero_pd().as_f64x8();
    transmute(simd_select_bitmask(k, fnmsub, zero))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_pd&expand=2761)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmsub))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
pub unsafe fn _mm512_mask3_fnmsub_pd(a: __m512d, b: __m512d, c: __m512d, k: __mmask8) -> __m512d {
    let fnmsub = _mm512_fnmsub_pd(a, b, c).as_f64x8();
    // Lanes whose mask bit is clear keep the value from `c`.
    transmute(simd_select_bitmask(k, fnmsub, c.as_f64x8()))
}

/// Computes `-(a * b) - c` on each packed double-precision (64-bit) floating-point lane and
/// returns the result under writemask `k`: a lane whose mask bit is not set is copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fnmsub_pd&expand=2756)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))] // vfnmsub132pd, vfnmsub213pd or vfnmsub231pd
pub unsafe fn _mm256_mask_fnmsub_pd(a: __m256d, k: __mmask8, b: __m256d, c: __m256d) -> __m256d {
    // `a` is both a multiplicand and the source for masked-off lanes.
    let fallback = a.as_f64x4();
    let computed = _mm256_fnmsub_pd(a, b, c).as_f64x4();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Computes `-(a * b) - c` on each packed double-precision (64-bit) floating-point lane and
/// returns the result under zeromask `k`: a lane whose mask bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fnmsub_pd&expand=2758)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))] // vfnmsub132pd, vfnmsub213pd or vfnmsub231pd
pub unsafe fn _mm256_maskz_fnmsub_pd(k: __mmask8, a: __m256d, b: __m256d, c: __m256d) -> __m256d {
    let computed = _mm256_fnmsub_pd(a, b, c).as_f64x4();
    // Lanes whose mask bit is clear are taken from the all-zero vector.
    let blended = simd_select_bitmask(k, computed, _mm256_setzero_pd().as_f64x4());
    transmute(blended)
}

/// Computes `-(a * b) - c` on each packed double-precision (64-bit) floating-point lane and
/// returns the result under writemask `k`: a lane whose mask bit is not set is copied from `c`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask3_fnmsub_pd&expand=2757)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))] // vfnmsub132pd, vfnmsub213pd or vfnmsub231pd
pub unsafe fn _mm256_mask3_fnmsub_pd(a: __m256d, b: __m256d, c: __m256d, k: __mmask8) -> __m256d {
    // `c` is both the subtrahend and the source for masked-off lanes.
    let fallback = c.as_f64x4();
    let computed = _mm256_fnmsub_pd(a, b, c).as_f64x4();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Computes `-(a * b) - c` on each packed double-precision (64-bit) floating-point lane and
/// returns the result under writemask `k`: a lane whose mask bit is not set is copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fnmsub_pd&expand=2752)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))] // vfnmsub132pd, vfnmsub213pd or vfnmsub231pd
pub unsafe fn _mm_mask_fnmsub_pd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
    // `a` is both a multiplicand and the source for masked-off lanes.
    let fallback = a.as_f64x2();
    let computed = _mm_fnmsub_pd(a, b, c).as_f64x2();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Computes `-(a * b) - c` on each packed double-precision (64-bit) floating-point lane and
/// returns the result under zeromask `k`: a lane whose mask bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fnmsub_pd&expand=2754)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))] // vfnmsub132pd, vfnmsub213pd or vfnmsub231pd
pub unsafe fn _mm_maskz_fnmsub_pd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    let computed = _mm_fnmsub_pd(a, b, c).as_f64x2();
    // Lanes whose mask bit is clear are taken from the all-zero vector.
    let blended = simd_select_bitmask(k, computed, _mm_setzero_pd().as_f64x2());
    transmute(blended)
}

/// Computes `-(a * b) - c` on each packed double-precision (64-bit) floating-point lane and
/// returns the result under writemask `k`: a lane whose mask bit is not set is copied from `c`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask3_fnmsub_pd&expand=2753)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfnmsub))] // vfnmsub132pd, vfnmsub213pd or vfnmsub231pd
pub unsafe fn _mm_mask3_fnmsub_pd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
    // `c` is both the subtrahend and the source for masked-off lanes.
    let fallback = c.as_f64x2();
    let computed = _mm_fnmsub_pd(a, b, c).as_f64x2();
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Returns the approximate reciprocal of every packed single-precision (32-bit) floating-point
/// element of `a`. The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp14_ps&expand=4502)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrcp14ps))]
pub unsafe fn _mm512_rcp14_ps(a: __m512) -> __m512 {
    // Unmasked form: a zero source vector together with a mask that has all 16 bits set.
    let zeroed = _mm512_setzero_ps().as_f32x16();
    let all_lanes = 0b11111111_11111111;
    transmute(vrcp14ps(a.as_f32x16(), zeroed, all_lanes))
}

/// Returns the approximate reciprocal of the packed single-precision (32-bit) floating-point
/// elements of `a` under writemask `k`: a lane whose mask bit is not set is copied from `src`.
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rcp14_ps&expand=4500)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrcp14ps))]
pub unsafe fn _mm512_mask_rcp14_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
    let input = a.as_f32x16();
    let passthru = src.as_f32x16();
    transmute(vrcp14ps(input, passthru, k))
}

/// Returns the approximate reciprocal of the packed single-precision (32-bit) floating-point
/// elements of `a` under zeromask `k`: a lane whose mask bit is not set is zeroed out.
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rcp14_ps&expand=4501)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrcp14ps))]
pub unsafe fn _mm512_maskz_rcp14_ps(k: __mmask16, a: __m512) -> __m512 {
    // The zero vector is passed as the source operand alongside the caller's mask.
    let zeroed = _mm512_setzero_ps().as_f32x16();
    let approx = vrcp14ps(a.as_f32x16(), zeroed, k);
    transmute(approx)
}

/// Returns the approximate reciprocal of every packed single-precision (32-bit) floating-point
/// element of `a`. The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rcp14_ps&expand=4499)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrcp14ps))]
pub unsafe fn _mm256_rcp14_ps(a: __m256) -> __m256 {
    // Unmasked form: a zero source vector together with a mask that has all 8 bits set.
    let zeroed = _mm256_setzero_ps().as_f32x8();
    let all_lanes = 0b11111111;
    transmute(vrcp14ps256(a.as_f32x8(), zeroed, all_lanes))
}

/// Returns the approximate reciprocal of the packed single-precision (32-bit) floating-point
/// elements of `a` under writemask `k`: a lane whose mask bit is not set is copied from `src`.
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rcp14_ps&expand=4497)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrcp14ps))]
pub unsafe fn _mm256_mask_rcp14_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 {
    let input = a.as_f32x8();
    let passthru = src.as_f32x8();
    transmute(vrcp14ps256(input, passthru, k))
}

/// Returns the approximate reciprocal of the packed single-precision (32-bit) floating-point
/// elements of `a` under zeromask `k`: a lane whose mask bit is not set is zeroed out.
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rcp14_ps&expand=4498)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrcp14ps))]
pub unsafe fn _mm256_maskz_rcp14_ps(k: __mmask8, a: __m256) -> __m256 {
    // The zero vector is passed as the source operand alongside the caller's mask.
    let zeroed = _mm256_setzero_ps().as_f32x8();
    let approx = vrcp14ps256(a.as_f32x8(), zeroed, k);
    transmute(approx)
}

/// Returns the approximate reciprocal of every packed single-precision (32-bit) floating-point
/// element of `a`. The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_ps&expand=4496)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrcp14ps))]
pub unsafe fn _mm_rcp14_ps(a: __m128) -> __m128 {
    // Unmasked form: a zero source vector together with a mask whose low 4 bits are set,
    // one bit per `f32x4` lane.
    let zeroed = _mm_setzero_ps().as_f32x4();
    let all_lanes = 0b00001111;
    transmute(vrcp14ps128(a.as_f32x4(), zeroed, all_lanes))
}

/// Returns the approximate reciprocal of the packed single-precision (32-bit) floating-point
/// elements of `a` under writemask `k`: a lane whose mask bit is not set is copied from `src`.
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_ps&expand=4494)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrcp14ps))]
pub unsafe fn _mm_mask_rcp14_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 {
    let input = a.as_f32x4();
    let passthru = src.as_f32x4();
    transmute(vrcp14ps128(input, passthru, k))
}

/// Returns the approximate reciprocal of the packed single-precision (32-bit) floating-point
/// elements of `a` under zeromask `k`: a lane whose mask bit is not set is zeroed out.
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_ps&expand=4495)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrcp14ps))]
pub unsafe fn _mm_maskz_rcp14_ps(k: __mmask8, a: __m128) -> __m128 {
    // The zero vector is passed as the source operand alongside the caller's mask.
    let zeroed = _mm_setzero_ps().as_f32x4();
    let approx = vrcp14ps128(a.as_f32x4(), zeroed, k);
    transmute(approx)
}

/// Returns the approximate reciprocal of every packed double-precision (64-bit) floating-point
/// element of `a`. The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rcp14_pd&expand=4493)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrcp14pd))]
pub unsafe fn _mm512_rcp14_pd(a: __m512d) -> __m512d {
    // Unmasked form: a zero source vector together with a mask that has all 8 bits set.
    let zeroed = _mm512_setzero_pd().as_f64x8();
    let all_lanes = 0b11111111;
    transmute(vrcp14pd(a.as_f64x8(), zeroed, all_lanes))
}

/// Returns the approximate reciprocal of the packed double-precision (64-bit) floating-point
/// elements of `a` under writemask `k`: a lane whose mask bit is not set is copied from `src`.
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rcp14_pd&expand=4491)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrcp14pd))]
pub unsafe fn _mm512_mask_rcp14_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
    let input = a.as_f64x8();
    let passthru = src.as_f64x8();
    transmute(vrcp14pd(input, passthru, k))
}

/// Returns the approximate reciprocal of the packed double-precision (64-bit) floating-point
/// elements of `a` under zeromask `k`: a lane whose mask bit is not set is zeroed out.
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rcp14_pd&expand=4492)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrcp14pd))]
pub unsafe fn _mm512_maskz_rcp14_pd(k: __mmask8, a: __m512d) -> __m512d {
    // The zero vector is passed as the source operand alongside the caller's mask.
    let zeroed = _mm512_setzero_pd().as_f64x8();
    let approx = vrcp14pd(a.as_f64x8(), zeroed, k);
    transmute(approx)
}

/// Returns the approximate reciprocal of every packed double-precision (64-bit) floating-point
/// element of `a`. The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rcp14_pd&expand=4490)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrcp14pd))]
pub unsafe fn _mm256_rcp14_pd(a: __m256d) -> __m256d {
    // Unmasked form: a zero source vector together with a mask whose low 4 bits are set,
    // one bit per `f64x4` lane.
    let zeroed = _mm256_setzero_pd().as_f64x4();
    let all_lanes = 0b00001111;
    transmute(vrcp14pd256(a.as_f64x4(), zeroed, all_lanes))
}

/// Returns the approximate reciprocal of the packed double-precision (64-bit) floating-point
/// elements of `a` under writemask `k`: a lane whose mask bit is not set is copied from `src`.
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rcp14_pd&expand=4488)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrcp14pd))]
pub unsafe fn _mm256_mask_rcp14_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d {
    let input = a.as_f64x4();
    let passthru = src.as_f64x4();
    transmute(vrcp14pd256(input, passthru, k))
}

/// Returns the approximate reciprocal of the packed double-precision (64-bit) floating-point
/// elements of `a` under zeromask `k`: a lane whose mask bit is not set is zeroed out.
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rcp14_pd&expand=4489)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrcp14pd))]
pub unsafe fn _mm256_maskz_rcp14_pd(k: __mmask8, a: __m256d) -> __m256d {
    // The zero vector is passed as the source operand alongside the caller's mask.
    let zeroed = _mm256_setzero_pd().as_f64x4();
    let approx = vrcp14pd256(a.as_f64x4(), zeroed, k);
    transmute(approx)
}

/// Approximate the reciprocal of each packed double-precision (64-bit) floating-point element in `a`.
///
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp14_pd&expand=4487)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrcp14pd))]
pub unsafe fn _mm_rcp14_pd(a: __m128d) -> __m128d {
    let input = a.as_f64x2();
    let passthrough = _mm_setzero_pd().as_f64x2();
    // One mask bit per f64 lane, both set.
    let every_lane = 0b00000011;
    let approx = vrcp14pd128(input, passthrough, every_lane);
    transmute(approx)
}

/// Approximate the reciprocal of each packed double-precision (64-bit) floating-point element in `a`, under writemask `k`.
///
/// Lanes whose bit in `k` is clear are copied from `src`.
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rcp14_pd&expand=4485)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrcp14pd))]
pub unsafe fn _mm_mask_rcp14_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d {
    let input = a.as_f64x2();
    let passthrough = src.as_f64x2();
    let approx = vrcp14pd128(input, passthrough, k);
    transmute(approx)
}

/// Approximate the reciprocal of each packed double-precision (64-bit) floating-point element in `a`, under zeromask `k`.
///
/// Lanes whose bit in `k` is clear are zeroed in the result.
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rcp14_pd&expand=4486)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrcp14pd))]
pub unsafe fn _mm_maskz_rcp14_pd(k: __mmask8, a: __m128d) -> __m128d {
    let input = a.as_f64x2();
    // Zero vector supplies the value of every masked-off lane.
    let zeroed = _mm_setzero_pd().as_f64x2();
    let approx = vrcp14pd128(input, zeroed, k);
    transmute(approx)
}

/// Approximate the reciprocal square root of each packed single-precision (32-bit) floating-point element in `a`.
///
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt14_ps&expand=4819)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrsqrt14ps))]
pub unsafe fn _mm512_rsqrt14_ps(a: __m512) -> __m512 {
    let input = a.as_f32x16();
    let passthrough = _mm512_setzero_ps().as_f32x16();
    // One mask bit per f32 lane, all sixteen set.
    let every_lane = 0b11111111_11111111;
    let approx = vrsqrt14ps(input, passthrough, every_lane);
    transmute(approx)
}

/// Approximate the reciprocal square root of each packed single-precision (32-bit) floating-point element in `a`, under writemask `k`.
///
/// Lanes whose bit in `k` is clear are copied from `src`.
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt14_ps&expand=4817)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrsqrt14ps))]
pub unsafe fn _mm512_mask_rsqrt14_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
    let input = a.as_f32x16();
    let passthrough = src.as_f32x16();
    let approx = vrsqrt14ps(input, passthrough, k);
    transmute(approx)
}

/// Approximate the reciprocal square root of each packed single-precision (32-bit) floating-point element in `a`, under zeromask `k`.
///
/// Lanes whose bit in `k` is clear are zeroed in the result.
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt14_ps&expand=4818)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrsqrt14ps))]
pub unsafe fn _mm512_maskz_rsqrt14_ps(k: __mmask16, a: __m512) -> __m512 {
    let input = a.as_f32x16();
    // Zero vector supplies the value of every masked-off lane.
    let zeroed = _mm512_setzero_ps().as_f32x16();
    let approx = vrsqrt14ps(input, zeroed, k);
    transmute(approx)
}

/// Approximate the reciprocal square root of each packed single-precision (32-bit) floating-point element in `a`, under writemask `k`.
///
/// Lanes whose bit in `k` is clear are copied from `src`.
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rsqrt14_ps&expand=4815)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrt14ps))]
pub unsafe fn _mm256_mask_rsqrt14_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 {
    let input = a.as_f32x8();
    let passthrough = src.as_f32x8();
    let approx = vrsqrt14ps256(input, passthrough, k);
    transmute(approx)
}

/// Approximate the reciprocal square root of each packed single-precision (32-bit) floating-point element in `a`, under zeromask `k`.
///
/// Lanes whose bit in `k` is clear are zeroed in the result.
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rsqrt14_ps&expand=4816)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrt14ps))]
pub unsafe fn _mm256_maskz_rsqrt14_ps(k: __mmask8, a: __m256) -> __m256 {
    let input = a.as_f32x8();
    // Zero vector supplies the value of every masked-off lane.
    let zeroed = _mm256_setzero_ps().as_f32x8();
    let approx = vrsqrt14ps256(input, zeroed, k);
    transmute(approx)
}

/// Approximate the reciprocal square root of each packed single-precision (32-bit) floating-point element in `a`, under writemask `k`.
///
/// Lanes whose bit in `k` is clear are copied from `src`.
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt14_ps&expand=4813)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrt14ps))]
pub unsafe fn _mm_mask_rsqrt14_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 {
    let input = a.as_f32x4();
    let passthrough = src.as_f32x4();
    let approx = vrsqrt14ps128(input, passthrough, k);
    transmute(approx)
}

/// Approximate the reciprocal square root of each packed single-precision (32-bit) floating-point element in `a`, under zeromask `k`.
///
/// Lanes whose bit in `k` is clear are zeroed in the result.
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt14_ps&expand=4814)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrt14ps))]
pub unsafe fn _mm_maskz_rsqrt14_ps(k: __mmask8, a: __m128) -> __m128 {
    let input = a.as_f32x4();
    // Zero vector supplies the value of every masked-off lane.
    let zeroed = _mm_setzero_ps().as_f32x4();
    let approx = vrsqrt14ps128(input, zeroed, k);
    transmute(approx)
}

/// Approximate the reciprocal square root of each packed double-precision (64-bit) floating-point element in `a`.
///
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rsqrt14_pd&expand=4812)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrsqrt14pd))]
pub unsafe fn _mm512_rsqrt14_pd(a: __m512d) -> __m512d {
    let input = a.as_f64x8();
    let passthrough = _mm512_setzero_pd().as_f64x8();
    // One mask bit per f64 lane, all eight set.
    let every_lane = 0b11111111;
    let approx = vrsqrt14pd(input, passthrough, every_lane);
    transmute(approx)
}

/// Approximate the reciprocal square root of each packed double-precision (64-bit) floating-point element in `a`, under writemask `k`.
///
/// Lanes whose bit in `k` is clear are copied from `src`.
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rsqrt14_pd&expand=4810)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrsqrt14pd))]
pub unsafe fn _mm512_mask_rsqrt14_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
    let input = a.as_f64x8();
    let passthrough = src.as_f64x8();
    let approx = vrsqrt14pd(input, passthrough, k);
    transmute(approx)
}

/// Approximate the reciprocal square root of each packed double-precision (64-bit) floating-point element in `a`, under zeromask `k`.
///
/// Lanes whose bit in `k` is clear are zeroed in the result.
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rsqrt14_pd&expand=4811)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrsqrt14pd))]
pub unsafe fn _mm512_maskz_rsqrt14_pd(k: __mmask8, a: __m512d) -> __m512d {
    let input = a.as_f64x8();
    // Zero vector supplies the value of every masked-off lane.
    let zeroed = _mm512_setzero_pd().as_f64x8();
    let approx = vrsqrt14pd(input, zeroed, k);
    transmute(approx)
}

/// Approximate the reciprocal square root of each packed double-precision (64-bit) floating-point element in `a`, under writemask `k`.
///
/// Lanes whose bit in `k` is clear are copied from `src`.
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rsqrt14_pd&expand=4808)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrt14pd))]
pub unsafe fn _mm256_mask_rsqrt14_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d {
    let input = a.as_f64x4();
    let passthrough = src.as_f64x4();
    let approx = vrsqrt14pd256(input, passthrough, k);
    transmute(approx)
}

/// Approximate the reciprocal square root of each packed double-precision (64-bit) floating-point element in `a`, under zeromask `k`.
///
/// Lanes whose bit in `k` is clear are zeroed in the result.
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rsqrt14_pd&expand=4809)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrt14pd))]
pub unsafe fn _mm256_maskz_rsqrt14_pd(k: __mmask8, a: __m256d) -> __m256d {
    let input = a.as_f64x4();
    // Zero vector supplies the value of every masked-off lane.
    let zeroed = _mm256_setzero_pd().as_f64x4();
    let approx = vrsqrt14pd256(input, zeroed, k);
    transmute(approx)
}

/// Approximate the reciprocal square root of each packed double-precision (64-bit) floating-point element in `a`, under writemask `k`.
///
/// Lanes whose bit in `k` is clear are copied from `src`.
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rsqrt14_pd&expand=4806)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrt14pd))]
pub unsafe fn _mm_mask_rsqrt14_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d {
    let input = a.as_f64x2();
    let passthrough = src.as_f64x2();
    let approx = vrsqrt14pd128(input, passthrough, k);
    transmute(approx)
}

/// Approximate the reciprocal square root of each packed double-precision (64-bit) floating-point element in `a`, under zeromask `k`.
///
/// Lanes whose bit in `k` is clear are zeroed in the result.
/// The maximum relative error of the approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rsqrt14_pd&expand=4807)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrsqrt14pd))]
pub unsafe fn _mm_maskz_rsqrt14_pd(k: __mmask8, a: __m128d) -> __m128d {
    let input = a.as_f64x2();
    // Zero vector supplies the value of every masked-off lane.
    let zeroed = _mm_setzero_pd().as_f64x2();
    let approx = vrsqrt14pd128(input, zeroed, k);
    transmute(approx)
}

/// Extract the exponent of each packed single-precision (32-bit) floating-point element in `a` as a single-precision (32-bit) floating-point number representing the integer exponent.
///
/// This essentially calculates floor(log2(x)) for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_ps&expand=2844)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetexpps))]
pub unsafe fn _mm512_getexp_ps(a: __m512) -> __m512 {
    let input = a.as_f32x16();
    let passthrough = _mm512_setzero_ps().as_f32x16();
    // One mask bit per f32 lane, all sixteen set.
    let every_lane = 0b11111111_11111111;
    let exponents = vgetexpps(input, passthrough, every_lane, _MM_FROUND_CUR_DIRECTION);
    transmute(exponents)
}

/// Extract the exponent of each packed single-precision (32-bit) floating-point element in `a` as a single-precision (32-bit) floating-point number representing the integer exponent, under writemask `k`.
///
/// Lanes whose bit in `k` is clear are copied from `src`.
/// This essentially calculates floor(log2(x)) for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_ps&expand=2845)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetexpps))]
pub unsafe fn _mm512_mask_getexp_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
    let input = a.as_f32x16();
    let passthrough = src.as_f32x16();
    let exponents = vgetexpps(input, passthrough, k, _MM_FROUND_CUR_DIRECTION);
    transmute(exponents)
}

/// Extract the exponent of each packed single-precision (32-bit) floating-point element in `a` as a single-precision (32-bit) floating-point number representing the integer exponent, under zeromask `k`.
///
/// Lanes whose bit in `k` is clear are zeroed in the result.
/// This essentially calculates floor(log2(x)) for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_ps&expand=2846)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetexpps))]
pub unsafe fn _mm512_maskz_getexp_ps(k: __mmask16, a: __m512) -> __m512 {
    let input = a.as_f32x16();
    // Zero vector supplies the value of every masked-off lane.
    let zeroed = _mm512_setzero_ps().as_f32x16();
    let exponents = vgetexpps(input, zeroed, k, _MM_FROUND_CUR_DIRECTION);
    transmute(exponents)
}

/// Extract the exponent of each packed single-precision (32-bit) floating-point element in `a` as a single-precision (32-bit) floating-point number representing the integer exponent.
///
/// This essentially calculates floor(log2(x)) for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getexp_ps&expand=2841)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpps))]
pub unsafe fn _mm256_getexp_ps(a: __m256) -> __m256 {
    let input = a.as_f32x8();
    let passthrough = _mm256_setzero_ps().as_f32x8();
    // One mask bit per f32 lane, all eight set.
    let every_lane = 0b11111111;
    let exponents = vgetexpps256(input, passthrough, every_lane);
    transmute(exponents)
}

/// Extract the exponent of each packed single-precision (32-bit) floating-point element in `a` as a single-precision (32-bit) floating-point number representing the integer exponent, under writemask `k`.
///
/// Lanes whose bit in `k` is clear are copied from `src`.
/// This essentially calculates floor(log2(x)) for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getexp_ps&expand=2842)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpps))]
pub unsafe fn _mm256_mask_getexp_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 {
    let input = a.as_f32x8();
    let passthrough = src.as_f32x8();
    let exponents = vgetexpps256(input, passthrough, k);
    transmute(exponents)
}

/// Extract the exponent of each packed single-precision (32-bit) floating-point element in `a` as a single-precision (32-bit) floating-point number representing the integer exponent, under zeromask `k`.
///
/// Lanes whose bit in `k` is clear are zeroed in the result.
/// This essentially calculates floor(log2(x)) for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getexp_ps&expand=2843)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpps))]
pub unsafe fn _mm256_maskz_getexp_ps(k: __mmask8, a: __m256) -> __m256 {
    let input = a.as_f32x8();
    // Zero vector supplies the value of every masked-off lane.
    let zeroed = _mm256_setzero_ps().as_f32x8();
    let exponents = vgetexpps256(input, zeroed, k);
    transmute(exponents)
}

/// Extract the exponent of each packed single-precision (32-bit) floating-point element in `a` as a single-precision (32-bit) floating-point number representing the integer exponent.
///
/// This essentially calculates floor(log2(x)) for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_ps&expand=2838)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpps))]
pub unsafe fn _mm_getexp_ps(a: __m128) -> __m128 {
    let input = a.as_f32x4();
    let passthrough = _mm_setzero_ps().as_f32x4();
    // One mask bit per f32 lane, all four set.
    let every_lane = 0b00001111;
    let exponents = vgetexpps128(input, passthrough, every_lane);
    transmute(exponents)
}

/// Extract the exponent of each packed single-precision (32-bit) floating-point element in `a` as a single-precision (32-bit) floating-point number representing the integer exponent, under writemask `k`.
///
/// Lanes whose bit in `k` is clear are copied from `src`.
/// This essentially calculates floor(log2(x)) for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_ps&expand=2839)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpps))]
pub unsafe fn _mm_mask_getexp_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 {
    let input = a.as_f32x4();
    let passthrough = src.as_f32x4();
    let exponents = vgetexpps128(input, passthrough, k);
    transmute(exponents)
}

/// Extract the exponent of each packed single-precision (32-bit) floating-point element in `a` as a single-precision (32-bit) floating-point number representing the integer exponent, under zeromask `k`.
///
/// Lanes whose bit in `k` is clear are zeroed in the result.
/// This essentially calculates floor(log2(x)) for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_ps&expand=2840)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexpps))]
pub unsafe fn _mm_maskz_getexp_ps(k: __mmask8, a: __m128) -> __m128 {
    let input = a.as_f32x4();
    // Zero vector supplies the value of every masked-off lane.
    let zeroed = _mm_setzero_ps().as_f32x4();
    let exponents = vgetexpps128(input, zeroed, k);
    transmute(exponents)
}

/// Extract the exponent of each packed double-precision (64-bit) floating-point element in `a` as a double-precision (64-bit) floating-point number representing the integer exponent.
///
/// This essentially calculates floor(log2(x)) for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_pd&expand=2835)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetexppd))]
pub unsafe fn _mm512_getexp_pd(a: __m512d) -> __m512d {
    let input = a.as_f64x8();
    let passthrough = _mm512_setzero_pd().as_f64x8();
    // One mask bit per f64 lane, all eight set.
    let every_lane = 0b11111111;
    let exponents = vgetexppd(input, passthrough, every_lane, _MM_FROUND_CUR_DIRECTION);
    transmute(exponents)
}

/// Extract the exponent of each packed double-precision (64-bit) floating-point element in `a` as a double-precision (64-bit) floating-point number representing the integer exponent, under writemask `k`.
///
/// Lanes whose bit in `k` is clear are copied from `src`.
/// This essentially calculates floor(log2(x)) for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_pd&expand=2836)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetexppd))]
pub unsafe fn _mm512_mask_getexp_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
    let input = a.as_f64x8();
    let passthrough = src.as_f64x8();
    let exponents = vgetexppd(input, passthrough, k, _MM_FROUND_CUR_DIRECTION);
    transmute(exponents)
}

/// Extract the exponent of each packed double-precision (64-bit) floating-point element in `a` as a double-precision (64-bit) floating-point number representing the integer exponent, under zeromask `k`.
///
/// Lanes whose bit in `k` is clear are zeroed in the result.
/// This essentially calculates floor(log2(x)) for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_pd&expand=2837)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetexppd))]
pub unsafe fn _mm512_maskz_getexp_pd(k: __mmask8, a: __m512d) -> __m512d {
    let input = a.as_f64x8();
    // Zero vector supplies the value of every masked-off lane.
    let zeroed = _mm512_setzero_pd().as_f64x8();
    let exponents = vgetexppd(input, zeroed, k, _MM_FROUND_CUR_DIRECTION);
    transmute(exponents)
}

/// Extract the exponent of each packed double-precision (64-bit) floating-point element in `a` as a double-precision (64-bit) floating-point number representing the integer exponent.
///
/// This essentially calculates floor(log2(x)) for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getexp_pd&expand=2832)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexppd))]
pub unsafe fn _mm256_getexp_pd(a: __m256d) -> __m256d {
    let input = a.as_f64x4();
    let passthrough = _mm256_setzero_pd().as_f64x4();
    // One mask bit per f64 lane, all four set.
    let every_lane = 0b00001111;
    let exponents = vgetexppd256(input, passthrough, every_lane);
    transmute(exponents)
}

/// Extract the exponent of each packed double-precision (64-bit) floating-point element in `a` as a double-precision (64-bit) floating-point number representing the integer exponent, under writemask `k`.
///
/// Lanes whose bit in `k` is clear are copied from `src`.
/// This essentially calculates floor(log2(x)) for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getexp_pd&expand=2833)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexppd))]
pub unsafe fn _mm256_mask_getexp_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d {
    let input = a.as_f64x4();
    let passthrough = src.as_f64x4();
    let exponents = vgetexppd256(input, passthrough, k);
    transmute(exponents)
}

/// Extract the exponent of each packed double-precision (64-bit) floating-point element in `a` as a double-precision (64-bit) floating-point number representing the integer exponent, under zeromask `k`.
///
/// Lanes whose bit in `k` is clear are zeroed in the result.
/// This essentially calculates floor(log2(x)) for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getexp_pd&expand=2834)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexppd))]
pub unsafe fn _mm256_maskz_getexp_pd(k: __mmask8, a: __m256d) -> __m256d {
    let input = a.as_f64x4();
    // Zero vector supplies the value of every masked-off lane.
    let zeroed = _mm256_setzero_pd().as_f64x4();
    let exponents = vgetexppd256(input, zeroed, k);
    transmute(exponents)
}

/// Extract the exponent of each packed double-precision (64-bit) floating-point element in `a` as a double-precision (64-bit) floating-point number representing the integer exponent.
///
/// This essentially calculates floor(log2(x)) for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getexp_pd&expand=2829)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexppd))]
pub unsafe fn _mm_getexp_pd(a: __m128d) -> __m128d {
    let input = a.as_f64x2();
    let passthrough = _mm_setzero_pd().as_f64x2();
    // One mask bit per f64 lane, both set.
    let every_lane = 0b00000011;
    let exponents = vgetexppd128(input, passthrough, every_lane);
    transmute(exponents)
}

/// Extract the exponent of each packed double-precision (64-bit) floating-point element in `a` as a double-precision (64-bit) floating-point number representing the integer exponent, under writemask `k`.
///
/// Lanes whose bit in `k` is clear are copied from `src`.
/// This essentially calculates floor(log2(x)) for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getexp_pd&expand=2830)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexppd))]
pub unsafe fn _mm_mask_getexp_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d {
    let input = a.as_f64x2();
    let passthrough = src.as_f64x2();
    let exponents = vgetexppd128(input, passthrough, k);
    transmute(exponents)
}

/// Extract the exponent of each packed double-precision (64-bit) floating-point element in `a` as a double-precision (64-bit) floating-point number representing the integer exponent, under zeromask `k`.
///
/// Lanes whose bit in `k` is clear are zeroed in the result.
/// This essentially calculates floor(log2(x)) for each element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getexp_pd&expand=2831)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vgetexppd))]
pub unsafe fn _mm_maskz_getexp_pd(k: __mmask8, a: __m128d) -> __m128d {
    let input = a.as_f64x2();
    // Zero vector supplies the value of every masked-off lane.
    let zeroed = _mm_setzero_pd().as_f64x2();
    let exponents = vgetexppd128(input, zeroed, k);
    transmute(exponents)
}

/// Round each packed single-precision (32-bit) floating-point element in `a` to the number of fraction bits specified by `IMM8`, and store the results in dst.
///
/// Rounding is done according to the `IMM8[2:0]` parameter, which can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT` - round to nearest
/// * `_MM_FROUND_TO_NEG_INF` - round down
/// * `_MM_FROUND_TO_POS_INF` - round up
/// * `_MM_FROUND_TO_ZERO` - truncate
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_ps&expand=4784)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_roundscale_ps<const IMM8: i32>(a: __m512) -> __m512 {
    static_assert_imm8!(IMM8);
    transmute(vrndscaleps(
        a.as_f32x16(),
        IMM8,
        _mm512_setzero_ps().as_f32x16(),
        // One mask bit per f32 lane, all sixteen set.
        0b11111111_11111111,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Round each packed single-precision (32-bit) floating-point element in `a` to the number of fraction bits specified by `IMM8`, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// Rounding is done according to the `IMM8[2:0]` parameter, which can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT` - round to nearest
/// * `_MM_FROUND_TO_NEG_INF` - round down
/// * `_MM_FROUND_TO_POS_INF` - round up
/// * `_MM_FROUND_TO_ZERO` - truncate
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_ps&expand=4782)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_roundscale_ps<const IMM8: i32>(
    src: __m512,
    k: __mmask16,
    a: __m512,
) -> __m512 {
    static_assert_imm8!(IMM8);
    transmute(vrndscaleps(
        a.as_f32x16(),
        IMM8,
        src.as_f32x16(),
        k,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Round each packed single-precision (32-bit) floating-point element in `a` to the number of fraction bits specified by `IMM8`, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the `IMM8[2:0]` parameter, which can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT` - round to nearest
/// * `_MM_FROUND_TO_NEG_INF` - round down
/// * `_MM_FROUND_TO_POS_INF` - round up
/// * `_MM_FROUND_TO_ZERO` - truncate
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_ps&expand=4783)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_roundscale_ps<const IMM8: i32>(k: __mmask16, a: __m512) -> __m512 {
    static_assert_imm8!(IMM8);
    transmute(vrndscaleps(
        a.as_f32x16(),
        IMM8,
        // Zero vector supplies the value of every masked-off lane.
        _mm512_setzero_ps().as_f32x16(),
        k,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Round each packed single-precision (32-bit) floating-point element in `a` to the number of fraction bits specified by `IMM8`, and store the results in dst.
///
/// Rounding is done according to the `IMM8[2:0]` parameter, which can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT` - round to nearest
/// * `_MM_FROUND_TO_NEG_INF` - round down
/// * `_MM_FROUND_TO_POS_INF` - round up
/// * `_MM_FROUND_TO_ZERO` - truncate
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_roundscale_ps&expand=4781)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 250))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm256_roundscale_ps<const IMM8: i32>(a: __m256) -> __m256 {
    static_assert_imm8!(IMM8);
    transmute(vrndscaleps256(
        a.as_f32x8(),
        IMM8,
        _mm256_setzero_ps().as_f32x8(),
        // One mask bit per f32 lane, all eight set.
        0b11111111,
    ))
}

/// Round each packed single-precision (32-bit) floating-point element in `a` to the number of fraction bits specified by `IMM8`, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// Rounding is done according to the `IMM8[2:0]` parameter, which can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT` - round to nearest
/// * `_MM_FROUND_TO_NEG_INF` - round down
/// * `_MM_FROUND_TO_POS_INF` - round up
/// * `_MM_FROUND_TO_ZERO` - truncate
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_roundscale_ps&expand=4779)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_mask_roundscale_ps<const IMM8: i32>(
    src: __m256,
    k: __mmask8,
    a: __m256,
) -> __m256 {
    static_assert_imm8!(IMM8);
    transmute(vrndscaleps256(a.as_f32x8(), IMM8, src.as_f32x8(), k))
}

/// Round each packed single-precision (32-bit) floating-point element in `a` to the number of fraction bits specified by `IMM8`, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to the `IMM8[2:0]` parameter, which can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT` - round to nearest
/// * `_MM_FROUND_TO_NEG_INF` - round down
/// * `_MM_FROUND_TO_POS_INF` - round up
/// * `_MM_FROUND_TO_ZERO` - truncate
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_roundscale_ps&expand=4780)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_maskz_roundscale_ps<const IMM8: i32>(k: __mmask8, a: __m256) -> __m256 {
    static_assert_imm8!(IMM8);
    transmute(vrndscaleps256(
        a.as_f32x8(),
        IMM8,
        // Zero vector supplies the value of every masked-off lane.
        _mm256_setzero_ps().as_f32x8(),
        k,
    ))
}

/// Rounds each packed single-precision (32-bit) floating-point element of `a` to the number of
/// fraction bits given by `IMM8` and returns the result.
///
/// The rounding mode is taken from `IMM8[2:0]` and can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT` - round to nearest
/// * `_MM_FROUND_TO_NEG_INF` - round down
/// * `_MM_FROUND_TO_POS_INF` - round up
/// * `_MM_FROUND_TO_ZERO` - truncate
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_ps&expand=4778)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 250))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm_roundscale_ps<const IMM8: i32>(a: __m128) -> __m128 {
    static_assert_imm8!(IMM8);
    // Unmasked form: the four low mask bits are set and a zeroed vector is the pass-through.
    transmute(vrndscaleps128(
        a.as_f32x4(),
        IMM8,
        _mm_setzero_ps().as_f32x4(),
        0b00001111,
    ))
}

/// Rounds each packed single-precision (32-bit) floating-point element of `a` to the number of
/// fraction bits given by `IMM8` and returns the result under writemask `k`: an element whose
/// mask bit is clear is copied from `src`.
///
/// The rounding mode is taken from `IMM8[2:0]` and can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT` - round to nearest
/// * `_MM_FROUND_TO_NEG_INF` - round down
/// * `_MM_FROUND_TO_POS_INF` - round up
/// * `_MM_FROUND_TO_ZERO` - truncate
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_ps&expand=4776)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_mask_roundscale_ps<const IMM8: i32>(
    src: __m128,
    k: __mmask8,
    a: __m128,
) -> __m128 {
    static_assert_imm8!(IMM8);
    transmute(vrndscaleps128(a.as_f32x4(), IMM8, src.as_f32x4(), k))
}

/// Rounds each packed single-precision (32-bit) floating-point element of `a` to the number of
/// fraction bits given by `IMM8` and returns the result under zeromask `k`: an element whose
/// mask bit is clear is zeroed.
///
/// The rounding mode is taken from `IMM8[2:0]` and can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT` - round to nearest
/// * `_MM_FROUND_TO_NEG_INF` - round down
/// * `_MM_FROUND_TO_POS_INF` - round up
/// * `_MM_FROUND_TO_ZERO` - truncate
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_ps&expand=4777)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_maskz_roundscale_ps<const IMM8: i32>(k: __mmask8, a: __m128) -> __m128 {
    static_assert_imm8!(IMM8);
    // Zero-masking is expressed by passing an all-zero pass-through vector.
    transmute(vrndscaleps128(
        a.as_f32x4(),
        IMM8,
        _mm_setzero_ps().as_f32x4(),
        k,
    ))
}

/// Rounds each packed double-precision (64-bit) floating-point element of `a` to the number of
/// fraction bits given by `IMM8` and returns the result.
///
/// The rounding mode is taken from `IMM8[2:0]` and can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT` - round to nearest
/// * `_MM_FROUND_TO_NEG_INF` - round down
/// * `_MM_FROUND_TO_POS_INF` - round up
/// * `_MM_FROUND_TO_ZERO` - truncate
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_pd&expand=4775)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_roundscale_pd<const IMM8: i32>(a: __m512d) -> __m512d {
    static_assert_imm8!(IMM8);
    // Unmasked form: all eight mask bits are set and a zeroed vector is the pass-through.
    transmute(vrndscalepd(
        a.as_f64x8(),
        IMM8,
        _mm512_setzero_pd().as_f64x8(),
        0b11111111,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Rounds each packed double-precision (64-bit) floating-point element of `a` to the number of
/// fraction bits given by `IMM8` and returns the result under writemask `k`: an element whose
/// mask bit is clear is copied from `src`.
///
/// The rounding mode is taken from `IMM8[2:0]` and can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT` - round to nearest
/// * `_MM_FROUND_TO_NEG_INF` - round down
/// * `_MM_FROUND_TO_POS_INF` - round up
/// * `_MM_FROUND_TO_ZERO` - truncate
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_pd&expand=4773)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_roundscale_pd<const IMM8: i32>(
    src: __m512d,
    k: __mmask8,
    a: __m512d,
) -> __m512d {
    static_assert_imm8!(IMM8);
    transmute(vrndscalepd(
        a.as_f64x8(),
        IMM8,
        src.as_f64x8(),
        k,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Rounds each packed double-precision (64-bit) floating-point element of `a` to the number of
/// fraction bits given by `IMM8` and returns the result under zeromask `k`: an element whose
/// mask bit is clear is zeroed.
///
/// The rounding mode is taken from `IMM8[2:0]` and can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT` - round to nearest
/// * `_MM_FROUND_TO_NEG_INF` - round down
/// * `_MM_FROUND_TO_POS_INF` - round up
/// * `_MM_FROUND_TO_ZERO` - truncate
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_pd&expand=4774)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_roundscale_pd<const IMM8: i32>(k: __mmask8, a: __m512d) -> __m512d {
    static_assert_imm8!(IMM8);
    // Zero-masking is expressed by passing an all-zero pass-through vector.
    transmute(vrndscalepd(
        a.as_f64x8(),
        IMM8,
        _mm512_setzero_pd().as_f64x8(),
        k,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Rounds each packed double-precision (64-bit) floating-point element of `a` to the number of
/// fraction bits given by `IMM8` and returns the result.
///
/// The rounding mode is taken from `IMM8[2:0]` and can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT` - round to nearest
/// * `_MM_FROUND_TO_NEG_INF` - round down
/// * `_MM_FROUND_TO_POS_INF` - round up
/// * `_MM_FROUND_TO_ZERO` - truncate
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_roundscale_pd&expand=4772)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
// NOTE: the codegen test uses IMM8 = 16 rather than 0. For this unmasked form, an immediate
// that fits in four bits may let the backend emit the shorter VEX `vroundpd` encoding, which
// `assert_instr(vrndscalepd)` would not match.
#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 16))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm256_roundscale_pd<const IMM8: i32>(a: __m256d) -> __m256d {
    static_assert_imm8!(IMM8);
    let a = a.as_f64x4();
    // Unmasked form: the four low mask bits are set and a zeroed vector is the pass-through.
    let zero = _mm256_setzero_pd().as_f64x4();
    let r = vrndscalepd256(a, IMM8, zero, 0b00001111);
    transmute(r)
}

/// Rounds each packed double-precision (64-bit) floating-point element of `a` to the number of
/// fraction bits given by `IMM8` and returns the result under writemask `k`: an element whose
/// mask bit is clear is copied from `src`.
///
/// The rounding mode is taken from `IMM8[2:0]` and can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT` - round to nearest
/// * `_MM_FROUND_TO_NEG_INF` - round down
/// * `_MM_FROUND_TO_POS_INF` - round up
/// * `_MM_FROUND_TO_ZERO` - truncate
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_roundscale_pd&expand=4770)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_mask_roundscale_pd<const IMM8: i32>(
    src: __m256d,
    k: __mmask8,
    a: __m256d,
) -> __m256d {
    static_assert_imm8!(IMM8);
    transmute(vrndscalepd256(a.as_f64x4(), IMM8, src.as_f64x4(), k))
}

/// Rounds each packed double-precision (64-bit) floating-point element of `a` to the number of
/// fraction bits given by `IMM8` and returns the result under zeromask `k`: an element whose
/// mask bit is clear is zeroed.
///
/// The rounding mode is taken from `IMM8[2:0]` and can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT` - round to nearest
/// * `_MM_FROUND_TO_NEG_INF` - round down
/// * `_MM_FROUND_TO_POS_INF` - round up
/// * `_MM_FROUND_TO_ZERO` - truncate
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_roundscale_pd&expand=4771)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_maskz_roundscale_pd<const IMM8: i32>(k: __mmask8, a: __m256d) -> __m256d {
    static_assert_imm8!(IMM8);
    // Zero-masking is expressed by passing an all-zero pass-through vector.
    transmute(vrndscalepd256(
        a.as_f64x4(),
        IMM8,
        _mm256_setzero_pd().as_f64x4(),
        k,
    ))
}

/// Rounds each packed double-precision (64-bit) floating-point element of `a` to the number of
/// fraction bits given by `IMM8` and returns the result.
///
/// The rounding mode is taken from `IMM8[2:0]` and can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT` - round to nearest
/// * `_MM_FROUND_TO_NEG_INF` - round down
/// * `_MM_FROUND_TO_POS_INF` - round up
/// * `_MM_FROUND_TO_ZERO` - truncate
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_roundscale_pd&expand=4769)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
// NOTE: the codegen test uses IMM8 = 16 rather than 0. For this unmasked form, an immediate
// that fits in four bits may let the backend emit the shorter VEX `vroundpd` encoding, which
// `assert_instr(vrndscalepd)` would not match.
#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 16))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm_roundscale_pd<const IMM8: i32>(a: __m128d) -> __m128d {
    static_assert_imm8!(IMM8);
    let a = a.as_f64x2();
    // Unmasked form: the two low mask bits are set and a zeroed vector is the pass-through.
    let zero = _mm_setzero_pd().as_f64x2();
    let r = vrndscalepd128(a, IMM8, zero, 0b00000011);
    transmute(r)
}

/// Rounds each packed double-precision (64-bit) floating-point element of `a` to the number of
/// fraction bits given by `IMM8` and returns the result under writemask `k`: an element whose
/// mask bit is clear is copied from `src`.
///
/// The rounding mode is taken from `IMM8[2:0]` and can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT` - round to nearest
/// * `_MM_FROUND_TO_NEG_INF` - round down
/// * `_MM_FROUND_TO_POS_INF` - round up
/// * `_MM_FROUND_TO_ZERO` - truncate
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_roundscale_pd&expand=4767)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_mask_roundscale_pd<const IMM8: i32>(
    src: __m128d,
    k: __mmask8,
    a: __m128d,
) -> __m128d {
    static_assert_imm8!(IMM8);
    transmute(vrndscalepd128(a.as_f64x2(), IMM8, src.as_f64x2(), k))
}

/// Rounds each packed double-precision (64-bit) floating-point element of `a` to the number of
/// fraction bits given by `IMM8` and returns the result under zeromask `k`: an element whose
/// mask bit is clear is zeroed.
///
/// The rounding mode is taken from `IMM8[2:0]` and can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT` - round to nearest
/// * `_MM_FROUND_TO_NEG_INF` - round down
/// * `_MM_FROUND_TO_POS_INF` - round up
/// * `_MM_FROUND_TO_ZERO` - truncate
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_roundscale_pd&expand=4768)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_maskz_roundscale_pd<const IMM8: i32>(k: __mmask8, a: __m128d) -> __m128d {
    static_assert_imm8!(IMM8);
    // Zero-masking is expressed by passing an all-zero pass-through vector.
    transmute(vrndscalepd128(
        a.as_f64x2(),
        IMM8,
        _mm_setzero_pd().as_f64x2(),
        k,
    ))
}

/// Scales the packed single-precision (32-bit) floating-point elements of `a` using the values
/// in `b` and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_ps&expand=4883)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscalefps))]
pub unsafe fn _mm512_scalef_ps(a: __m512, b: __m512) -> __m512 {
    let va = a.as_f32x16();
    let vb = b.as_f32x16();
    // Unmasked form: all-ones mask plus a zeroed pass-through vector.
    let zero = _mm512_setzero_ps().as_f32x16();
    let r = vscalefps(va, vb, zero, 0b11111111_11111111, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Scales the packed single-precision (32-bit) floating-point elements of `a` using the values
/// in `b` and returns the result under writemask `k`: an element whose mask bit is clear is
/// copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_ps&expand=4881)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscalefps))]
pub unsafe fn _mm512_mask_scalef_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let va = a.as_f32x16();
    let vb = b.as_f32x16();
    let passthru = src.as_f32x16();
    let r = vscalefps(va, vb, passthru, k, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Scales the packed single-precision (32-bit) floating-point elements of `a` using the values
/// in `b` and returns the result under zeromask `k`: an element whose mask bit is clear is
/// zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_ps&expand=4882)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscalefps))]
pub unsafe fn _mm512_maskz_scalef_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let va = a.as_f32x16();
    let vb = b.as_f32x16();
    // Zero-masking is expressed by passing an all-zero pass-through vector.
    let zero = _mm512_setzero_ps().as_f32x16();
    let r = vscalefps(va, vb, zero, k, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Scales the packed single-precision (32-bit) floating-point elements of `a` using the values
/// in `b` and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_scalef_ps&expand=4880)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefps))]
pub unsafe fn _mm256_scalef_ps(a: __m256, b: __m256) -> __m256 {
    let va = a.as_f32x8();
    let vb = b.as_f32x8();
    // Unmasked form: all-ones mask plus a zeroed pass-through vector.
    let zero = _mm256_setzero_ps().as_f32x8();
    let r = vscalefps256(va, vb, zero, 0b11111111);
    transmute(r)
}

/// Scales the packed single-precision (32-bit) floating-point elements of `a` using the values
/// in `b` and returns the result under writemask `k`: an element whose mask bit is clear is
/// copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_scalef_ps&expand=4878)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefps))]
pub unsafe fn _mm256_mask_scalef_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 {
    let va = a.as_f32x8();
    let vb = b.as_f32x8();
    let passthru = src.as_f32x8();
    let r = vscalefps256(va, vb, passthru, k);
    transmute(r)
}

/// Scales the packed single-precision (32-bit) floating-point elements of `a` using the values
/// in `b` and returns the result under zeromask `k`: an element whose mask bit is clear is
/// zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_scalef_ps&expand=4879)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefps))]
pub unsafe fn _mm256_maskz_scalef_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
    let va = a.as_f32x8();
    let vb = b.as_f32x8();
    // Zero-masking is expressed by passing an all-zero pass-through vector.
    let zero = _mm256_setzero_ps().as_f32x8();
    let r = vscalefps256(va, vb, zero, k);
    transmute(r)
}

/// Scales the packed single-precision (32-bit) floating-point elements of `a` using the values
/// in `b` and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_ps&expand=4877)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefps))]
pub unsafe fn _mm_scalef_ps(a: __m128, b: __m128) -> __m128 {
    let va = a.as_f32x4();
    let vb = b.as_f32x4();
    // Unmasked form: the four low mask bits are set and a zeroed vector is the pass-through.
    let zero = _mm_setzero_ps().as_f32x4();
    let r = vscalefps128(va, vb, zero, 0b00001111);
    transmute(r)
}

/// Scales the packed single-precision (32-bit) floating-point elements of `a` using the values
/// in `b` and returns the result under writemask `k`: an element whose mask bit is clear is
/// copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_ps&expand=4875)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefps))]
pub unsafe fn _mm_mask_scalef_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let va = a.as_f32x4();
    let vb = b.as_f32x4();
    let passthru = src.as_f32x4();
    let r = vscalefps128(va, vb, passthru, k);
    transmute(r)
}

/// Scales the packed single-precision (32-bit) floating-point elements of `a` using the values
/// in `b` and returns the result under zeromask `k`: an element whose mask bit is clear is
/// zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_ps&expand=4876)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefps))]
pub unsafe fn _mm_maskz_scalef_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let va = a.as_f32x4();
    let vb = b.as_f32x4();
    // Zero-masking is expressed by passing an all-zero pass-through vector.
    let zero = _mm_setzero_ps().as_f32x4();
    let r = vscalefps128(va, vb, zero, k);
    transmute(r)
}

/// Scales the packed double-precision (64-bit) floating-point elements of `a` using the values
/// in `b` and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_pd&expand=4874)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscalefpd))]
pub unsafe fn _mm512_scalef_pd(a: __m512d, b: __m512d) -> __m512d {
    let va = a.as_f64x8();
    let vb = b.as_f64x8();
    // Unmasked form: all-ones mask plus a zeroed pass-through vector.
    let zero = _mm512_setzero_pd().as_f64x8();
    let r = vscalefpd(va, vb, zero, 0b11111111, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Scales the packed double-precision (64-bit) floating-point elements of `a` using the values
/// in `b` and returns the result under writemask `k`: an element whose mask bit is clear is
/// copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_pd&expand=4872)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscalefpd))]
pub unsafe fn _mm512_mask_scalef_pd(src: __m512d, k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let va = a.as_f64x8();
    let vb = b.as_f64x8();
    let passthru = src.as_f64x8();
    let r = vscalefpd(va, vb, passthru, k, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Scales the packed double-precision (64-bit) floating-point elements of `a` using the values
/// in `b` and returns the result under zeromask `k`: an element whose mask bit is clear is
/// zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_pd&expand=4873)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscalefpd))]
pub unsafe fn _mm512_maskz_scalef_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let va = a.as_f64x8();
    let vb = b.as_f64x8();
    // Zero-masking is expressed by passing an all-zero pass-through vector.
    let zero = _mm512_setzero_pd().as_f64x8();
    let r = vscalefpd(va, vb, zero, k, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Scales the packed double-precision (64-bit) floating-point elements of `a` using the values
/// in `b` and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_scalef_pd&expand=4871)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefpd))]
pub unsafe fn _mm256_scalef_pd(a: __m256d, b: __m256d) -> __m256d {
    let va = a.as_f64x4();
    let vb = b.as_f64x4();
    // Unmasked form: the four low mask bits are set and a zeroed vector is the pass-through.
    let zero = _mm256_setzero_pd().as_f64x4();
    let r = vscalefpd256(va, vb, zero, 0b00001111);
    transmute(r)
}

/// Scales the packed double-precision (64-bit) floating-point elements of `a` using the values
/// in `b` and returns the result under writemask `k`: an element whose mask bit is clear is
/// copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_scalef_pd&expand=4869)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefpd))]
pub unsafe fn _mm256_mask_scalef_pd(src: __m256d, k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
    let va = a.as_f64x4();
    let vb = b.as_f64x4();
    let passthru = src.as_f64x4();
    let r = vscalefpd256(va, vb, passthru, k);
    transmute(r)
}

/// Scales the packed double-precision (64-bit) floating-point elements of `a` using the values
/// in `b` and returns the result under zeromask `k`: an element whose mask bit is clear is
/// zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_scalef_pd&expand=4870)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefpd))]
pub unsafe fn _mm256_maskz_scalef_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
    let va = a.as_f64x4();
    let vb = b.as_f64x4();
    // Zero-masking is expressed by passing an all-zero pass-through vector.
    let zero = _mm256_setzero_pd().as_f64x4();
    let r = vscalefpd256(va, vb, zero, k);
    transmute(r)
}

/// Scales the packed double-precision (64-bit) floating-point elements of `a` using the values
/// in `b` and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_scalef_pd&expand=4868)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefpd))]
pub unsafe fn _mm_scalef_pd(a: __m128d, b: __m128d) -> __m128d {
    let va = a.as_f64x2();
    let vb = b.as_f64x2();
    // Unmasked form: the two low mask bits are set and a zeroed vector is the pass-through.
    let zero = _mm_setzero_pd().as_f64x2();
    let r = vscalefpd128(va, vb, zero, 0b00000011);
    transmute(r)
}

/// Scales the packed double-precision (64-bit) floating-point elements of `a` using the values
/// in `b` and returns the result under writemask `k`: an element whose mask bit is clear is
/// copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_scalef_pd&expand=4866)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefpd))]
pub unsafe fn _mm_mask_scalef_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let va = a.as_f64x2();
    let vb = b.as_f64x2();
    let passthru = src.as_f64x2();
    let r = vscalefpd128(va, vb, passthru, k);
    transmute(r)
}

/// Scales the packed double-precision (64-bit) floating-point elements of `a` using the values
/// in `b` and returns the result under zeromask `k`: an element whose mask bit is clear is
/// zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_scalef_pd&expand=4867)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vscalefpd))]
pub unsafe fn _mm_maskz_scalef_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let va = a.as_f64x2();
    let vb = b.as_f64x2();
    // Zero-masking is expressed by passing an all-zero pass-through vector.
    let zero = _mm_setzero_pd().as_f64x2();
    let r = vscalefpd128(va, vb, zero, k);
    transmute(r)
}

/// Fixes up the packed single-precision (32-bit) floating-point elements of `a` and `b` using
/// the packed 32-bit integers in `c`, and returns the result. `IMM8` sets the required flags
/// reporting.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_ps&expand=2499)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_fixupimm_ps<const IMM8: i32>(a: __m512, b: __m512, c: __m512i) -> __m512 {
    static_assert_imm8!(IMM8);
    // Unmasked form: every one of the sixteen mask bits is set.
    let fixed = vfixupimmps(
        a.as_f32x16(),
        b.as_f32x16(),
        c.as_i32x16(),
        IMM8,
        0b11111111_11111111,
        _MM_FROUND_CUR_DIRECTION,
    );
    transmute(fixed)
}

/// Fixes up the packed single-precision (32-bit) floating-point elements of `a` and `b` using
/// the packed 32-bit integers in `c`, and returns the result under writemask `k`: an element
/// whose mask bit is clear is copied from `a`. `IMM8` sets the required flags reporting.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_ps&expand=2500)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_fixupimm_ps<const IMM8: i32>(
    a: __m512,
    k: __mmask16,
    b: __m512,
    c: __m512i,
) -> __m512 {
    static_assert_imm8!(IMM8);
    let fixed = vfixupimmps(
        a.as_f32x16(),
        b.as_f32x16(),
        c.as_i32x16(),
        IMM8,
        k,
        _MM_FROUND_CUR_DIRECTION,
    );
    transmute(fixed)
}

/// Fixes up the packed single-precision (32-bit) floating-point elements of `a` and `b` using
/// the packed 32-bit integers in `c`, and returns the result under zeromask `k`: an element
/// whose mask bit is clear is zeroed. `IMM8` sets the required flags reporting.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_ps&expand=2501)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_maskz_fixupimm_ps<const IMM8: i32>(
    k: __mmask16,
    a: __m512,
    b: __m512,
    c: __m512i,
) -> __m512 {
    static_assert_imm8!(IMM8);
    // The `z`-suffixed intrinsic is the one used for the zero-masking form.
    let fixed = vfixupimmpsz(
        a.as_f32x16(),
        b.as_f32x16(),
        c.as_i32x16(),
        IMM8,
        k,
        _MM_FROUND_CUR_DIRECTION,
    );
    transmute(fixed)
}

/// Fixes up the packed single-precision (32-bit) floating-point elements of `a` and `b` using
/// the packed 32-bit integers in `c`, and returns the result. `IMM8` sets the required flags
/// reporting.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fixupimm_ps&expand=2496)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_fixupimm_ps<const IMM8: i32>(a: __m256, b: __m256, c: __m256i) -> __m256 {
    static_assert_imm8!(IMM8);
    // Unmasked form: every one of the eight mask bits is set.
    transmute(vfixupimmps256(
        a.as_f32x8(),
        b.as_f32x8(),
        c.as_i32x8(),
        IMM8,
        0b11111111,
    ))
}

/// Fixes up the packed single-precision (32-bit) floating-point elements of `a` and `b` using
/// the packed 32-bit integers in `c`, and returns the result under writemask `k`: an element
/// whose mask bit is clear is copied from `a`. `IMM8` sets the required flags reporting.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fixupimm_ps&expand=2497)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_mask_fixupimm_ps<const IMM8: i32>(
    a: __m256,
    k: __mmask8,
    b: __m256,
    c: __m256i,
) -> __m256 {
    static_assert_imm8!(IMM8);
    transmute(vfixupimmps256(
        a.as_f32x8(),
        b.as_f32x8(),
        c.as_i32x8(),
        IMM8,
        k,
    ))
}

/// Fixes up the packed single-precision (32-bit) floating-point elements of `a` and `b` using
/// the packed 32-bit integers in `c`, and returns the result under zeromask `k`: an element
/// whose mask bit is clear is zeroed. `IMM8` sets the required flags reporting.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fixupimm_ps&expand=2498)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_maskz_fixupimm_ps<const IMM8: i32>(
    k: __mmask8,
    a: __m256,
    b: __m256,
    c: __m256i,
) -> __m256 {
    static_assert_imm8!(IMM8);
    // The `z`-suffixed intrinsic is the one used for the zero-masking form.
    transmute(vfixupimmpsz256(
        a.as_f32x8(),
        b.as_f32x8(),
        c.as_i32x8(),
        IMM8,
        k,
    ))
}

/// Fixes up the packed single-precision (32-bit) floating-point elements of `a` and `b` using
/// the packed 32-bit integers in `c`, and returns the result. `IMM8` sets the required flags
/// reporting.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fixupimm_ps&expand=2493)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_fixupimm_ps<const IMM8: i32>(a: __m128, b: __m128, c: __m128i) -> __m128 {
    static_assert_imm8!(IMM8);
    // Unmasked form: the four low mask bits are set.
    transmute(vfixupimmps128(
        a.as_f32x4(),
        b.as_f32x4(),
        c.as_i32x4(),
        IMM8,
        0b00001111,
    ))
}

/// Fixes up the packed single-precision (32-bit) floating-point elements of `a` and `b` using
/// the packed 32-bit integers in `c`, and returns the result under writemask `k`: an element
/// whose mask bit is clear is copied from `a`. `IMM8` sets the required flags reporting.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fixupimm_ps&expand=2494)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_fixupimm_ps<const IMM8: i32>(
    a: __m128,
    k: __mmask8,
    b: __m128,
    c: __m128i,
) -> __m128 {
    static_assert_imm8!(IMM8);
    transmute(vfixupimmps128(
        a.as_f32x4(),
        b.as_f32x4(),
        c.as_i32x4(),
        IMM8,
        k,
    ))
}

/// Fixes up the packed single-precision (32-bit) floating-point elements of `a` and `b` using
/// the packed 32-bit integers in `c`, and returns the result under zeromask `k`: an element
/// whose mask bit is clear is zeroed. `IMM8` sets the required flags reporting.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fixupimm_ps&expand=2495)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_maskz_fixupimm_ps<const IMM8: i32>(
    k: __mmask8,
    a: __m128,
    b: __m128,
    c: __m128i,
) -> __m128 {
    static_assert_imm8!(IMM8);
    // The `z`-suffixed intrinsic is the one used for the zero-masking form.
    transmute(vfixupimmpsz128(
        a.as_f32x4(),
        b.as_f32x4(),
        c.as_i32x4(),
        IMM8,
        k,
    ))
}

/// Performs a fix-up of the packed double-precision (64-bit) floating-point elements in `a` and `b`, using the packed 64-bit integers in `c`, and returns the results. `IMM8` is used to set the required flags reporting.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_pd&expand=2490)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_fixupimm_pd<const IMM8: i32>(a: __m512d, b: __m512d, c: __m512i) -> __m512d {
    static_assert_imm8!(IMM8);
    let (va, vb, vc) = (a.as_f64x8(), b.as_f64x8(), c.as_i64x8());
    // Unmasked form: the constant mask has one bit set for each of the eight `f64` lanes.
    transmute(vfixupimmpd(va, vb, vc, IMM8, 0b11111111, _MM_FROUND_CUR_DIRECTION))
}

/// Performs a fix-up of the packed double-precision (64-bit) floating-point elements in `a` and `b`, using the packed 64-bit integers in `c`, and returns the results using writemask `k` (elements are copied from `a` when the corresponding mask bit is not set). `IMM8` is used to set the required flags reporting.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_pd&expand=2491)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_fixupimm_pd<const IMM8: i32>(
    a: __m512d,
    k: __mmask8,
    b: __m512d,
    c: __m512i,
) -> __m512d {
    static_assert_imm8!(IMM8);
    let (va, vb, vc) = (a.as_f64x8(), b.as_f64x8(), c.as_i64x8());
    // The caller's writemask `k` is forwarded unchanged to the underlying call.
    transmute(vfixupimmpd(va, vb, vc, IMM8, k, _MM_FROUND_CUR_DIRECTION))
}

/// Performs a fix-up of the packed double-precision (64-bit) floating-point elements in `a` and `b`, using the packed 64-bit integers in `c`, and returns the results using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). `IMM8` is used to set the required flags reporting.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_pd&expand=2492)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_maskz_fixupimm_pd<const IMM8: i32>(
    k: __mmask8,
    a: __m512d,
    b: __m512d,
    c: __m512i,
) -> __m512d {
    static_assert_imm8!(IMM8);
    let (va, vb, vc) = (a.as_f64x8(), b.as_f64x8(), c.as_i64x8());
    // Zero-masking is delegated to the dedicated `vfixupimmpdz` call, which receives `k`.
    transmute(vfixupimmpdz(va, vb, vc, IMM8, k, _MM_FROUND_CUR_DIRECTION))
}

/// Performs a fix-up of the packed double-precision (64-bit) floating-point elements in `a` and `b`, using the packed 64-bit integers in `c`, and returns the results. `IMM8` is used to set the required flags reporting.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_fixupimm_pd&expand=2487)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_fixupimm_pd<const IMM8: i32>(a: __m256d, b: __m256d, c: __m256i) -> __m256d {
    static_assert_imm8!(IMM8);
    let (va, vb, vc) = (a.as_f64x4(), b.as_f64x4(), c.as_i64x4());
    // Unmasked form: the constant mask has one bit set for each of the four `f64` lanes.
    transmute(vfixupimmpd256(va, vb, vc, IMM8, 0b00001111))
}

/// Performs a fix-up of the packed double-precision (64-bit) floating-point elements in `a` and `b`, using the packed 64-bit integers in `c`, and returns the results using writemask `k` (elements are copied from `a` when the corresponding mask bit is not set). `IMM8` is used to set the required flags reporting.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_fixupimm_pd&expand=2488)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_mask_fixupimm_pd<const IMM8: i32>(
    a: __m256d,
    k: __mmask8,
    b: __m256d,
    c: __m256i,
) -> __m256d {
    static_assert_imm8!(IMM8);
    let (va, vb, vc) = (a.as_f64x4(), b.as_f64x4(), c.as_i64x4());
    // The caller's writemask `k` is forwarded unchanged to the underlying call.
    transmute(vfixupimmpd256(va, vb, vc, IMM8, k))
}

/// Performs a fix-up of the packed double-precision (64-bit) floating-point elements in `a` and `b`, using the packed 64-bit integers in `c`, and returns the results using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). `IMM8` is used to set the required flags reporting.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_fixupimm_pd&expand=2489)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_maskz_fixupimm_pd<const IMM8: i32>(
    k: __mmask8,
    a: __m256d,
    b: __m256d,
    c: __m256i,
) -> __m256d {
    static_assert_imm8!(IMM8);
    let (va, vb, vc) = (a.as_f64x4(), b.as_f64x4(), c.as_i64x4());
    // Zero-masking is delegated to the dedicated `vfixupimmpdz256` call, which receives `k`.
    transmute(vfixupimmpdz256(va, vb, vc, IMM8, k))
}

/// Performs a fix-up of the packed double-precision (64-bit) floating-point elements in `a` and `b`, using the packed 64-bit integers in `c`, and returns the results. `IMM8` is used to set the required flags reporting.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fixupimm_pd&expand=2484)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_fixupimm_pd<const IMM8: i32>(a: __m128d, b: __m128d, c: __m128i) -> __m128d {
    static_assert_imm8!(IMM8);
    let (va, vb, vc) = (a.as_f64x2(), b.as_f64x2(), c.as_i64x2());
    // Unmasked form: the constant mask has one bit set for each of the two `f64` lanes.
    transmute(vfixupimmpd128(va, vb, vc, IMM8, 0b00000011))
}

/// Performs a fix-up of the packed double-precision (64-bit) floating-point elements in `a` and `b`, using the packed 64-bit integers in `c`, and returns the results using writemask `k` (elements are copied from `a` when the corresponding mask bit is not set). `IMM8` is used to set the required flags reporting.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_fixupimm_pd&expand=2485)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_fixupimm_pd<const IMM8: i32>(
    a: __m128d,
    k: __mmask8,
    b: __m128d,
    c: __m128i,
) -> __m128d {
    static_assert_imm8!(IMM8);
    let (va, vb, vc) = (a.as_f64x2(), b.as_f64x2(), c.as_i64x2());
    // The caller's writemask `k` is forwarded unchanged to the underlying call.
    transmute(vfixupimmpd128(va, vb, vc, IMM8, k))
}

/// Performs a fix-up of the packed double-precision (64-bit) floating-point elements in `a` and `b`, using the packed 64-bit integers in `c`, and returns the results using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). `IMM8` is used to set the required flags reporting.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_fixupimm_pd&expand=2486)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_maskz_fixupimm_pd<const IMM8: i32>(
    k: __mmask8,
    a: __m128d,
    b: __m128d,
    c: __m128i,
) -> __m128d {
    static_assert_imm8!(IMM8);
    let (va, vb, vc) = (a.as_f64x2(), b.as_f64x2(), c.as_i64x2());
    // Zero-masking is delegated to the dedicated `vfixupimmpdz128` call, which receives `k`.
    transmute(vfixupimmpdz128(va, vb, vc, IMM8, k))
}

/// Bitwise ternary logic capable of implementing any three-operand binary function; the specific binary function is selected by the value of `IMM8`. For each bit in each packed 32-bit integer, the corresponding bits from `a`, `b`, and `c` form a 3-bit index into `IMM8`, and the value at that bit in `IMM8` is written to the corresponding bit of the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi32&expand=5867)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_ternarylogic_epi32<const IMM8: i32>(
    a: __m512i,
    b: __m512i,
    c: __m512i,
) -> __m512i {
    static_assert_imm8!(IMM8);
    // View each operand as sixteen 32-bit lanes and evaluate the truth table in one call.
    transmute(vpternlogd(a.as_i32x16(), b.as_i32x16(), c.as_i32x16(), IMM8))
}

/// Bitwise ternary logic capable of implementing any three-operand binary function; the specific binary function is selected by the value of `IMM8`. For each bit in each packed 32-bit integer, the corresponding bits from `src`, `a`, and `b` form a 3-bit index into `IMM8`, and the value at that bit in `IMM8` is written to the corresponding bit of the result using writemask `k` at 32-bit granularity (32-bit elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi32&expand=5865)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_ternarylogic_epi32<const IMM8: i32>(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    static_assert_imm8!(IMM8);
    // `src` is both the first logic operand and the fallback for lanes whose mask bit is clear.
    let fallback = src.as_i32x16();
    let computed = vpternlogd(fallback, a.as_i32x16(), b.as_i32x16(), IMM8);
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Bitwise ternary logic capable of implementing any three-operand binary function; the specific binary function is selected by the value of `IMM8`. For each bit in each packed 32-bit integer, the corresponding bits from `a`, `b`, and `c` form a 3-bit index into `IMM8`, and the value at that bit in `IMM8` is written to the corresponding bit of the result using zeromask `k` at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi32&expand=5866)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_maskz_ternarylogic_epi32<const IMM8: i32>(
    k: __mmask16,
    a: __m512i,
    b: __m512i,
    c: __m512i,
) -> __m512i {
    static_assert_imm8!(IMM8);
    let computed = vpternlogd(a.as_i32x16(), b.as_i32x16(), c.as_i32x16(), IMM8);
    // Lanes whose mask bit is clear take their value from an all-zero vector.
    let zeroed = _mm512_setzero_si512().as_i32x16();
    transmute(simd_select_bitmask(k, computed, zeroed))
}

/// Bitwise ternary logic capable of implementing any three-operand binary function; the specific binary function is selected by the value of `IMM8`. For each bit in each packed 32-bit integer, the corresponding bits from `a`, `b`, and `c` form a 3-bit index into `IMM8`, and the value at that bit in `IMM8` is written to the corresponding bit of the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_ternarylogic_epi32&expand=5864)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_ternarylogic_epi32<const IMM8: i32>(
    a: __m256i,
    b: __m256i,
    c: __m256i,
) -> __m256i {
    static_assert_imm8!(IMM8);
    // View each operand as eight 32-bit lanes and evaluate the truth table in one call.
    transmute(vpternlogd256(a.as_i32x8(), b.as_i32x8(), c.as_i32x8(), IMM8))
}

/// Bitwise ternary logic capable of implementing any three-operand binary function; the specific binary function is selected by the value of `IMM8`. For each bit in each packed 32-bit integer, the corresponding bits from `src`, `a`, and `b` form a 3-bit index into `IMM8`, and the value at that bit in `IMM8` is written to the corresponding bit of the result using writemask `k` at 32-bit granularity (32-bit elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_ternarylogic_epi32&expand=5862)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_mask_ternarylogic_epi32<const IMM8: i32>(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    static_assert_imm8!(IMM8);
    // `src` is both the first logic operand and the fallback for lanes whose mask bit is clear.
    let fallback = src.as_i32x8();
    let computed = vpternlogd256(fallback, a.as_i32x8(), b.as_i32x8(), IMM8);
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Bitwise ternary logic capable of implementing any three-operand binary function; the specific binary function is selected by the value of `IMM8`. For each bit in each packed 32-bit integer, the corresponding bits from `a`, `b`, and `c` form a 3-bit index into `IMM8`, and the value at that bit in `IMM8` is written to the corresponding bit of the result using zeromask `k` at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_ternarylogic_epi32&expand=5863)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_maskz_ternarylogic_epi32<const IMM8: i32>(
    k: __mmask8,
    a: __m256i,
    b: __m256i,
    c: __m256i,
) -> __m256i {
    static_assert_imm8!(IMM8);
    let computed = vpternlogd256(a.as_i32x8(), b.as_i32x8(), c.as_i32x8(), IMM8);
    // Lanes whose mask bit is clear take their value from an all-zero vector.
    let zeroed = _mm256_setzero_si256().as_i32x8();
    transmute(simd_select_bitmask(k, computed, zeroed))
}

/// Bitwise ternary logic capable of implementing any three-operand binary function; the specific binary function is selected by the value of `IMM8`. For each bit in each packed 32-bit integer, the corresponding bits from `a`, `b`, and `c` form a 3-bit index into `IMM8`, and the value at that bit in `IMM8` is written to the corresponding bit of the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ternarylogic_epi32&expand=5861)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_ternarylogic_epi32<const IMM8: i32>(
    a: __m128i,
    b: __m128i,
    c: __m128i,
) -> __m128i {
    static_assert_imm8!(IMM8);
    // View each operand as four 32-bit lanes and evaluate the truth table in one call.
    transmute(vpternlogd128(a.as_i32x4(), b.as_i32x4(), c.as_i32x4(), IMM8))
}

/// Bitwise ternary logic capable of implementing any three-operand binary function; the specific binary function is selected by the value of `IMM8`. For each bit in each packed 32-bit integer, the corresponding bits from `src`, `a`, and `b` form a 3-bit index into `IMM8`, and the value at that bit in `IMM8` is written to the corresponding bit of the result using writemask `k` at 32-bit granularity (32-bit elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_ternarylogic_epi32&expand=5859)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_ternarylogic_epi32<const IMM8: i32>(
    src: __m128i,
    k: __mmask8,
    a: __m128i,
    b: __m128i,
) -> __m128i {
    static_assert_imm8!(IMM8);
    // `src` is both the first logic operand and the fallback for lanes whose mask bit is clear.
    let fallback = src.as_i32x4();
    let computed = vpternlogd128(fallback, a.as_i32x4(), b.as_i32x4(), IMM8);
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Bitwise ternary logic capable of implementing any three-operand binary function; the specific binary function is selected by the value of `IMM8`. For each bit in each packed 32-bit integer, the corresponding bits from `a`, `b`, and `c` form a 3-bit index into `IMM8`, and the value at that bit in `IMM8` is written to the corresponding bit of the result using zeromask `k` at 32-bit granularity (32-bit elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_ternarylogic_epi32&expand=5860)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpternlogd, IMM8 = 114))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_maskz_ternarylogic_epi32<const IMM8: i32>(
    k: __mmask8,
    a: __m128i,
    b: __m128i,
    c: __m128i,
) -> __m128i {
    static_assert_imm8!(IMM8);
    let computed = vpternlogd128(a.as_i32x4(), b.as_i32x4(), c.as_i32x4(), IMM8);
    // Lanes whose mask bit is clear take their value from an all-zero vector.
    let zeroed = _mm_setzero_si128().as_i32x4();
    transmute(simd_select_bitmask(k, computed, zeroed))
}

/// Bitwise ternary logic capable of implementing any three-operand binary function; the specific binary function is selected by the value of `IMM8`. For each bit in each packed 64-bit integer, the corresponding bits from `a`, `b`, and `c` form a 3-bit index into `IMM8`, and the value at that bit in `IMM8` is written to the corresponding bit of the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ternarylogic_epi64&expand=5876)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_ternarylogic_epi64<const IMM8: i32>(
    a: __m512i,
    b: __m512i,
    c: __m512i,
) -> __m512i {
    static_assert_imm8!(IMM8);
    // View each operand as eight 64-bit lanes and evaluate the truth table in one call.
    transmute(vpternlogq(a.as_i64x8(), b.as_i64x8(), c.as_i64x8(), IMM8))
}

/// Bitwise ternary logic capable of implementing any three-operand binary function; the specific binary function is selected by the value of `IMM8`. For each bit in each packed 64-bit integer, the corresponding bits from `src`, `a`, and `b` form a 3-bit index into `IMM8`, and the value at that bit in `IMM8` is written to the corresponding bit of the result using writemask `k` at 64-bit granularity (64-bit elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ternarylogic_epi64&expand=5874)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_ternarylogic_epi64<const IMM8: i32>(
    src: __m512i,
    k: __mmask8,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    static_assert_imm8!(IMM8);
    // `src` is both the first logic operand and the fallback for lanes whose mask bit is clear.
    let fallback = src.as_i64x8();
    let computed = vpternlogq(fallback, a.as_i64x8(), b.as_i64x8(), IMM8);
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Bitwise ternary logic capable of implementing any three-operand binary function; the specific binary function is selected by the value of `IMM8`. For each bit in each packed 64-bit integer, the corresponding bits from `a`, `b`, and `c` form a 3-bit index into `IMM8`, and the value at that bit in `IMM8` is written to the corresponding bit of the result using zeromask `k` at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ternarylogic_epi64&expand=5875)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_maskz_ternarylogic_epi64<const IMM8: i32>(
    k: __mmask8,
    a: __m512i,
    b: __m512i,
    c: __m512i,
) -> __m512i {
    static_assert_imm8!(IMM8);
    let computed = vpternlogq(a.as_i64x8(), b.as_i64x8(), c.as_i64x8(), IMM8);
    // Lanes whose mask bit is clear take their value from an all-zero vector.
    let zeroed = _mm512_setzero_si512().as_i64x8();
    transmute(simd_select_bitmask(k, computed, zeroed))
}

/// Bitwise ternary logic capable of implementing any three-operand binary function; the specific binary function is selected by the value of `IMM8`. For each bit in each packed 64-bit integer, the corresponding bits from `a`, `b`, and `c` form a 3-bit index into `IMM8`, and the value at that bit in `IMM8` is written to the corresponding bit of the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_ternarylogic_epi64&expand=5873)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_ternarylogic_epi64<const IMM8: i32>(
    a: __m256i,
    b: __m256i,
    c: __m256i,
) -> __m256i {
    static_assert_imm8!(IMM8);
    // View each operand as four 64-bit lanes and evaluate the truth table in one call.
    transmute(vpternlogq256(a.as_i64x4(), b.as_i64x4(), c.as_i64x4(), IMM8))
}

/// Bitwise ternary logic capable of implementing any three-operand binary function; the specific binary function is selected by the value of `IMM8`. For each bit in each packed 64-bit integer, the corresponding bits from `src`, `a`, and `b` form a 3-bit index into `IMM8`, and the value at that bit in `IMM8` is written to the corresponding bit of the result using writemask `k` at 64-bit granularity (64-bit elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_ternarylogic_epi64&expand=5871)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_mask_ternarylogic_epi64<const IMM8: i32>(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    static_assert_imm8!(IMM8);
    // `src` is both the first logic operand and the fallback for lanes whose mask bit is clear.
    let fallback = src.as_i64x4();
    let computed = vpternlogq256(fallback, a.as_i64x4(), b.as_i64x4(), IMM8);
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Bitwise ternary logic capable of implementing any three-operand binary function; the specific binary function is selected by the value of `IMM8`. For each bit in each packed 64-bit integer, the corresponding bits from `a`, `b`, and `c` form a 3-bit index into `IMM8`, and the value at that bit in `IMM8` is written to the corresponding bit of the result using zeromask `k` at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_ternarylogic_epi64&expand=5872)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_maskz_ternarylogic_epi64<const IMM8: i32>(
    k: __mmask8,
    a: __m256i,
    b: __m256i,
    c: __m256i,
) -> __m256i {
    static_assert_imm8!(IMM8);
    let computed = vpternlogq256(a.as_i64x4(), b.as_i64x4(), c.as_i64x4(), IMM8);
    // Lanes whose mask bit is clear take their value from an all-zero vector.
    let zeroed = _mm256_setzero_si256().as_i64x4();
    transmute(simd_select_bitmask(k, computed, zeroed))
}

/// Bitwise ternary logic capable of implementing any three-operand binary function; the specific binary function is selected by the value of `IMM8`. For each bit in each packed 64-bit integer, the corresponding bits from `a`, `b`, and `c` form a 3-bit index into `IMM8`, and the value at that bit in `IMM8` is written to the corresponding bit of the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ternarylogic_epi64&expand=5870)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_ternarylogic_epi64<const IMM8: i32>(
    a: __m128i,
    b: __m128i,
    c: __m128i,
) -> __m128i {
    static_assert_imm8!(IMM8);
    // View each operand as two 64-bit lanes and evaluate the truth table in one call.
    transmute(vpternlogq128(a.as_i64x2(), b.as_i64x2(), c.as_i64x2(), IMM8))
}

/// Bitwise ternary logic capable of implementing any three-operand binary function; the specific binary function is selected by the value of `IMM8`. For each bit in each packed 64-bit integer, the corresponding bits from `src`, `a`, and `b` form a 3-bit index into `IMM8`, and the value at that bit in `IMM8` is written to the corresponding bit of the result using writemask `k` at 64-bit granularity (64-bit elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_ternarylogic_epi64&expand=5868)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_ternarylogic_epi64<const IMM8: i32>(
    src: __m128i,
    k: __mmask8,
    a: __m128i,
    b: __m128i,
) -> __m128i {
    static_assert_imm8!(IMM8);
    // `src` is both the first logic operand and the fallback for lanes whose mask bit is clear.
    let fallback = src.as_i64x2();
    let computed = vpternlogq128(fallback, a.as_i64x2(), b.as_i64x2(), IMM8);
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Bitwise ternary logic capable of implementing any three-operand binary function; the specific binary function is selected by the value of `IMM8`. For each bit in each packed 64-bit integer, the corresponding bits from `a`, `b`, and `c` form a 3-bit index into `IMM8`, and the value at that bit in `IMM8` is written to the corresponding bit of the result using zeromask `k` at 64-bit granularity (64-bit elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_ternarylogic_epi64&expand=5869)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpternlogq, IMM8 = 114))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_maskz_ternarylogic_epi64<const IMM8: i32>(
    k: __mmask8,
    a: __m128i,
    b: __m128i,
    c: __m128i,
) -> __m128i {
    static_assert_imm8!(IMM8);
    let computed = vpternlogq128(a.as_i64x2(), b.as_i64x2(), c.as_i64x2(), IMM8);
    // Lanes whose mask bit is clear take their value from an all-zero vector.
    let zeroed = _mm_setzero_si128().as_i64x2();
    transmute(simd_select_bitmask(k, computed, zeroed))
}

/// Normalizes the mantissas of the packed single-precision (32-bit) floating-point elements in `a` and returns the results. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by `NORM` (Intel's `interv`) and the sign depends on `SIGN` (Intel's `sc`) and the source sign.\
/// The mantissa is normalized to the interval specified by `NORM`, which can take the following values:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
/// The sign is determined by `SIGN`, which can take the following values:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_ps&expand=2880)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(1, 2)]
pub unsafe fn _mm512_getmant_ps<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    a: __m512,
) -> __m512 {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    // The two selectors are packed into one control value: `SIGN` shifted left by two, OR-ed
    // with `NORM`.
    let control = (SIGN << 2) | NORM;
    // Unmasked form: all sixteen mask bits are set and the third argument is an all-zero vector.
    let zeroed = _mm512_setzero_ps().as_f32x16();
    transmute(vgetmantps(
        a.as_f32x16(),
        control,
        zeroed,
        0b11111111_11111111,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Normalizes the mantissas of the packed single-precision (32-bit) floating-point elements in `a` and returns the results using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by `NORM` (Intel's `interv`) and the sign depends on `SIGN` (Intel's `sc`) and the source sign.\
/// The mantissa is normalized to the interval specified by `NORM`, which can take the following values:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
/// The sign is determined by `SIGN`, which can take the following values:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_ps&expand=2881)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(3, 4)]
pub unsafe fn _mm512_mask_getmant_ps<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    src: __m512,
    k: __mmask16,
    a: __m512,
) -> __m512 {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    // The two selectors are packed into one control value: `SIGN` shifted left by two, OR-ed
    // with `NORM`.
    let control = (SIGN << 2) | NORM;
    // `src` and the writemask `k` are handed straight to the underlying call.
    transmute(vgetmantps(
        a.as_f32x16(),
        control,
        src.as_f32x16(),
        k,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Normalizes the mantissas of the packed single-precision (32-bit) floating-point elements in `a` and returns the results using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by `NORM` (Intel's `interv`) and the sign depends on `SIGN` (Intel's `sc`) and the source sign.\
/// The mantissa is normalized to the interval specified by `NORM`, which can take the following values:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
/// The sign is determined by `SIGN`, which can take the following values:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_ps&expand=2882)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(2, 3)]
pub unsafe fn _mm512_maskz_getmant_ps<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    k: __mmask16,
    a: __m512,
) -> __m512 {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    // The two selectors are packed into one control value: `SIGN` shifted left by two, OR-ed
    // with `NORM`.
    let control = (SIGN << 2) | NORM;
    // Zero-masking: an all-zero vector is passed in the third position together with `k`.
    let zeroed = _mm512_setzero_ps().as_f32x16();
    transmute(vgetmantps(a.as_f32x16(), control, zeroed, k, _MM_FROUND_CUR_DIRECTION))
}

/// Extracts the normalized mantissa of each packed single-precision (32-bit) floating-point element in `a` and returns the results. Every lane is computed as ±(2^k)*|x.significand|, where the exponent k depends on the interval chosen through `NORM` and the sign depends on `SIGN` together with the sign of the source element.\
/// `NORM` selects the interval the mantissa is normalized to:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
/// `SIGN` selects how the sign of the result is produced:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getmant_ps&expand=2877)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(1, 2)]
pub unsafe fn _mm256_getmant_ps<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    a: __m256,
) -> __m256 {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    let passthrough = _mm256_setzero_ps().as_f32x8();
    // All eight mask bits are set (one per f32 lane); the immediate operand is
    // SIGN shifted left by two, OR-ed with NORM.
    transmute(vgetmantps256(
        a.as_f32x8(),
        (SIGN << 2) | NORM,
        passthrough,
        0b11111111,
    ))
}

/// Extracts the normalized mantissa of each packed single-precision (32-bit) floating-point element in `a` and returns the results under writemask `k` (a lane whose mask bit is clear is copied from `src`). Every selected lane is computed as ±(2^k)*|x.significand|, where the exponent k (unrelated to the mask `k`) depends on the interval chosen through `NORM` and the sign depends on `SIGN` together with the sign of the source element.\
/// `NORM` selects the interval the mantissa is normalized to:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
/// `SIGN` selects how the sign of the result is produced:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getmant_ps&expand=2878)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(3, 4)]
pub unsafe fn _mm256_mask_getmant_ps<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    src: __m256,
    k: __mmask8,
    a: __m256,
) -> __m256 {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    // Lanes whose bit in `k` is clear take their value from `src`.
    let passthrough = src.as_f32x8();
    // Immediate operand: SIGN shifted left by two, OR-ed with NORM.
    transmute(vgetmantps256(a.as_f32x8(), (SIGN << 2) | NORM, passthrough, k))
}

/// Extracts the normalized mantissa of each packed single-precision (32-bit) floating-point element in `a` and returns the results under zeromask `k` (a lane whose mask bit is clear is set to zero). Every selected lane is computed as ±(2^k)*|x.significand|, where the exponent k (unrelated to the mask `k`) depends on the interval chosen through `NORM` and the sign depends on `SIGN` together with the sign of the source element.\
/// `NORM` selects the interval the mantissa is normalized to:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
/// `SIGN` selects how the sign of the result is produced:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getmant_ps&expand=2879)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(2, 3)]
pub unsafe fn _mm256_maskz_getmant_ps<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    k: __mmask8,
    a: __m256,
) -> __m256 {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    // Lanes whose bit in `k` is clear take their value from this all-zero vector.
    let zeroed = _mm256_setzero_ps().as_f32x8();
    // Immediate operand: SIGN shifted left by two, OR-ed with NORM.
    transmute(vgetmantps256(a.as_f32x8(), (SIGN << 2) | NORM, zeroed, k))
}

/// Extracts the normalized mantissa of each packed single-precision (32-bit) floating-point element in `a` and returns the results. Every lane is computed as ±(2^k)*|x.significand|, where the exponent k depends on the interval chosen through `NORM` and the sign depends on `SIGN` together with the sign of the source element.\
/// `NORM` selects the interval the mantissa is normalized to:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
/// `SIGN` selects how the sign of the result is produced:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_ps&expand=2874)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(1, 2)]
pub unsafe fn _mm_getmant_ps<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    a: __m128,
) -> __m128 {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    let passthrough = _mm_setzero_ps().as_f32x4();
    // The low four mask bits are set (one per f32 lane); the immediate operand
    // is SIGN shifted left by two, OR-ed with NORM.
    transmute(vgetmantps128(
        a.as_f32x4(),
        (SIGN << 2) | NORM,
        passthrough,
        0b00001111,
    ))
}

/// Extracts the normalized mantissa of each packed single-precision (32-bit) floating-point element in `a` and returns the results under writemask `k` (a lane whose mask bit is clear is copied from `src`). Every selected lane is computed as ±(2^k)*|x.significand|, where the exponent k (unrelated to the mask `k`) depends on the interval chosen through `NORM` and the sign depends on `SIGN` together with the sign of the source element.\
/// `NORM` selects the interval the mantissa is normalized to:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
/// `SIGN` selects how the sign of the result is produced:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_ps&expand=2875)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(3, 4)]
pub unsafe fn _mm_mask_getmant_ps<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    src: __m128,
    k: __mmask8,
    a: __m128,
) -> __m128 {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    // Lanes whose bit in `k` is clear take their value from `src`.
    let passthrough = src.as_f32x4();
    // Immediate operand: SIGN shifted left by two, OR-ed with NORM.
    transmute(vgetmantps128(a.as_f32x4(), (SIGN << 2) | NORM, passthrough, k))
}

/// Extracts the normalized mantissa of each packed single-precision (32-bit) floating-point element in `a` and returns the results under zeromask `k` (a lane whose mask bit is clear is set to zero). Every selected lane is computed as ±(2^k)*|x.significand|, where the exponent k (unrelated to the mask `k`) depends on the interval chosen through `NORM` and the sign depends on `SIGN` together with the sign of the source element.\
/// `NORM` selects the interval the mantissa is normalized to:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
/// `SIGN` selects how the sign of the result is produced:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_ps&expand=2876)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(2, 3)]
pub unsafe fn _mm_maskz_getmant_ps<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    k: __mmask8,
    a: __m128,
) -> __m128 {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    // Lanes whose bit in `k` is clear take their value from this all-zero vector.
    let zeroed = _mm_setzero_ps().as_f32x4();
    // Immediate operand: SIGN shifted left by two, OR-ed with NORM.
    transmute(vgetmantps128(a.as_f32x4(), (SIGN << 2) | NORM, zeroed, k))
}

/// Extracts the normalized mantissa of each packed double-precision (64-bit) floating-point element in `a` and returns the results. Every lane is computed as ±(2^k)*|x.significand|, where the exponent k depends on the interval chosen through `NORM` and the sign depends on `SIGN` together with the sign of the source element.\
/// `NORM` selects the interval the mantissa is normalized to:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
/// `SIGN` selects how the sign of the result is produced:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_pd&expand=2871)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(1, 2)]
pub unsafe fn _mm512_getmant_pd<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    a: __m512d,
) -> __m512d {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    let passthrough = _mm512_setzero_pd().as_f64x8();
    // All eight mask bits are set (one per f64 lane); the immediate operand is
    // SIGN shifted left by two, OR-ed with NORM.
    transmute(vgetmantpd(
        a.as_f64x8(),
        (SIGN << 2) | NORM,
        passthrough,
        0b11111111,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Extracts the normalized mantissa of each packed double-precision (64-bit) floating-point element in `a` and returns the results under writemask `k` (a lane whose mask bit is clear is copied from `src`). Every selected lane is computed as ±(2^k)*|x.significand|, where the exponent k (unrelated to the mask `k`) depends on the interval chosen through `NORM` and the sign depends on `SIGN` together with the sign of the source element.\
/// `NORM` selects the interval the mantissa is normalized to:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
/// `SIGN` selects how the sign of the result is produced:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_pd&expand=2872)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(3, 4)]
pub unsafe fn _mm512_mask_getmant_pd<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    src: __m512d,
    k: __mmask8,
    a: __m512d,
) -> __m512d {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    // Lanes whose bit in `k` is clear take their value from `src`.
    let passthrough = src.as_f64x8();
    // Immediate operand: SIGN shifted left by two, OR-ed with NORM.
    transmute(vgetmantpd(
        a.as_f64x8(),
        (SIGN << 2) | NORM,
        passthrough,
        k,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Extracts the normalized mantissa of each packed double-precision (64-bit) floating-point element in `a` and returns the results under zeromask `k` (a lane whose mask bit is clear is set to zero). Every selected lane is computed as ±(2^k)*|x.significand|, where the exponent k (unrelated to the mask `k`) depends on the interval chosen through `NORM` and the sign depends on `SIGN` together with the sign of the source element.\
/// `NORM` selects the interval the mantissa is normalized to:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
/// `SIGN` selects how the sign of the result is produced:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_pd&expand=2873)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(2, 3)]
pub unsafe fn _mm512_maskz_getmant_pd<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    k: __mmask8,
    a: __m512d,
) -> __m512d {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    // Lanes whose bit in `k` is clear take their value from this all-zero vector.
    let zeroed = _mm512_setzero_pd().as_f64x8();
    // Immediate operand: SIGN shifted left by two, OR-ed with NORM.
    transmute(vgetmantpd(
        a.as_f64x8(),
        (SIGN << 2) | NORM,
        zeroed,
        k,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Extracts the normalized mantissa of each packed double-precision (64-bit) floating-point element in `a` and returns the results. Every lane is computed as ±(2^k)*|x.significand|, where the exponent k depends on the interval chosen through `NORM` and the sign depends on `SIGN` together with the sign of the source element.\
/// `NORM` selects the interval the mantissa is normalized to:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
/// `SIGN` selects how the sign of the result is produced:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_getmant_pd&expand=2868)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(1, 2)]
pub unsafe fn _mm256_getmant_pd<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    a: __m256d,
) -> __m256d {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    let passthrough = _mm256_setzero_pd().as_f64x4();
    // The low four mask bits are set (one per f64 lane); the immediate operand
    // is SIGN shifted left by two, OR-ed with NORM.
    transmute(vgetmantpd256(
        a.as_f64x4(),
        (SIGN << 2) | NORM,
        passthrough,
        0b00001111,
    ))
}

/// Extracts the normalized mantissa of each packed double-precision (64-bit) floating-point element in `a` and returns the results under writemask `k` (a lane whose mask bit is clear is copied from `src`). Every selected lane is computed as ±(2^k)*|x.significand|, where the exponent k (unrelated to the mask `k`) depends on the interval chosen through `NORM` and the sign depends on `SIGN` together with the sign of the source element.\
/// `NORM` selects the interval the mantissa is normalized to:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
/// `SIGN` selects how the sign of the result is produced:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_getmant_pd&expand=2869)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(3, 4)]
pub unsafe fn _mm256_mask_getmant_pd<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    src: __m256d,
    k: __mmask8,
    a: __m256d,
) -> __m256d {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    // Lanes whose bit in `k` is clear take their value from `src`.
    let passthrough = src.as_f64x4();
    // Immediate operand: SIGN shifted left by two, OR-ed with NORM.
    transmute(vgetmantpd256(a.as_f64x4(), (SIGN << 2) | NORM, passthrough, k))
}

/// Extracts the normalized mantissa of each packed double-precision (64-bit) floating-point element in `a` and returns the results under zeromask `k` (a lane whose mask bit is clear is set to zero). Every selected lane is computed as ±(2^k)*|x.significand|, where the exponent k (unrelated to the mask `k`) depends on the interval chosen through `NORM` and the sign depends on `SIGN` together with the sign of the source element.\
/// `NORM` selects the interval the mantissa is normalized to:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
/// `SIGN` selects how the sign of the result is produced:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_getmant_pd&expand=2870)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(2, 3)]
pub unsafe fn _mm256_maskz_getmant_pd<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    k: __mmask8,
    a: __m256d,
) -> __m256d {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    // Lanes whose bit in `k` is clear take their value from this all-zero vector.
    let zeroed = _mm256_setzero_pd().as_f64x4();
    // Immediate operand: SIGN shifted left by two, OR-ed with NORM.
    transmute(vgetmantpd256(a.as_f64x4(), (SIGN << 2) | NORM, zeroed, k))
}

/// Extracts the normalized mantissa of each packed double-precision (64-bit) floating-point element in `a` and returns the results. Every lane is computed as ±(2^k)*|x.significand|, where the exponent k depends on the interval chosen through `NORM` and the sign depends on `SIGN` together with the sign of the source element.\
/// `NORM` selects the interval the mantissa is normalized to:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
/// `SIGN` selects how the sign of the result is produced:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_getmant_pd&expand=2865)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(1, 2)]
pub unsafe fn _mm_getmant_pd<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    a: __m128d,
) -> __m128d {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    let passthrough = _mm_setzero_pd().as_f64x2();
    // The low two mask bits are set (one per f64 lane); the immediate operand
    // is SIGN shifted left by two, OR-ed with NORM.
    transmute(vgetmantpd128(
        a.as_f64x2(),
        (SIGN << 2) | NORM,
        passthrough,
        0b00000011,
    ))
}

/// Extracts the normalized mantissa of each packed double-precision (64-bit) floating-point element in `a` and returns the results under writemask `k` (a lane whose mask bit is clear is copied from `src`). Every selected lane is computed as ±(2^k)*|x.significand|, where the exponent k (unrelated to the mask `k`) depends on the interval chosen through `NORM` and the sign depends on `SIGN` together with the sign of the source element.\
/// `NORM` selects the interval the mantissa is normalized to:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
/// `SIGN` selects how the sign of the result is produced:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_getmant_pd&expand=2866)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(3, 4)]
pub unsafe fn _mm_mask_getmant_pd<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    src: __m128d,
    k: __mmask8,
    a: __m128d,
) -> __m128d {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    // Lanes whose bit in `k` is clear take their value from `src`.
    let passthrough = src.as_f64x2();
    // Immediate operand: SIGN shifted left by two, OR-ed with NORM.
    transmute(vgetmantpd128(a.as_f64x2(), (SIGN << 2) | NORM, passthrough, k))
}

/// Extracts the normalized mantissa of each packed double-precision (64-bit) floating-point element in `a` and returns the results under zeromask `k` (a lane whose mask bit is clear is set to zero). Every selected lane is computed as ±(2^k)*|x.significand|, where the exponent k (unrelated to the mask `k`) depends on the interval chosen through `NORM` and the sign depends on `SIGN` together with the sign of the source element.\
/// `NORM` selects the interval the mantissa is normalized to:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
/// `SIGN` selects how the sign of the result is produced:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_getmant_pd&expand=2867)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(2, 3)]
pub unsafe fn _mm_maskz_getmant_pd<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    k: __mmask8,
    a: __m128d,
) -> __m128d {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    // Lanes whose bit in `k` is clear take their value from this all-zero vector.
    let zeroed = _mm_setzero_pd().as_f64x2();
    // Immediate operand: SIGN shifted left by two, OR-ed with NORM.
    transmute(vgetmantpd128(a.as_f64x2(), (SIGN << 2) | NORM, zeroed, k))
}

/// Computes the lane-wise sum of the packed single-precision (32-bit) floating-point elements of `a` and `b` and returns the results.
///
/// The rounding mode is chosen by the `ROUNDING` parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_round_ps&expand=145)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddps, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_add_round_ps<const ROUNDING: i32>(a: __m512, b: __m512) -> __m512 {
    static_assert_rounding!(ROUNDING);
    transmute(vaddps(a.as_f32x16(), b.as_f32x16(), ROUNDING))
}

/// Computes the lane-wise sum of the packed single-precision (32-bit) floating-point elements of `a` and `b` and returns the results under writemask `k` (a lane whose mask bit is clear is copied from `src`).
///
/// The rounding mode is chosen by the `ROUNDING` parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_round_ps&expand=146)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddps, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_add_round_ps<const ROUNDING: i32>(
    src: __m512,
    k: __mmask16,
    a: __m512,
    b: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    let sum = vaddps(a.as_f32x16(), b.as_f32x16(), ROUNDING);
    // The sum is computed for every lane; the mask then picks, per lane,
    // between the sum and the matching element of `src`.
    let fallback = src.as_f32x16();
    transmute(simd_select_bitmask(k, sum, fallback))
}

/// Computes the lane-wise sum of the packed single-precision (32-bit) floating-point elements of `a` and `b` and returns the results under zeromask `k` (a lane whose mask bit is clear is set to zero).
///
/// The rounding mode is chosen by the `ROUNDING` parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_round_ps&expand=147)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddps, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_add_round_ps<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512,
    b: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    let sum = vaddps(a.as_f32x16(), b.as_f32x16(), ROUNDING);
    // The sum is computed for every lane; the mask then picks, per lane,
    // between the sum and zero.
    let zeroed = _mm512_setzero_ps().as_f32x16();
    transmute(simd_select_bitmask(k, sum, zeroed))
}

/// Computes the lane-wise sum of the packed double-precision (64-bit) floating-point elements of `a` and `b` and returns the results.
///
/// The rounding mode is chosen by the `ROUNDING` parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_add_round_pd&expand=142)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddpd, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_add_round_pd<const ROUNDING: i32>(a: __m512d, b: __m512d) -> __m512d {
    static_assert_rounding!(ROUNDING);
    transmute(vaddpd(a.as_f64x8(), b.as_f64x8(), ROUNDING))
}

/// Computes the lane-wise sum of the packed double-precision (64-bit) floating-point elements of `a` and `b` and returns the results under writemask `k` (a lane whose mask bit is clear is copied from `src`).
///
/// The rounding mode is chosen by the `ROUNDING` parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_add_round_pd&expand=143)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddpd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_add_round_pd<const ROUNDING: i32>(
    src: __m512d,
    k: __mmask8,
    a: __m512d,
    b: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    let sum = vaddpd(a.as_f64x8(), b.as_f64x8(), ROUNDING);
    // The sum is computed for every lane; the mask then picks, per lane,
    // between the sum and the matching element of `src`.
    let fallback = src.as_f64x8();
    transmute(simd_select_bitmask(k, sum, fallback))
}

/// Computes the lane-wise sum of the packed double-precision (64-bit) floating-point elements of `a` and `b` and returns the results under zeromask `k` (a lane whose mask bit is clear is set to zero).
///
/// The rounding mode is chosen by the `ROUNDING` parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_add_round_pd&expand=144)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddpd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_add_round_pd<const ROUNDING: i32>(
    k: __mmask8,
    a: __m512d,
    b: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    let sum = vaddpd(a.as_f64x8(), b.as_f64x8(), ROUNDING);
    // The sum is computed for every lane; the mask then picks, per lane,
    // between the sum and zero.
    let zeroed = _mm512_setzero_pd().as_f64x8();
    transmute(simd_select_bitmask(k, sum, zeroed))
}

/// Subtracts, lane by lane, the packed single-precision (32-bit) floating-point elements of `b` from those of `a` and returns the results.
///
/// The rounding mode is chosen by the `ROUNDING` parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_round_ps&expand=5739)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubps, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_sub_round_ps<const ROUNDING: i32>(a: __m512, b: __m512) -> __m512 {
    static_assert_rounding!(ROUNDING);
    // Operand order matters: `a` is the minuend, `b` the subtrahend.
    transmute(vsubps(a.as_f32x16(), b.as_f32x16(), ROUNDING))
}

/// Subtracts the packed single-precision (32-bit) floating-point elements of `b` from the corresponding elements of `a`, storing the differences in dst under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// Rounding follows the `ROUNDING` const parameter (rounding\[3:0\]), which can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_ps&expand=5737)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubps, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_sub_round_ps<const ROUNDING: i32>(
    src: __m512,
    k: __mmask16,
    a: __m512,
    b: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    let diff = vsubps(a.as_f32x16(), b.as_f32x16(), ROUNDING);
    // Lanes whose bit in `k` is clear keep the corresponding element of `src`.
    transmute(simd_select_bitmask(k, diff, src.as_f32x16()))
}

/// Subtracts the packed single-precision (32-bit) floating-point elements of `b` from the corresponding elements of `a`, storing the differences in dst under zeromask `k` (an element is zeroed out when its mask bit is not set).
///
/// Rounding follows the `ROUNDING` const parameter (rounding\[3:0\]), which can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_ps&expand=5738)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubps, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_sub_round_ps<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512,
    b: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    let diff = vsubps(a.as_f32x16(), b.as_f32x16(), ROUNDING);
    // Lanes whose bit in `k` is clear are replaced by zero.
    transmute(simd_select_bitmask(k, diff, _mm512_setzero_ps().as_f32x16()))
}

/// Subtracts the packed double-precision (64-bit) floating-point elements of `b` from the corresponding elements of `a`, and stores the differences in dst.
///
/// Rounding follows the `ROUNDING` const parameter (rounding\[3:0\]), which can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sub_round_pd&expand=5736)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubpd, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_sub_round_pd<const ROUNDING: i32>(a: __m512d, b: __m512d) -> __m512d {
    static_assert_rounding!(ROUNDING);
    transmute(vsubpd(a.as_f64x8(), b.as_f64x8(), ROUNDING))
}

/// Subtracts the packed double-precision (64-bit) floating-point elements of `b` from the corresponding elements of `a`, storing the differences in dst under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// Rounding follows the `ROUNDING` const parameter (rounding\[3:0\]), which can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sub_round_pd&expand=5734)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubpd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_sub_round_pd<const ROUNDING: i32>(
    src: __m512d,
    k: __mmask8,
    a: __m512d,
    b: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    let diff = vsubpd(a.as_f64x8(), b.as_f64x8(), ROUNDING);
    // Lanes whose bit in `k` is clear keep the corresponding element of `src`.
    transmute(simd_select_bitmask(k, diff, src.as_f64x8()))
}

/// Subtracts the packed double-precision (64-bit) floating-point elements of `b` from the corresponding elements of `a`, storing the differences in dst under zeromask `k` (an element is zeroed out when its mask bit is not set).
///
/// Rounding follows the `ROUNDING` const parameter (rounding\[3:0\]), which can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sub_round_pd&expand=5735)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubpd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_sub_round_pd<const ROUNDING: i32>(
    k: __mmask8,
    a: __m512d,
    b: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    let diff = vsubpd(a.as_f64x8(), b.as_f64x8(), ROUNDING);
    // Lanes whose bit in `k` is clear are replaced by zero.
    transmute(simd_select_bitmask(k, diff, _mm512_setzero_pd().as_f64x8()))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, and stores the products in dst.
///
/// Rounding follows the `ROUNDING` const parameter (rounding\[3:0\]), which can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_ps&expand=3940)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulps, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_mul_round_ps<const ROUNDING: i32>(a: __m512, b: __m512) -> __m512 {
    static_assert_rounding!(ROUNDING);
    transmute(vmulps(a.as_f32x16(), b.as_f32x16(), ROUNDING))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, storing the products in dst under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// Rounding follows the `ROUNDING` const parameter (rounding\[3:0\]), which can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_ps&expand=3938)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulps, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_mul_round_ps<const ROUNDING: i32>(
    src: __m512,
    k: __mmask16,
    a: __m512,
    b: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    let product = vmulps(a.as_f32x16(), b.as_f32x16(), ROUNDING);
    // Lanes whose bit in `k` is clear keep the corresponding element of `src`.
    transmute(simd_select_bitmask(k, product, src.as_f32x16()))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, storing the products in dst under zeromask `k` (an element is zeroed out when its mask bit is not set).
///
/// Rounding follows the `ROUNDING` const parameter (rounding\[3:0\]), which can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_ps&expand=3939)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulps, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_mul_round_ps<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512,
    b: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    let product = vmulps(a.as_f32x16(), b.as_f32x16(), ROUNDING);
    // Lanes whose bit in `k` is clear are replaced by zero.
    transmute(simd_select_bitmask(k, product, _mm512_setzero_ps().as_f32x16()))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, and stores the products in dst.
///
/// Rounding follows the `ROUNDING` const parameter (rounding\[3:0\]), which can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mul_round_pd&expand=3937)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulpd, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_mul_round_pd<const ROUNDING: i32>(a: __m512d, b: __m512d) -> __m512d {
    static_assert_rounding!(ROUNDING);
    transmute(vmulpd(a.as_f64x8(), b.as_f64x8(), ROUNDING))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, storing the products in dst under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// Rounding follows the `ROUNDING` const parameter (rounding\[3:0\]), which can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_mul_round_pd&expand=3935)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulpd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_mul_round_pd<const ROUNDING: i32>(
    src: __m512d,
    k: __mmask8,
    a: __m512d,
    b: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    let product = vmulpd(a.as_f64x8(), b.as_f64x8(), ROUNDING);
    // Lanes whose bit in `k` is clear keep the corresponding element of `src`.
    transmute(simd_select_bitmask(k, product, src.as_f64x8()))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_mul_round_pd&expand=3936)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulpd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_mul_round_pd<const ROUNDING: i32>(
    k: __mmask8,
    a: __m512d,
    b: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    let a = a.as_f64x8();
    let b = b.as_f64x8();
    let r = vmulpd(a, b, ROUNDING);
    // Lanes whose bit in `k` is clear are replaced by zero.
    let zero = _mm512_setzero_pd().as_f64x8();
    transmute(simd_select_bitmask(k, r, zero))
}

/// Divides the packed single-precision (32-bit) floating-point elements of `a` by the corresponding packed elements of `b`, and stores the quotients in dst.
///
/// Rounding follows the `ROUNDING` const parameter (rounding\[3:0\]), which can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_round_ps&expand=2168)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivps, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_div_round_ps<const ROUNDING: i32>(a: __m512, b: __m512) -> __m512 {
    static_assert_rounding!(ROUNDING);
    transmute(vdivps(a.as_f32x16(), b.as_f32x16(), ROUNDING))
}

/// Divides the packed single-precision (32-bit) floating-point elements of `a` by the corresponding packed elements of `b`, storing the quotients in dst under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// Rounding follows the `ROUNDING` const parameter (rounding\[3:0\]), which can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_round_ps&expand=2169)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivps, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_div_round_ps<const ROUNDING: i32>(
    src: __m512,
    k: __mmask16,
    a: __m512,
    b: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    let quotient = vdivps(a.as_f32x16(), b.as_f32x16(), ROUNDING);
    // Lanes whose bit in `k` is clear keep the corresponding element of `src`.
    transmute(simd_select_bitmask(k, quotient, src.as_f32x16()))
}

/// Divides the packed single-precision (32-bit) floating-point elements of `a` by the corresponding packed elements of `b`, storing the quotients in dst under zeromask `k` (an element is zeroed out when its mask bit is not set).
///
/// Rounding follows the `ROUNDING` const parameter (rounding\[3:0\]), which can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_round_ps&expand=2170)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivps, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_div_round_ps<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512,
    b: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    let quotient = vdivps(a.as_f32x16(), b.as_f32x16(), ROUNDING);
    // Lanes whose bit in `k` is clear are replaced by zero.
    transmute(simd_select_bitmask(k, quotient, _mm512_setzero_ps().as_f32x16()))
}

/// Divide packed double-precision (64-bit) floating-point elements in a by packed elements in b, and store the results in dst.\
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_div_round_pd&expand=2165)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivpd, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_div_round_pd<const ROUNDING: i32>(a: __m512d, b: __m512d) -> __m512d {
    static_assert_rounding!(ROUNDING);
    let a = a.as_f64x8();
    let b = b.as_f64x8();
    let r = vdivpd(a, b, ROUNDING);
    transmute(r)
}

/// Divides the packed double-precision (64-bit) floating-point elements of `a` by the corresponding packed elements of `b`, storing the quotients in dst under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// Rounding follows the `ROUNDING` const parameter (rounding\[3:0\]), which can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_div_round_pd&expand=2166)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivpd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_div_round_pd<const ROUNDING: i32>(
    src: __m512d,
    k: __mmask8,
    a: __m512d,
    b: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    let quotient = vdivpd(a.as_f64x8(), b.as_f64x8(), ROUNDING);
    // Lanes whose bit in `k` is clear keep the corresponding element of `src`.
    transmute(simd_select_bitmask(k, quotient, src.as_f64x8()))
}

/// Divides the packed double-precision (64-bit) floating-point elements of `a` by the corresponding packed elements of `b`, storing the quotients in dst under zeromask `k` (an element is zeroed out when its mask bit is not set).
///
/// Rounding follows the `ROUNDING` const parameter (rounding\[3:0\]), which can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_div_round_pd&expand=2167)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivpd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_div_round_pd<const ROUNDING: i32>(
    k: __mmask8,
    a: __m512d,
    b: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    let quotient = vdivpd(a.as_f64x8(), b.as_f64x8(), ROUNDING);
    // Lanes whose bit in `k` is clear are replaced by zero.
    transmute(simd_select_bitmask(k, quotient, _mm512_setzero_pd().as_f64x8()))
}

/// Computes the square root of each packed single-precision (32-bit) floating-point element of `a`, and stores the results in dst.
///
/// Rounding follows the `ROUNDING` const parameter (rounding\[3:0\]), which can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_round_ps&expand=5377)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsqrtps, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_sqrt_round_ps<const ROUNDING: i32>(a: __m512) -> __m512 {
    static_assert_rounding!(ROUNDING);
    transmute(vsqrtps(a.as_f32x16(), ROUNDING))
}

/// Computes the square root of each packed single-precision (32-bit) floating-point element of `a`, storing the results in dst under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// Rounding follows the `ROUNDING` const parameter (rounding\[3:0\]), which can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_round_ps&expand=5375)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsqrtps, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_sqrt_round_ps<const ROUNDING: i32>(
    src: __m512,
    k: __mmask16,
    a: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    let root = vsqrtps(a.as_f32x16(), ROUNDING);
    // Lanes whose bit in `k` is clear keep the corresponding element of `src`.
    transmute(simd_select_bitmask(k, root, src.as_f32x16()))
}

/// Computes the square root of each packed single-precision (32-bit) floating-point element of `a`, storing the results in dst under zeromask `k` (an element is zeroed out when its mask bit is not set).
///
/// Rounding follows the `ROUNDING` const parameter (rounding\[3:0\]), which can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_round_ps&expand=5376)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsqrtps, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_sqrt_round_ps<const ROUNDING: i32>(k: __mmask16, a: __m512) -> __m512 {
    static_assert_rounding!(ROUNDING);
    let root = vsqrtps(a.as_f32x16(), ROUNDING);
    // Lanes whose bit in `k` is clear are replaced by zero.
    transmute(simd_select_bitmask(k, root, _mm512_setzero_ps().as_f32x16()))
}

/// Computes the square root of each packed double-precision (64-bit) floating-point element of `a`, and stores the results in dst.
///
/// Rounding follows the `ROUNDING` const parameter (rounding\[3:0\]), which can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sqrt_round_pd&expand=5374)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsqrtpd, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_sqrt_round_pd<const ROUNDING: i32>(a: __m512d) -> __m512d {
    static_assert_rounding!(ROUNDING);
    transmute(vsqrtpd(a.as_f64x8(), ROUNDING))
}

/// Computes the square root of each packed double-precision (64-bit) floating-point element of `a`, storing the results in dst under writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// Rounding follows the `ROUNDING` const parameter (rounding\[3:0\]), which can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sqrt_round_pd&expand=5372)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsqrtpd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_sqrt_round_pd<const ROUNDING: i32>(
    src: __m512d,
    k: __mmask8,
    a: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    let root = vsqrtpd(a.as_f64x8(), ROUNDING);
    // Lanes whose bit in `k` is clear keep the corresponding element of `src`.
    transmute(simd_select_bitmask(k, root, src.as_f64x8()))
}

/// Computes the square root of each packed double-precision (64-bit) floating-point element of `a`, storing the results in dst under zeromask `k` (an element is zeroed out when its mask bit is not set).
///
/// Rounding follows the `ROUNDING` const parameter (rounding\[3:0\]), which can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sqrt_round_pd&expand=5373)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsqrtpd, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_sqrt_round_pd<const ROUNDING: i32>(k: __mmask8, a: __m512d) -> __m512d {
    static_assert_rounding!(ROUNDING);
    let root = vsqrtpd(a.as_f64x8(), ROUNDING);
    // Lanes whose bit in `k` is clear are replaced by zero.
    transmute(simd_select_bitmask(k, root, _mm512_setzero_pd().as_f64x8()))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, adds the intermediate result to the packed elements of `c`, and stores the results in dst.
///
/// Rounding follows the `ROUNDING` const parameter (rounding\[3:0\]), which can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_ps&expand=2565)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_fmadd_round_ps<const ROUNDING: i32>(
    a: __m512,
    b: __m512,
    c: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    let fused = vfmadd132psround(a.as_f32x16(), b.as_f32x16(), c.as_f32x16(), ROUNDING);
    transmute(fused)
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, adds the intermediate result to the packed elements of `c`, and stores the results in dst under writemask `k` (an element is copied from `a` when its mask bit is not set).
///
/// Rounding follows the `ROUNDING` const parameter (rounding\[3:0\]), which can be one of:
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use MXCSR.RC; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_ps&expand=2566)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_fmadd_round_ps<const ROUNDING: i32>(
    a: __m512,
    k: __mmask16,
    b: __m512,
    c: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    // `a` is both the first multiplicand and the fallback for masked-off lanes.
    let multiplicand = a.as_f32x16();
    let fused = vfmadd132psround(multiplicand, b.as_f32x16(), c.as_f32x16(), ROUNDING);
    transmute(simd_select_bitmask(k, fused, multiplicand))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, adds the packed elements of `c` to the intermediate result, and stores the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).\
///
/// The rounding mode is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_ps&expand=2568)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_maskz_fmadd_round_ps<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512,
    b: __m512,
    c: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    let fused = vfmadd132psround(a.as_f32x16(), b.as_f32x16(), c.as_f32x16(), ROUNDING);
    // Masked-off lanes are replaced by zero.
    transmute(simd_select_bitmask(
        k,
        fused,
        _mm512_setzero_ps().as_f32x16(),
    ))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, adds the packed elements of `c` to the intermediate result, and stores the results in dst using writemask `k` (elements are copied from `c` when the corresponding mask bit is not set).\
///
/// The rounding mode is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_ps&expand=2567)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132ps or vfmadd213ps or vfmadd231ps
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask3_fmadd_round_ps<const ROUNDING: i32>(
    a: __m512,
    b: __m512,
    c: __m512,
    k: __mmask16,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    // `c` is both the addend and the fallback for masked-off lanes.
    let addend = c.as_f32x16();
    let fused = vfmadd132psround(a.as_f32x16(), b.as_f32x16(), addend, ROUNDING);
    transmute(simd_select_bitmask(k, fused, addend))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, adds the packed elements of `c` to the intermediate result, and stores the results in dst.\
///
/// The rounding mode is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmadd_round_pd&expand=2561)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_fmadd_round_pd<const ROUNDING: i32>(
    a: __m512d,
    b: __m512d,
    c: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    // View each operand as eight f64 lanes and hand them straight to the intrinsic.
    transmute(vfmadd132pdround(
        a.as_f64x8(),
        b.as_f64x8(),
        c.as_f64x8(),
        ROUNDING,
    ))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, adds the packed elements of `c` to the intermediate result, and stores the results in dst using writemask `k` (elements are copied from `a` when the corresponding mask bit is not set).\
///
/// The rounding mode is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmadd_round_pd&expand=2562)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_fmadd_round_pd<const ROUNDING: i32>(
    a: __m512d,
    k: __mmask8,
    b: __m512d,
    c: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    // `a` is both the first multiplicand and the fallback for masked-off lanes.
    let src = a.as_f64x8();
    let fused = vfmadd132pdround(src, b.as_f64x8(), c.as_f64x8(), ROUNDING);
    transmute(simd_select_bitmask(k, fused, src))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, adds the packed elements of `c` to the intermediate result, and stores the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).\
///
/// The rounding mode is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmadd_round_pd&expand=2564)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_maskz_fmadd_round_pd<const ROUNDING: i32>(
    k: __mmask8,
    a: __m512d,
    b: __m512d,
    c: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    let fused = vfmadd132pdround(a.as_f64x8(), b.as_f64x8(), c.as_f64x8(), ROUNDING);
    // Masked-off lanes are replaced by zero.
    transmute(simd_select_bitmask(
        k,
        fused,
        _mm512_setzero_pd().as_f64x8(),
    ))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, adds the packed elements of `c` to the intermediate result, and stores the results in dst using writemask `k` (elements are copied from `c` when the corresponding mask bit is not set).\
///
/// The rounding mode is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmadd_round_pd&expand=2563)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmadd132pd or vfmadd213pd or vfmadd231pd
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask3_fmadd_round_pd<const ROUNDING: i32>(
    a: __m512d,
    b: __m512d,
    c: __m512d,
    k: __mmask8,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    // `c` is both the addend and the fallback for masked-off lanes.
    let addend = c.as_f64x8();
    let fused = vfmadd132pdround(a.as_f64x8(), b.as_f64x8(), addend, ROUNDING);
    transmute(simd_select_bitmask(k, fused, addend))
}

/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.\
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_round_ps&expand=2651)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_fmsub_round_ps<const ROUNDING: i32>(
    a: __m512,
    b: __m512,
    c: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    let zero: f32x16 = mem::zeroed();
    let sub = simd_sub(zero, c.as_f32x16());
    let a = a.as_f32x16();
    let b = b.as_f32x16();
    let r = vfmadd132psround(a, b, sub, ROUNDING);
    transmute(r)
}

/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_round_ps&expand=2652)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_fmsub_round_ps<const ROUNDING: i32>(
    a: __m512,
    k: __mmask16,
    b: __m512,
    c: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    let zero: f32x16 = mem::zeroed();
    let sub = simd_sub(zero, c.as_f32x16());
    let a = a.as_f32x16();
    let b = b.as_f32x16();
    let r = vfmadd132psround(a, b, sub, ROUNDING);
    transmute(simd_select_bitmask(k, r, a))
}

/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// The subtraction is implemented as a multiply-add with the sign of every element of `c` flipped.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_round_ps&expand=2654)
#[inline]
#[target_feature(enable = "avx512f")]
// NOTE(review): with `c` negated exactly, the negation is expected to fold into
// vfmsub132ps/vfmsub213ps/vfmsub231ps; confirm against the disassembly.
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_maskz_fmsub_round_ps<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512,
    b: __m512,
    c: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    // all-0 is a properly initialized f32x16; used for the masked-off lanes
    let zero: f32x16 = mem::zeroed();
    // Negate `c` by subtracting it from -0.0. Subtracting from +0.0 is not a
    // negation: `0.0 - 0.0` is `+0.0`, so a `c` of `+0.0` would stay positive and
    // `a * b - c` would yield `+0.0` instead of `-0.0` when the product is `-0.0`.
    let neg_zero: f32x16 = transmute([-0.0f32; 16]);
    let neg_c = simd_sub(neg_zero, c.as_f32x16());
    let a = a.as_f32x16();
    let b = b.as_f32x16();
    let r = vfmadd132psround(a, b, neg_c, ROUNDING);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Multiply packed single-precision (32-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_round_ps&expand=2653)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132ps or vfmsub213ps or vfmsub231ps, clang generates vfmadd, gcc generates vfmsub
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask3_fmsub_round_ps<const ROUNDING: i32>(
    a: __m512,
    b: __m512,
    c: __m512,
    k: __mmask16,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    let zero: f32x16 = mem::zeroed();
    let c = c.as_f32x16();
    let sub = simd_sub(zero, c);
    let a = a.as_f32x16();
    let b = b.as_f32x16();
    let r = vfmadd132psround(a, b, sub, ROUNDING);
    transmute(simd_select_bitmask(k, r, c))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst.\
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsub_round_pd&expand=2647)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_fmsub_round_pd<const ROUNDING: i32>(
    a: __m512d,
    b: __m512d,
    c: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    let zero: f64x8 = mem::zeroed();
    let sub = simd_sub(zero, c.as_f64x8());
    let a = a.as_f64x8();
    let b = b.as_f64x8();
    let r = vfmadd132pdround(a, b, sub, ROUNDING);
    transmute(r)
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsub_round_pd&expand=2648)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_fmsub_round_pd<const ROUNDING: i32>(
    a: __m512d,
    k: __mmask8,
    b: __m512d,
    c: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    let zero: f64x8 = mem::zeroed();
    let sub = simd_sub(zero, c.as_f64x8());
    let a = a.as_f64x8();
    let b = b.as_f64x8();
    let r = vfmadd132pdround(a, b, sub, ROUNDING);
    transmute(simd_select_bitmask(k, r, a))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// The subtraction is implemented as a multiply-add with the sign of every element of `c` flipped.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsub_round_pd&expand=2650)
#[inline]
#[target_feature(enable = "avx512f")]
// NOTE(review): with `c` negated exactly, the negation is expected to fold into
// vfmsub132pd/vfmsub213pd/vfmsub231pd; confirm against the disassembly.
#[cfg_attr(test, assert_instr(vfmsub, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_maskz_fmsub_round_pd<const ROUNDING: i32>(
    k: __mmask8,
    a: __m512d,
    b: __m512d,
    c: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    // all-0 is a properly initialized f64x8; used for the masked-off lanes
    let zero: f64x8 = mem::zeroed();
    // Negate `c` by subtracting it from -0.0. Subtracting from +0.0 is not a
    // negation: `0.0 - 0.0` is `+0.0`, so a `c` of `+0.0` would stay positive and
    // `a * b - c` would yield `+0.0` instead of `-0.0` when the product is `-0.0`.
    let neg_zero: f64x8 = transmute([-0.0f64; 8]);
    let neg_c = simd_sub(neg_zero, c.as_f64x8());
    let a = a.as_f64x8();
    let b = b.as_f64x8();
    let r = vfmadd132pdround(a, b, neg_c, ROUNDING);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsub_round_pd&expand=2649)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfmsub132pd or vfmsub213pd or vfmsub231pd. clang generates fmadd, gcc generates fmsub
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask3_fmsub_round_pd<const ROUNDING: i32>(
    a: __m512d,
    b: __m512d,
    c: __m512d,
    k: __mmask8,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    let zero: f64x8 = mem::zeroed();
    let c = c.as_f64x8();
    let sub = simd_sub(zero, c);
    let a = a.as_f64x8();
    let b = b.as_f64x8();
    let r = vfmadd132pdround(a, b, sub, ROUNDING);
    transmute(simd_select_bitmask(k, r, c))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, alternatively adds and subtracts the packed elements of `c` to/from the intermediate result, and stores the results in dst.\
///
/// The rounding mode is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_round_ps&expand=2619)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_fmaddsub_round_ps<const ROUNDING: i32>(
    a: __m512,
    b: __m512,
    c: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    // View each operand as sixteen f32 lanes and hand them straight to the intrinsic.
    transmute(vfmaddsub213ps(
        a.as_f32x16(),
        b.as_f32x16(),
        c.as_f32x16(),
        ROUNDING,
    ))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, alternatively adds and subtracts the packed elements of `c` to/from the intermediate result, and stores the results in dst using writemask `k` (elements are copied from `a` when the corresponding mask bit is not set).\
///
/// The rounding mode is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_round_ps&expand=2620)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_fmaddsub_round_ps<const ROUNDING: i32>(
    a: __m512,
    k: __mmask16,
    b: __m512,
    c: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    // `a` is both the first multiplicand and the fallback for masked-off lanes.
    let src = a.as_f32x16();
    let fused = vfmaddsub213ps(src, b.as_f32x16(), c.as_f32x16(), ROUNDING);
    transmute(simd_select_bitmask(k, fused, src))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, alternatively adds and subtracts the packed elements of `c` to/from the intermediate result, and stores the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).\
///
/// The rounding mode is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_round_ps&expand=2622)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_maskz_fmaddsub_round_ps<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512,
    b: __m512,
    c: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    let fused = vfmaddsub213ps(a.as_f32x16(), b.as_f32x16(), c.as_f32x16(), ROUNDING);
    // Masked-off lanes are replaced by zero.
    transmute(simd_select_bitmask(
        k,
        fused,
        _mm512_setzero_ps().as_f32x16(),
    ))
}

/// Multiplies the packed single-precision (32-bit) floating-point elements of `a` and `b`, alternatively adds and subtracts the packed elements of `c` to/from the intermediate result, and stores the results in dst using writemask `k` (elements are copied from `c` when the corresponding mask bit is not set).\
///
/// The rounding mode is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_round_ps&expand=2621)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132ps or vfmaddsub213ps or vfmaddsub231ps
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask3_fmaddsub_round_ps<const ROUNDING: i32>(
    a: __m512,
    b: __m512,
    c: __m512,
    k: __mmask16,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    // `c` is both the third operand and the fallback for masked-off lanes.
    let addend = c.as_f32x16();
    let fused = vfmaddsub213ps(a.as_f32x16(), b.as_f32x16(), addend, ROUNDING);
    transmute(simd_select_bitmask(k, fused, addend))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, alternatively adds and subtracts the packed elements of `c` to/from the intermediate result, and stores the results in dst.\
///
/// The rounding mode is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmaddsub_round_pd&expand=2615)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_fmaddsub_round_pd<const ROUNDING: i32>(
    a: __m512d,
    b: __m512d,
    c: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    // View each operand as eight f64 lanes and hand them straight to the intrinsic.
    transmute(vfmaddsub213pd(
        a.as_f64x8(),
        b.as_f64x8(),
        c.as_f64x8(),
        ROUNDING,
    ))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, alternatively adds and subtracts the packed elements of `c` to/from the intermediate result, and stores the results in dst using writemask `k` (elements are copied from `a` when the corresponding mask bit is not set).\
///
/// The rounding mode is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmaddsub_round_pd&expand=2616)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_fmaddsub_round_pd<const ROUNDING: i32>(
    a: __m512d,
    k: __mmask8,
    b: __m512d,
    c: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    // `a` is both the first multiplicand and the fallback for masked-off lanes.
    let src = a.as_f64x8();
    let fused = vfmaddsub213pd(src, b.as_f64x8(), c.as_f64x8(), ROUNDING);
    transmute(simd_select_bitmask(k, fused, src))
}

/// Multiplies the packed double-precision (64-bit) floating-point elements of `a` and `b`, alternatively adds and subtracts the packed elements of `c` to/from the intermediate result, and stores the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).\
///
/// The rounding mode is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmaddsub_round_pd&expand=2618)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] //vfmaddsub132pd or vfmaddsub213pd or vfmaddsub231pd
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_maskz_fmaddsub_round_pd<const ROUNDING: i32>(
    k: __mmask8,
    a: __m512d,
    b: __m512d,
    c: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    let fused = vfmaddsub213pd(a.as_f64x8(), b.as_f64x8(), c.as_f64x8(), ROUNDING);
    // Masked-off lanes are replaced by zero.
    transmute(simd_select_bitmask(
        k,
        fused,
        _mm512_setzero_pd().as_f64x8(),
    ))
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a` and `b`,
/// alternately adds and subtracts packed elements in `c` to/from the intermediate result, and
/// stores the results in `dst` using writemask `k` (elements are copied from `c` when the
/// corresponding mask bit is not set).
///
/// `ROUNDING` supplies the rounding\[3:0\] parameter and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmaddsub_round_pd&expand=2617)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] // vfmaddsub132pd, vfmaddsub213pd or vfmaddsub231pd
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask3_fmaddsub_round_pd<const ROUNDING: i32>(
    a: __m512d,
    b: __m512d,
    c: __m512d,
    k: __mmask8,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    let a = a.as_f64x8();
    let b = b.as_f64x8();
    let c = c.as_f64x8();
    let r = vfmaddsub213pd(a, b, c, ROUNDING);
    // `mask3` variant: lanes whose mask bit is clear keep the value from `c`.
    transmute(simd_select_bitmask(k, r, c))
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and `b`,
/// alternately subtracts and adds packed elements in `c` from/to the intermediate result, and
/// stores the results in `dst`.
///
/// `ROUNDING` supplies the rounding\[3:0\] parameter and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_round_ps&expand=2699)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] // vfmsubadd132ps, vfmsubadd213ps or vfmsubadd231ps
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_fmsubadd_round_ps<const ROUNDING: i32>(
    a: __m512,
    b: __m512,
    c: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    // fmsubadd(a, b, c) is evaluated as fmaddsub(a, b, 0.0 - c).
    // NOTE(review): `0.0 - c` is `+0.0` for `c == +0.0`, whereas true negation gives `-0.0`;
    // confirm whether signed-zero results matter here.
    let neg_c = simd_sub(mem::zeroed::<f32x16>(), c.as_f32x16());
    transmute(vfmaddsub213ps(a.as_f32x16(), b.as_f32x16(), neg_c, ROUNDING))
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and `b`,
/// alternately subtracts and adds packed elements in `c` from/to the intermediate result, and
/// stores the results in `dst` using writemask `k` (elements are copied from `a` when the
/// corresponding mask bit is not set).
///
/// `ROUNDING` supplies the rounding\[3:0\] parameter and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_round_ps&expand=2700)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] // vfmsubadd132ps, vfmsubadd213ps or vfmsubadd231ps
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_fmsubadd_round_ps<const ROUNDING: i32>(
    a: __m512,
    k: __mmask16,
    b: __m512,
    c: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    // `a` is both the first multiplicand and the source for lanes whose mask bit is clear.
    let fallback = a.as_f32x16();
    // fmsubadd(a, b, c) is evaluated as fmaddsub(a, b, 0.0 - c).
    // NOTE(review): `0.0 - c` is `+0.0` for `c == +0.0`, whereas true negation gives `-0.0`;
    // confirm whether signed-zero results matter here.
    let neg_c = simd_sub(mem::zeroed::<f32x16>(), c.as_f32x16());
    let computed = vfmaddsub213ps(fallback, b.as_f32x16(), neg_c, ROUNDING);
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and `b`,
/// alternately subtracts and adds packed elements in `c` from/to the intermediate result, and
/// stores the results in `dst` using zeromask `k` (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// `ROUNDING` supplies the rounding\[3:0\] parameter and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_round_ps&expand=2702)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] // vfmsubadd132ps, vfmsubadd213ps or vfmsubadd231ps
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_maskz_fmsubadd_round_ps<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512,
    b: __m512,
    c: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    // The all-zero vector serves both as the minuend for `0.0 - c` and as the masked-out value.
    let zeroed: f32x16 = mem::zeroed();
    // fmsubadd(a, b, c) is evaluated as fmaddsub(a, b, 0.0 - c).
    // NOTE(review): `0.0 - c` is `+0.0` for `c == +0.0`, whereas true negation gives `-0.0`;
    // confirm whether signed-zero results matter here.
    let neg_c = simd_sub(zeroed, c.as_f32x16());
    let computed = vfmaddsub213ps(a.as_f32x16(), b.as_f32x16(), neg_c, ROUNDING);
    transmute(simd_select_bitmask(k, computed, zeroed))
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and `b`,
/// alternately subtracts and adds packed elements in `c` from/to the intermediate result, and
/// stores the results in `dst` using writemask `k` (elements are copied from `c` when the
/// corresponding mask bit is not set).
///
/// `ROUNDING` supplies the rounding\[3:0\] parameter and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_round_ps&expand=2701)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] // vfmsubadd132ps, vfmsubadd213ps or vfmsubadd231ps
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask3_fmsubadd_round_ps<const ROUNDING: i32>(
    a: __m512,
    b: __m512,
    c: __m512,
    k: __mmask16,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    // The un-negated `c` is kept as the source for lanes whose mask bit is clear.
    let fallback = c.as_f32x16();
    // fmsubadd(a, b, c) is evaluated as fmaddsub(a, b, 0.0 - c).
    // NOTE(review): `0.0 - c` is `+0.0` for `c == +0.0`, whereas true negation gives `-0.0`;
    // confirm whether signed-zero results matter here.
    let neg_c = simd_sub(mem::zeroed::<f32x16>(), fallback);
    let computed = vfmaddsub213ps(a.as_f32x16(), b.as_f32x16(), neg_c, ROUNDING);
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a` and `b`,
/// alternately subtracts and adds packed elements in `c` from/to the intermediate result, and
/// stores the results in `dst`.
///
/// `ROUNDING` supplies the rounding\[3:0\] parameter and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fmsubadd_round_pd&expand=2695)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] // vfmsubadd132pd, vfmsubadd213pd or vfmsubadd231pd
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_fmsubadd_round_pd<const ROUNDING: i32>(
    a: __m512d,
    b: __m512d,
    c: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    // fmsubadd(a, b, c) is evaluated as fmaddsub(a, b, 0.0 - c).
    // NOTE(review): `0.0 - c` is `+0.0` for `c == +0.0`, whereas true negation gives `-0.0`;
    // confirm whether signed-zero results matter here.
    let neg_c = simd_sub(mem::zeroed::<f64x8>(), c.as_f64x8());
    transmute(vfmaddsub213pd(a.as_f64x8(), b.as_f64x8(), neg_c, ROUNDING))
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a` and `b`,
/// alternately subtracts and adds packed elements in `c` from/to the intermediate result, and
/// stores the results in `dst` using writemask `k` (elements are copied from `a` when the
/// corresponding mask bit is not set).
///
/// `ROUNDING` supplies the rounding\[3:0\] parameter and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fmsubadd_round_pd&expand=2696)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] // vfmsubadd132pd, vfmsubadd213pd or vfmsubadd231pd
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_fmsubadd_round_pd<const ROUNDING: i32>(
    a: __m512d,
    k: __mmask8,
    b: __m512d,
    c: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    // `a` is both the first multiplicand and the source for lanes whose mask bit is clear.
    let fallback = a.as_f64x8();
    // fmsubadd(a, b, c) is evaluated as fmaddsub(a, b, 0.0 - c).
    // NOTE(review): `0.0 - c` is `+0.0` for `c == +0.0`, whereas true negation gives `-0.0`;
    // confirm whether signed-zero results matter here.
    let neg_c = simd_sub(mem::zeroed::<f64x8>(), c.as_f64x8());
    let computed = vfmaddsub213pd(fallback, b.as_f64x8(), neg_c, ROUNDING);
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a` and `b`,
/// alternately subtracts and adds packed elements in `c` from/to the intermediate result, and
/// stores the results in `dst` using zeromask `k` (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// `ROUNDING` supplies the rounding\[3:0\] parameter and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fmsubadd_round_pd&expand=2698)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] // vfmsubadd132pd, vfmsubadd213pd or vfmsubadd231pd
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_maskz_fmsubadd_round_pd<const ROUNDING: i32>(
    k: __mmask8,
    a: __m512d,
    b: __m512d,
    c: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    // The all-zero vector serves both as the minuend for `0.0 - c` and as the masked-out value.
    let zero: f64x8 = mem::zeroed();
    // fmsubadd(a, b, c) is evaluated as fmaddsub(a, b, 0.0 - c).
    // NOTE(review): `0.0 - c` is `+0.0` for `c == +0.0`, whereas true negation gives `-0.0`;
    // confirm whether signed-zero results matter here.
    let sub = simd_sub(zero, c.as_f64x8());
    let a = a.as_f64x8();
    let b = b.as_f64x8();
    let r = vfmaddsub213pd(a, b, sub, ROUNDING);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a` and `b`,
/// alternately subtracts and adds packed elements in `c` from/to the intermediate result, and
/// stores the results in `dst` using writemask `k` (elements are copied from `c` when the
/// corresponding mask bit is not set).
///
/// `ROUNDING` supplies the rounding\[3:0\] parameter and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fmsubadd_round_pd&expand=2697)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmaddsub, ROUNDING = 8))] // vfmsubadd132pd, vfmsubadd213pd or vfmsubadd231pd
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask3_fmsubadd_round_pd<const ROUNDING: i32>(
    a: __m512d,
    b: __m512d,
    c: __m512d,
    k: __mmask8,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    // The un-negated `c` is kept as the source for lanes whose mask bit is clear.
    let fallback = c.as_f64x8();
    // fmsubadd(a, b, c) is evaluated as fmaddsub(a, b, 0.0 - c).
    // NOTE(review): `0.0 - c` is `+0.0` for `c == +0.0`, whereas true negation gives `-0.0`;
    // confirm whether signed-zero results matter here.
    let neg_c = simd_sub(mem::zeroed::<f64x8>(), fallback);
    let computed = vfmaddsub213pd(a.as_f64x8(), b.as_f64x8(), neg_c, ROUNDING);
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and `b`, adds the
/// negated intermediate result to packed elements in `c`, and stores the results in `dst`.
///
/// `ROUNDING` supplies the rounding\[3:0\] parameter and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_round_ps&expand=2731)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] // vfnmadd132ps, vfnmadd213ps or vfnmadd231ps
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_fnmadd_round_ps<const ROUNDING: i32>(
    a: __m512,
    b: __m512,
    c: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    // -(a * b) + c is evaluated as fmadd(0.0 - a, b, c).
    // NOTE(review): `0.0 - a` is `+0.0` for `a == +0.0`, whereas true negation gives `-0.0`;
    // confirm whether signed-zero results matter here.
    let neg_a = simd_sub(mem::zeroed::<f32x16>(), a.as_f32x16());
    transmute(vfmadd132psround(neg_a, b.as_f32x16(), c.as_f32x16(), ROUNDING))
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and `b`, adds the
/// negated intermediate result to packed elements in `c`, and stores the results in `dst` using
/// writemask `k` (elements are copied from `a` when the corresponding mask bit is not set).
///
/// `ROUNDING` supplies the rounding\[3:0\] parameter and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_round_ps&expand=2732)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] // vfnmadd132ps, vfnmadd213ps or vfnmadd231ps
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_fnmadd_round_ps<const ROUNDING: i32>(
    a: __m512,
    k: __mmask16,
    b: __m512,
    c: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    // The un-negated `a` is kept as the source for lanes whose mask bit is clear.
    let fallback = a.as_f32x16();
    // -(a * b) + c is evaluated as fmadd(0.0 - a, b, c).
    // NOTE(review): `0.0 - a` is `+0.0` for `a == +0.0`, whereas true negation gives `-0.0`;
    // confirm whether signed-zero results matter here.
    let neg_a = simd_sub(mem::zeroed::<f32x16>(), fallback);
    let computed = vfmadd132psround(neg_a, b.as_f32x16(), c.as_f32x16(), ROUNDING);
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and `b`, adds the
/// negated intermediate result to packed elements in `c`, and stores the results in `dst` using
/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// `ROUNDING` supplies the rounding\[3:0\] parameter and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_round_ps&expand=2734)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] // vfnmadd132ps, vfnmadd213ps or vfnmadd231ps
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_maskz_fnmadd_round_ps<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512,
    b: __m512,
    c: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    // The all-zero vector serves both as the minuend for `0.0 - a` and as the masked-out value.
    let zeroed: f32x16 = mem::zeroed();
    // -(a * b) + c is evaluated as fmadd(0.0 - a, b, c).
    // NOTE(review): `0.0 - a` is `+0.0` for `a == +0.0`, whereas true negation gives `-0.0`;
    // confirm whether signed-zero results matter here.
    let neg_a = simd_sub(zeroed, a.as_f32x16());
    let computed = vfmadd132psround(neg_a, b.as_f32x16(), c.as_f32x16(), ROUNDING);
    transmute(simd_select_bitmask(k, computed, zeroed))
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and `b`, adds the
/// negated intermediate result to packed elements in `c`, and stores the results in `dst` using
/// writemask `k` (elements are copied from `c` when the corresponding mask bit is not set).
///
/// `ROUNDING` supplies the rounding\[3:0\] parameter and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_round_ps&expand=2733)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] // vfnmadd132ps, vfnmadd213ps or vfnmadd231ps
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask3_fnmadd_round_ps<const ROUNDING: i32>(
    a: __m512,
    b: __m512,
    c: __m512,
    k: __mmask16,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    // `c` is both the addend and the source for lanes whose mask bit is clear.
    let fallback = c.as_f32x16();
    // -(a * b) + c is evaluated as fmadd(0.0 - a, b, c).
    // NOTE(review): `0.0 - a` is `+0.0` for `a == +0.0`, whereas true negation gives `-0.0`;
    // confirm whether signed-zero results matter here.
    let neg_a = simd_sub(mem::zeroed::<f32x16>(), a.as_f32x16());
    let computed = vfmadd132psround(neg_a, b.as_f32x16(), fallback, ROUNDING);
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a` and `b`, adds the
/// negated intermediate result to packed elements in `c`, and stores the results in `dst`.
///
/// `ROUNDING` supplies the rounding\[3:0\] parameter and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmadd_round_pd&expand=2727)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] // vfnmadd132pd, vfnmadd213pd or vfnmadd231pd
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_fnmadd_round_pd<const ROUNDING: i32>(
    a: __m512d,
    b: __m512d,
    c: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    let zero: f64x8 = mem::zeroed();
    // -(a * b) + c is evaluated as fmadd(0.0 - a, b, c).
    // NOTE(review): `0.0 - a` is `+0.0` for `a == +0.0`, whereas true negation gives `-0.0`;
    // confirm whether signed-zero results matter here.
    let sub = simd_sub(zero, a.as_f64x8());
    let b = b.as_f64x8();
    let c = c.as_f64x8();
    let r = vfmadd132pdround(sub, b, c, ROUNDING);
    transmute(r)
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a` and `b`, adds the
/// negated intermediate result to packed elements in `c`, and stores the results in `dst` using
/// writemask `k` (elements are copied from `a` when the corresponding mask bit is not set).
///
/// `ROUNDING` supplies the rounding\[3:0\] parameter and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmadd_round_pd&expand=2728)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] // vfnmadd132pd, vfnmadd213pd or vfnmadd231pd
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_fnmadd_round_pd<const ROUNDING: i32>(
    a: __m512d,
    k: __mmask8,
    b: __m512d,
    c: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    // The un-negated `a` is kept as the source for lanes whose mask bit is clear.
    let fallback = a.as_f64x8();
    // -(a * b) + c is evaluated as fmadd(0.0 - a, b, c).
    // NOTE(review): `0.0 - a` is `+0.0` for `a == +0.0`, whereas true negation gives `-0.0`;
    // confirm whether signed-zero results matter here.
    let neg_a = simd_sub(mem::zeroed::<f64x8>(), fallback);
    let computed = vfmadd132pdround(neg_a, b.as_f64x8(), c.as_f64x8(), ROUNDING);
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a` and `b`, adds the
/// negated intermediate result to packed elements in `c`, and stores the results in `dst` using
/// zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// `ROUNDING` supplies the rounding\[3:0\] parameter and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmadd_round_pd&expand=2730)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] // vfnmadd132pd, vfnmadd213pd or vfnmadd231pd
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_maskz_fnmadd_round_pd<const ROUNDING: i32>(
    k: __mmask8,
    a: __m512d,
    b: __m512d,
    c: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    // The all-zero vector serves both as the minuend for `0.0 - a` and as the masked-out value.
    let zeroed: f64x8 = mem::zeroed();
    // -(a * b) + c is evaluated as fmadd(0.0 - a, b, c).
    // NOTE(review): `0.0 - a` is `+0.0` for `a == +0.0`, whereas true negation gives `-0.0`;
    // confirm whether signed-zero results matter here.
    let neg_a = simd_sub(zeroed, a.as_f64x8());
    let computed = vfmadd132pdround(neg_a, b.as_f64x8(), c.as_f64x8(), ROUNDING);
    transmute(simd_select_bitmask(k, computed, zeroed))
}

/// Multiplies packed double-precision (64-bit) floating-point elements in `a` and `b`, adds the
/// negated intermediate result to packed elements in `c`, and stores the results in `dst` using
/// writemask `k` (elements are copied from `c` when the corresponding mask bit is not set).
///
/// `ROUNDING` supplies the rounding\[3:0\] parameter and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmadd_round_pd&expand=2729)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] // vfnmadd132pd, vfnmadd213pd or vfnmadd231pd
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask3_fnmadd_round_pd<const ROUNDING: i32>(
    a: __m512d,
    b: __m512d,
    c: __m512d,
    k: __mmask8,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    // `c` is both the addend and the source for lanes whose mask bit is clear.
    let fallback = c.as_f64x8();
    // -(a * b) + c is evaluated as fmadd(0.0 - a, b, c).
    // NOTE(review): `0.0 - a` is `+0.0` for `a == +0.0`, whereas true negation gives `-0.0`;
    // confirm whether signed-zero results matter here.
    let neg_a = simd_sub(mem::zeroed::<f64x8>(), a.as_f64x8());
    let computed = vfmadd132pdround(neg_a, b.as_f64x8(), fallback, ROUNDING);
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and `b`, subtracts
/// packed elements in `c` from the negated intermediate result, and stores the results in `dst`.
///
/// `ROUNDING` supplies the rounding\[3:0\] parameter and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_round_ps&expand=2779)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] // vfnmsub132ps, vfnmsub213ps or vfnmsub231ps
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_fnmsub_round_ps<const ROUNDING: i32>(
    a: __m512,
    b: __m512,
    c: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    let origin: f32x16 = mem::zeroed();
    // -(a * b) - c is evaluated as fmadd(0.0 - a, b, 0.0 - c).
    // NOTE(review): `0.0 - x` is `+0.0` for `x == +0.0`, whereas true negation gives `-0.0`;
    // confirm whether signed-zero results matter here.
    let neg_a = simd_sub(origin, a.as_f32x16());
    let neg_c = simd_sub(origin, c.as_f32x16());
    transmute(vfmadd132psround(neg_a, b.as_f32x16(), neg_c, ROUNDING))
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and `b`, subtracts
/// packed elements in `c` from the negated intermediate result, and stores the results in `dst`
/// using writemask `k` (elements are copied from `a` when the corresponding mask bit is not
/// set).
///
/// `ROUNDING` supplies the rounding\[3:0\] parameter and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_round_ps&expand=2780)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] // vfnmsub132ps, vfnmsub213ps or vfnmsub231ps
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_fnmsub_round_ps<const ROUNDING: i32>(
    a: __m512,
    k: __mmask16,
    b: __m512,
    c: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    let origin: f32x16 = mem::zeroed();
    // The un-negated `a` is kept as the source for lanes whose mask bit is clear.
    let fallback = a.as_f32x16();
    // -(a * b) - c is evaluated as fmadd(0.0 - a, b, 0.0 - c).
    // NOTE(review): `0.0 - x` is `+0.0` for `x == +0.0`, whereas true negation gives `-0.0`;
    // confirm whether signed-zero results matter here.
    let neg_a = simd_sub(origin, fallback);
    let neg_c = simd_sub(origin, c.as_f32x16());
    let computed = vfmadd132psround(neg_a, b.as_f32x16(), neg_c, ROUNDING);
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and `b`, subtracts
/// packed elements in `c` from the negated intermediate result, and stores the results in `dst`
/// using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// `ROUNDING` supplies the rounding\[3:0\] parameter and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_round_ps&expand=2782)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] // vfnmsub132ps, vfnmsub213ps or vfnmsub231ps
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_maskz_fnmsub_round_ps<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512,
    b: __m512,
    c: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    // The all-zero vector serves both as the minuend for the negations and as the masked-out
    // value.
    let zeroed: f32x16 = mem::zeroed();
    // -(a * b) - c is evaluated as fmadd(0.0 - a, b, 0.0 - c).
    // NOTE(review): `0.0 - x` is `+0.0` for `x == +0.0`, whereas true negation gives `-0.0`;
    // confirm whether signed-zero results matter here.
    let neg_a = simd_sub(zeroed, a.as_f32x16());
    let neg_c = simd_sub(zeroed, c.as_f32x16());
    let computed = vfmadd132psround(neg_a, b.as_f32x16(), neg_c, ROUNDING);
    transmute(simd_select_bitmask(k, computed, zeroed))
}

/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and `b`, subtracts
/// packed elements in `c` from the negated intermediate result, and stores the results in `dst`
/// using writemask `k` (elements are copied from `c` when the corresponding mask bit is not
/// set).
///
/// `ROUNDING` supplies the rounding\[3:0\] parameter and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_round_ps&expand=2781)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] // vfnmsub132ps, vfnmsub213ps or vfnmsub231ps
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask3_fnmsub_round_ps<const ROUNDING: i32>(
    a: __m512,
    b: __m512,
    c: __m512,
    k: __mmask16,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    let origin: f32x16 = mem::zeroed();
    // The un-negated `c` is kept as the source for lanes whose mask bit is clear.
    let fallback = c.as_f32x16();
    // -(a * b) - c is evaluated as fmadd(0.0 - a, b, 0.0 - c).
    // NOTE(review): `0.0 - x` is `+0.0` for `x == +0.0`, whereas true negation gives `-0.0`;
    // confirm whether signed-zero results matter here.
    let neg_a = simd_sub(origin, a.as_f32x16());
    let neg_c = simd_sub(origin, fallback);
    let computed = vfmadd132psround(neg_a, b.as_f32x16(), neg_c, ROUNDING);
    transmute(simd_select_bitmask(k, computed, fallback))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst.\
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fnmsub_round_pd&expand=2775)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_fnmsub_round_pd<const ROUNDING: i32>(
    a: __m512d,
    b: __m512d,
    c: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    let zero: f64x8 = mem::zeroed();
    let suba = simd_sub(zero, a.as_f64x8());
    let subc = simd_sub(zero, c.as_f64x8());
    let b = b.as_f64x8();
    let r = vfmadd132pdround(suba, b, subc, ROUNDING);
    transmute(r)
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from a when the corresponding mask bit is not set).\
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fnmsub_round_pd&expand=2776)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_fnmsub_round_pd<const ROUNDING: i32>(
    a: __m512d,
    k: __mmask8,
    b: __m512d,
    c: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    let zero: f64x8 = mem::zeroed();
    let a = a.as_f64x8();
    let suba = simd_sub(zero, a);
    let subc = simd_sub(zero, c.as_f64x8());
    let b = b.as_f64x8();
    let r = vfmadd132pdround(suba, b, subc, ROUNDING);
    transmute(simd_select_bitmask(k, r, a))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fnmsub_round_pd&expand=2778)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_maskz_fnmsub_round_pd<const ROUNDING: i32>(
    k: __mmask8,
    a: __m512d,
    b: __m512d,
    c: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    let zero: f64x8 = mem::zeroed();
    let suba = simd_sub(zero, a.as_f64x8());
    let subc = simd_sub(zero, c.as_f64x8());
    let b = b.as_f64x8();
    let r = vfmadd132pdround(suba, b, subc, ROUNDING);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Multiply packed double-precision (64-bit) floating-point elements in a and b, subtract packed elements in c from the negated intermediate result, and store the results in dst using writemask k (elements are copied from c when the corresponding mask bit is not set).\
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask3_fnmsub_round_pd&expand=2777)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd, ROUNDING = 8))] //vfnmsub132pd or vfnmsub213pd or vfnmsub231pd
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask3_fnmsub_round_pd<const ROUNDING: i32>(
    a: __m512d,
    b: __m512d,
    c: __m512d,
    k: __mmask8,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    let zero: f64x8 = mem::zeroed();
    let suba = simd_sub(zero, a.as_f64x8());
    let c = c.as_f64x8();
    let subc = simd_sub(zero, c);
    let b = b.as_f64x8();
    let r = vfmadd132pdround(suba, b, subc, ROUNDING);
    transmute(simd_select_bitmask(k, r, c))
}

/// Compares the packed single-precision (32-bit) floating-point elements of `a` and `b` and returns the packed maximum values.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_round_ps&expand=3662)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxps, SAE = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_max_round_ps<const SAE: i32>(a: __m512, b: __m512) -> __m512 {
    static_assert_sae!(SAE);
    transmute(vmaxps(a.as_f32x16(), b.as_f32x16(), SAE))
}

/// Compares the packed single-precision (32-bit) floating-point elements of `a` and `b` and returns the packed maximum values using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_round_ps&expand=3660)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxps, SAE = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_max_round_ps<const SAE: i32>(
    src: __m512,
    k: __mmask16,
    a: __m512,
    b: __m512,
) -> __m512 {
    static_assert_sae!(SAE);
    let max = vmaxps(a.as_f32x16(), b.as_f32x16(), SAE);
    // Masked-off lanes fall back to `src`.
    let fallback = src.as_f32x16();
    transmute(simd_select_bitmask(k, max, fallback))
}

/// Compares the packed single-precision (32-bit) floating-point elements of `a` and `b` and returns the packed maximum values using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_round_ps&expand=3661)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxps, SAE = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_max_round_ps<const SAE: i32>(
    k: __mmask16,
    a: __m512,
    b: __m512,
) -> __m512 {
    static_assert_sae!(SAE);
    let max = vmaxps(a.as_f32x16(), b.as_f32x16(), SAE);
    // Masked-off lanes are zeroed.
    transmute(simd_select_bitmask(k, max, _mm512_setzero_ps().as_f32x16()))
}

/// Compares the packed double-precision (64-bit) floating-point elements of `a` and `b` and returns the packed maximum values.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_max_round_pd&expand=3659)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxpd, SAE = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_max_round_pd<const SAE: i32>(a: __m512d, b: __m512d) -> __m512d {
    static_assert_sae!(SAE);
    transmute(vmaxpd(a.as_f64x8(), b.as_f64x8(), SAE))
}

/// Compares the packed double-precision (64-bit) floating-point elements of `a` and `b` and returns the packed maximum values using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_max_round_pd&expand=3657)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxpd, SAE = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_max_round_pd<const SAE: i32>(
    src: __m512d,
    k: __mmask8,
    a: __m512d,
    b: __m512d,
) -> __m512d {
    static_assert_sae!(SAE);
    let max = vmaxpd(a.as_f64x8(), b.as_f64x8(), SAE);
    // Masked-off lanes fall back to `src`.
    let fallback = src.as_f64x8();
    transmute(simd_select_bitmask(k, max, fallback))
}

/// Compares the packed double-precision (64-bit) floating-point elements of `a` and `b` and returns the packed maximum values using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_max_round_pd&expand=3658)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxpd, SAE = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_max_round_pd<const SAE: i32>(
    k: __mmask8,
    a: __m512d,
    b: __m512d,
) -> __m512d {
    static_assert_sae!(SAE);
    let max = vmaxpd(a.as_f64x8(), b.as_f64x8(), SAE);
    // Masked-off lanes are zeroed.
    transmute(simd_select_bitmask(k, max, _mm512_setzero_pd().as_f64x8()))
}

/// Compares the packed single-precision (32-bit) floating-point elements of `a` and `b` and returns the packed minimum values.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_round_ps&expand=3776)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vminps, SAE = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_min_round_ps<const SAE: i32>(a: __m512, b: __m512) -> __m512 {
    static_assert_sae!(SAE);
    transmute(vminps(a.as_f32x16(), b.as_f32x16(), SAE))
}

/// Compares the packed single-precision (32-bit) floating-point elements of `a` and `b` and returns the packed minimum values using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_round_ps&expand=3774)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vminps, SAE = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_min_round_ps<const SAE: i32>(
    src: __m512,
    k: __mmask16,
    a: __m512,
    b: __m512,
) -> __m512 {
    static_assert_sae!(SAE);
    let min = vminps(a.as_f32x16(), b.as_f32x16(), SAE);
    // Masked-off lanes fall back to `src`.
    let fallback = src.as_f32x16();
    transmute(simd_select_bitmask(k, min, fallback))
}

/// Compares the packed single-precision (32-bit) floating-point elements of `a` and `b` and returns the packed minimum values using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_round_ps&expand=3775)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vminps, SAE = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_min_round_ps<const SAE: i32>(
    k: __mmask16,
    a: __m512,
    b: __m512,
) -> __m512 {
    static_assert_sae!(SAE);
    let min = vminps(a.as_f32x16(), b.as_f32x16(), SAE);
    // Masked-off lanes are zeroed.
    transmute(simd_select_bitmask(k, min, _mm512_setzero_ps().as_f32x16()))
}

/// Compares the packed double-precision (64-bit) floating-point elements of `a` and `b` and returns the packed minimum values.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_min_round_pd&expand=3773)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vminpd, SAE = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_min_round_pd<const SAE: i32>(a: __m512d, b: __m512d) -> __m512d {
    static_assert_sae!(SAE);
    transmute(vminpd(a.as_f64x8(), b.as_f64x8(), SAE))
}

/// Compares the packed double-precision (64-bit) floating-point elements of `a` and `b` and returns the packed minimum values using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_min_round_pd&expand=3771)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vminpd, SAE = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_min_round_pd<const SAE: i32>(
    src: __m512d,
    k: __mmask8,
    a: __m512d,
    b: __m512d,
) -> __m512d {
    static_assert_sae!(SAE);
    let min = vminpd(a.as_f64x8(), b.as_f64x8(), SAE);
    // Masked-off lanes fall back to `src`.
    let fallback = src.as_f64x8();
    transmute(simd_select_bitmask(k, min, fallback))
}

/// Compares the packed double-precision (64-bit) floating-point elements of `a` and `b` and returns the packed minimum values using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_min_round_pd&expand=3772)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vminpd, SAE = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_min_round_pd<const SAE: i32>(
    k: __mmask8,
    a: __m512d,
    b: __m512d,
) -> __m512d {
    static_assert_sae!(SAE);
    let min = vminpd(a.as_f64x8(), b.as_f64x8(), SAE);
    // Masked-off lanes are zeroed.
    transmute(simd_select_bitmask(k, min, _mm512_setzero_pd().as_f64x8()))
}

/// Converts the exponent of each packed single-precision (32-bit) floating-point element in `a` to a single-precision (32-bit) floating-point number representing the integer exponent. This intrinsic essentially calculates `floor(log2(x))` for each element.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_round_ps&expand=2850)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetexpps, SAE = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_getexp_round_ps<const SAE: i32>(a: __m512) -> __m512 {
    static_assert_sae!(SAE);
    // All 16 mask bits are set, so the zero pass-through vector is never selected.
    let passthrough = _mm512_setzero_ps().as_f32x16();
    transmute(vgetexpps(a.as_f32x16(), passthrough, 0xffff, SAE))
}

/// Converts the exponent of each packed single-precision (32-bit) floating-point element in `a` to a single-precision (32-bit) floating-point number representing the integer exponent, using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). This intrinsic essentially calculates `floor(log2(x))` for each element.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_round_ps&expand=2851)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetexpps, SAE = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_getexp_round_ps<const SAE: i32>(
    src: __m512,
    k: __mmask16,
    a: __m512,
) -> __m512 {
    static_assert_sae!(SAE);
    // The mask and pass-through vector are forwarded to the intrinsic binding.
    transmute(vgetexpps(a.as_f32x16(), src.as_f32x16(), k, SAE))
}

/// Converts the exponent of each packed single-precision (32-bit) floating-point element in `a` to a single-precision (32-bit) floating-point number representing the integer exponent, using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates `floor(log2(x))` for each element.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_round_ps&expand=2852)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetexpps, SAE = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_getexp_round_ps<const SAE: i32>(k: __mmask16, a: __m512) -> __m512 {
    static_assert_sae!(SAE);
    // A zero pass-through vector gives zeromask behaviour.
    let passthrough = _mm512_setzero_ps().as_f32x16();
    transmute(vgetexpps(a.as_f32x16(), passthrough, k, SAE))
}

/// Converts the exponent of each packed double-precision (64-bit) floating-point element in `a` to a double-precision (64-bit) floating-point number representing the integer exponent. This intrinsic essentially calculates `floor(log2(x))` for each element.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getexp_round_pd&expand=2847)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetexppd, SAE = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_getexp_round_pd<const SAE: i32>(a: __m512d) -> __m512d {
    static_assert_sae!(SAE);
    // All 8 mask bits are set, so the zero pass-through vector is never selected.
    let passthrough = _mm512_setzero_pd().as_f64x8();
    transmute(vgetexppd(a.as_f64x8(), passthrough, 0xff, SAE))
}

/// Converts the exponent of each packed double-precision (64-bit) floating-point element in `a` to a double-precision (64-bit) floating-point number representing the integer exponent, using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). This intrinsic essentially calculates `floor(log2(x))` for each element.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getexp_round_pd&expand=2848)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetexppd, SAE = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_getexp_round_pd<const SAE: i32>(
    src: __m512d,
    k: __mmask8,
    a: __m512d,
) -> __m512d {
    static_assert_sae!(SAE);
    // The mask and pass-through vector are forwarded to the intrinsic binding.
    transmute(vgetexppd(a.as_f64x8(), src.as_f64x8(), k, SAE))
}

/// Converts the exponent of each packed double-precision (64-bit) floating-point element in `a` to a double-precision (64-bit) floating-point number representing the integer exponent, using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates `floor(log2(x))` for each element.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getexp_round_pd&expand=2849)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetexppd, SAE = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_getexp_round_pd<const SAE: i32>(k: __mmask8, a: __m512d) -> __m512d {
    static_assert_sae!(SAE);
    // A zero pass-through vector gives zeromask behaviour.
    let passthrough = _mm512_setzero_pd().as_f64x8();
    transmute(vgetexppd(a.as_f64x8(), passthrough, k, SAE))
}

/// Rounds the packed single-precision (32-bit) floating-point elements in `a` to the number of fraction bits specified by `IMM8`.
///
/// Rounding is done according to `IMM8[2:0]`, which can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT` - round to nearest
/// * `_MM_FROUND_TO_NEG_INF` - round down
/// * `_MM_FROUND_TO_POS_INF` - round up
/// * `_MM_FROUND_TO_ZERO` - truncate
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_ps&expand=4790)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(1, 2)]
pub unsafe fn _mm512_roundscale_round_ps<const IMM8: i32, const SAE: i32>(a: __m512) -> __m512 {
    static_assert_imm8!(IMM8);
    static_assert_mantissas_sae!(SAE);
    // All 16 mask bits are set, so the zero pass-through vector is never selected.
    let passthrough = _mm512_setzero_ps().as_f32x16();
    transmute(vrndscaleps(a.as_f32x16(), IMM8, passthrough, 0xffff, SAE))
}

/// Rounds the packed single-precision (32-bit) floating-point elements in `a` to the number of fraction bits specified by `IMM8`, using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// Rounding is done according to `IMM8[2:0]`, which can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT` - round to nearest
/// * `_MM_FROUND_TO_NEG_INF` - round down
/// * `_MM_FROUND_TO_POS_INF` - round up
/// * `_MM_FROUND_TO_ZERO` - truncate
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_ps&expand=4788)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(3, 4)]
pub unsafe fn _mm512_mask_roundscale_round_ps<const IMM8: i32, const SAE: i32>(
    src: __m512,
    k: __mmask16,
    a: __m512,
) -> __m512 {
    static_assert_imm8!(IMM8);
    static_assert_mantissas_sae!(SAE);
    // The mask and pass-through vector are forwarded to the intrinsic binding.
    transmute(vrndscaleps(a.as_f32x16(), IMM8, src.as_f32x16(), k, SAE))
}

/// Rounds the packed single-precision (32-bit) floating-point elements in `a` to the number of fraction bits specified by `IMM8`, using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to `IMM8[2:0]`, which can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT` - round to nearest
/// * `_MM_FROUND_TO_NEG_INF` - round down
/// * `_MM_FROUND_TO_POS_INF` - round up
/// * `_MM_FROUND_TO_ZERO` - truncate
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_ps&expand=4789)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrndscaleps, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(2, 3)]
pub unsafe fn _mm512_maskz_roundscale_round_ps<const IMM8: i32, const SAE: i32>(
    k: __mmask16,
    a: __m512,
) -> __m512 {
    static_assert_imm8!(IMM8);
    static_assert_mantissas_sae!(SAE);
    // A zero pass-through vector gives zeromask behaviour.
    let passthrough = _mm512_setzero_ps().as_f32x16();
    transmute(vrndscaleps(a.as_f32x16(), IMM8, passthrough, k, SAE))
}

/// Rounds the packed double-precision (64-bit) floating-point elements in `a` to the number of fraction bits specified by `IMM8`.
///
/// Rounding is done according to `IMM8[2:0]`, which can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT` - round to nearest
/// * `_MM_FROUND_TO_NEG_INF` - round down
/// * `_MM_FROUND_TO_POS_INF` - round up
/// * `_MM_FROUND_TO_ZERO` - truncate
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_roundscale_round_pd&expand=4787)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(1, 2)]
pub unsafe fn _mm512_roundscale_round_pd<const IMM8: i32, const SAE: i32>(a: __m512d) -> __m512d {
    static_assert_imm8!(IMM8);
    static_assert_mantissas_sae!(SAE);
    // All 8 mask bits are set, so the zero pass-through vector is never selected.
    let passthrough = _mm512_setzero_pd().as_f64x8();
    transmute(vrndscalepd(a.as_f64x8(), IMM8, passthrough, 0xff, SAE))
}

/// Rounds the packed double-precision (64-bit) floating-point elements in `a` to the number of fraction bits specified by `IMM8`, using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// Rounding is done according to `IMM8[2:0]`, which can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT` - round to nearest
/// * `_MM_FROUND_TO_NEG_INF` - round down
/// * `_MM_FROUND_TO_POS_INF` - round up
/// * `_MM_FROUND_TO_ZERO` - truncate
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_roundscale_round_pd&expand=4785)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(3, 4)]
pub unsafe fn _mm512_mask_roundscale_round_pd<const IMM8: i32, const SAE: i32>(
    src: __m512d,
    k: __mmask8,
    a: __m512d,
) -> __m512d {
    static_assert_imm8!(IMM8);
    static_assert_mantissas_sae!(SAE);
    // The mask and pass-through vector are forwarded to the intrinsic binding.
    transmute(vrndscalepd(a.as_f64x8(), IMM8, src.as_f64x8(), k, SAE))
}

/// Rounds the packed double-precision (64-bit) floating-point elements in `a` to the number of fraction bits specified by `IMM8`, using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// Rounding is done according to `IMM8[2:0]`, which can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT` - round to nearest
/// * `_MM_FROUND_TO_NEG_INF` - round down
/// * `_MM_FROUND_TO_POS_INF` - round up
/// * `_MM_FROUND_TO_ZERO` - truncate
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_roundscale_round_pd&expand=4786)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrndscalepd, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(2, 3)]
pub unsafe fn _mm512_maskz_roundscale_round_pd<const IMM8: i32, const SAE: i32>(
    k: __mmask8,
    a: __m512d,
) -> __m512d {
    static_assert_imm8!(IMM8);
    static_assert_mantissas_sae!(SAE);
    // A zero pass-through vector gives zeromask behaviour.
    let passthrough = _mm512_setzero_pd().as_f64x8();
    transmute(vrndscalepd(a.as_f64x8(), IMM8, passthrough, k, SAE))
}

/// Scales the packed single-precision (32-bit) floating-point elements in `a` using values from `b`.
///
/// Rounding is done according to the `ROUNDING` parameter, which can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC` - round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC` - round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC` - round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC` - truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_ps&expand=4889)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscalefps, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_scalef_round_ps<const ROUNDING: i32>(a: __m512, b: __m512) -> __m512 {
    static_assert_rounding!(ROUNDING);
    // All 16 mask bits are set, so the zero pass-through vector is never selected.
    let passthrough = _mm512_setzero_ps().as_f32x16();
    transmute(vscalefps(
        a.as_f32x16(),
        b.as_f32x16(),
        passthrough,
        0xffff,
        ROUNDING,
    ))
}

/// Scales the packed single-precision (32-bit) floating-point elements in `a` using values from `b`, using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// Rounding is done according to the `ROUNDING` parameter, which can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC` - round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC` - round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC` - round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC` - truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_ps&expand=4887)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscalefps, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_scalef_round_ps<const ROUNDING: i32>(
    src: __m512,
    k: __mmask16,
    a: __m512,
    b: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    // The mask and pass-through vector are forwarded to the intrinsic binding.
    transmute(vscalefps(
        a.as_f32x16(),
        b.as_f32x16(),
        src.as_f32x16(),
        k,
        ROUNDING,
    ))
}

/// Scale the packed single-precision (32-bit) floating-point elements in `a` using values from `b`, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// The rounding mode is taken from the `ROUNDING` parameter (rounding\[3:0\]), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_ps&expand=4888)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscalefps, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_scalef_round_ps<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512,
    b: __m512,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    // Zero-masking: a zero vector is supplied as the pass-through operand.
    let zeroed = _mm512_setzero_ps().as_f32x16();
    let scaled = vscalefps(a.as_f32x16(), b.as_f32x16(), zeroed, k, ROUNDING);
    transmute(scaled)
}

/// Scale the packed double-precision (64-bit) floating-point elements in `a` using values from `b`, and store the results in dst.
///
/// The rounding mode is taken from the `ROUNDING` parameter (rounding\[3:0\]), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_scalef_round_pd&expand=4886)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscalefpd, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_scalef_round_pd<const ROUNDING: i32>(a: __m512d, b: __m512d) -> __m512d {
    static_assert_rounding!(ROUNDING);
    // Every mask bit is set, so the zero vector only fills the pass-through operand.
    transmute(vscalefpd(
        a.as_f64x8(),
        b.as_f64x8(),
        _mm512_setzero_pd().as_f64x8(),
        0b11111111,
        ROUNDING,
    ))
}

/// Scale the packed double-precision (64-bit) floating-point elements in `a` using values from `b`, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// The rounding mode is taken from the `ROUNDING` parameter (rounding\[3:0\]), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_scalef_round_pd&expand=4884)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscalefpd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_scalef_round_pd<const ROUNDING: i32>(
    src: __m512d,
    k: __mmask8,
    a: __m512d,
    b: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    let (lhs, rhs) = (a.as_f64x8(), b.as_f64x8());
    // `src` is the pass-through operand for lanes whose bit in `k` is clear.
    let scaled = vscalefpd(lhs, rhs, src.as_f64x8(), k, ROUNDING);
    transmute(scaled)
}

/// Scale the packed double-precision (64-bit) floating-point elements in `a` using values from `b`, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// The rounding mode is taken from the `ROUNDING` parameter (rounding\[3:0\]), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_scalef_round_pd&expand=4885)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscalefpd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_scalef_round_pd<const ROUNDING: i32>(
    k: __mmask8,
    a: __m512d,
    b: __m512d,
) -> __m512d {
    static_assert_rounding!(ROUNDING);
    // Zero-masking: a zero vector is supplied as the pass-through operand.
    let zeroed = _mm512_setzero_pd().as_f64x8();
    let scaled = vscalefpd(a.as_f64x8(), b.as_f64x8(), zeroed, k, ROUNDING);
    transmute(scaled)
}

/// Fix up packed single-precision (32-bit) floating-point elements in `a` and `b` using packed 32-bit integers in `c`, and store the results in dst. `IMM8` is used to set the required flags reporting.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_round_ps&expand=2505)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(3, 4)]
pub unsafe fn _mm512_fixupimm_round_ps<const IMM8: i32, const SAE: i32>(
    a: __m512,
    b: __m512,
    c: __m512i,
) -> __m512 {
    static_assert_imm8!(IMM8);
    static_assert_mantissas_sae!(SAE);
    // No writemask requested: every mask bit is set.
    let fixed = vfixupimmps(
        a.as_f32x16(),
        b.as_f32x16(),
        c.as_i32x16(),
        IMM8,
        0b11111111_11111111,
        SAE,
    );
    transmute(fixed)
}

/// Fix up packed single-precision (32-bit) floating-point elements in `a` and `b` using packed 32-bit integers in `c`, and store the results in dst using writemask `k` (elements are copied from `a` when the corresponding mask bit is not set). `IMM8` is used to set the required flags reporting.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_round_ps&expand=2506)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(4, 5)]
pub unsafe fn _mm512_mask_fixupimm_round_ps<const IMM8: i32, const SAE: i32>(
    a: __m512,
    k: __mmask16,
    b: __m512,
    c: __m512i,
) -> __m512 {
    static_assert_imm8!(IMM8);
    static_assert_mantissas_sae!(SAE);
    let (dst_in, src, table) = (a.as_f32x16(), b.as_f32x16(), c.as_i32x16());
    let fixed = vfixupimmps(dst_in, src, table, IMM8, k, SAE);
    transmute(fixed)
}

/// Fix up packed single-precision (32-bit) floating-point elements in `a` and `b` using packed 32-bit integers in `c`, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). `IMM8` is used to set the required flags reporting.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_round_ps&expand=2507)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfixupimmps, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(4, 5)]
pub unsafe fn _mm512_maskz_fixupimm_round_ps<const IMM8: i32, const SAE: i32>(
    k: __mmask16,
    a: __m512,
    b: __m512,
    c: __m512i,
) -> __m512 {
    static_assert_imm8!(IMM8);
    static_assert_mantissas_sae!(SAE);
    // The zero-masking form goes through the separate `vfixupimmpsz` binding.
    transmute(vfixupimmpsz(
        a.as_f32x16(),
        b.as_f32x16(),
        c.as_i32x16(),
        IMM8,
        k,
        SAE,
    ))
}

/// Fix up packed double-precision (64-bit) floating-point elements in `a` and `b` using packed 64-bit integers in `c`, and store the results in dst. `IMM8` is used to set the required flags reporting.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_fixupimm_round_pd&expand=2502)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(3, 4)]
pub unsafe fn _mm512_fixupimm_round_pd<const IMM8: i32, const SAE: i32>(
    a: __m512d,
    b: __m512d,
    c: __m512i,
) -> __m512d {
    static_assert_imm8!(IMM8);
    static_assert_mantissas_sae!(SAE);
    // No writemask requested: every mask bit is set.
    let fixed = vfixupimmpd(
        a.as_f64x8(),
        b.as_f64x8(),
        c.as_i64x8(),
        IMM8,
        0b11111111,
        SAE,
    );
    transmute(fixed)
}

/// Fix up packed double-precision (64-bit) floating-point elements in `a` and `b` using packed 64-bit integers in `c`, and store the results in dst using writemask `k` (elements are copied from `a` when the corresponding mask bit is not set). `IMM8` is used to set the required flags reporting.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_fixupimm_round_pd&expand=2503)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(4, 5)]
pub unsafe fn _mm512_mask_fixupimm_round_pd<const IMM8: i32, const SAE: i32>(
    a: __m512d,
    k: __mmask8,
    b: __m512d,
    c: __m512i,
) -> __m512d {
    static_assert_imm8!(IMM8);
    static_assert_mantissas_sae!(SAE);
    let (dst_in, src, table) = (a.as_f64x8(), b.as_f64x8(), c.as_i64x8());
    let fixed = vfixupimmpd(dst_in, src, table, IMM8, k, SAE);
    transmute(fixed)
}

/// Fix up packed double-precision (64-bit) floating-point elements in `a` and `b` using packed 64-bit integers in `c`, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). `IMM8` is used to set the required flags reporting.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_fixupimm_round_pd&expand=2504)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfixupimmpd, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(4, 5)]
pub unsafe fn _mm512_maskz_fixupimm_round_pd<const IMM8: i32, const SAE: i32>(
    k: __mmask8,
    a: __m512d,
    b: __m512d,
    c: __m512i,
) -> __m512d {
    static_assert_imm8!(IMM8);
    static_assert_mantissas_sae!(SAE);
    // The zero-masking form goes through the separate `vfixupimmpdz` binding.
    transmute(vfixupimmpdz(
        a.as_f64x8(),
        b.as_f64x8(),
        c.as_i64x8(),
        IMM8,
        k,
        SAE,
    ))
}

/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in `a`, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by `NORM` (interv) and the sign depends on `SIGN` (sc) and the source sign.\
/// The interval the mantissa is normalized to is chosen by `NORM`, which can take the following values:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
/// The sign is chosen by `SIGN`, which can take the following values:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1\
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_round_ps&expand=2886)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0, SAE = 4))]
#[rustc_legacy_const_generics(1, 2, 3)]
pub unsafe fn _mm512_getmant_round_ps<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    a: __m512,
) -> __m512 {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    static_assert_mantissas_sae!(SAE);
    // The immediate is SIGN shifted left by two, OR-ed with NORM; all mask bits are set.
    transmute(vgetmantps(
        a.as_f32x16(),
        (SIGN << 2) | NORM,
        _mm512_setzero_ps().as_f32x16(),
        0b11111111_11111111,
        SAE,
    ))
}

/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in `a`, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by `NORM` (interv) and the sign depends on `SIGN` (sc) and the source sign.\
/// The interval the mantissa is normalized to is chosen by `NORM`, which can take the following values:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
/// The sign is chosen by `SIGN`, which can take the following values:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1\
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_round_ps&expand=2887)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0, SAE = 4))]
#[rustc_legacy_const_generics(3, 4, 5)]
pub unsafe fn _mm512_mask_getmant_round_ps<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    src: __m512,
    k: __mmask16,
    a: __m512,
) -> __m512 {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    static_assert_mantissas_sae!(SAE);
    let (input, passthrough) = (a.as_f32x16(), src.as_f32x16());
    // The immediate is SIGN shifted left by two, OR-ed with NORM.
    let mantissas = vgetmantps(input, (SIGN << 2) | NORM, passthrough, k, SAE);
    transmute(mantissas)
}

/// Normalize the mantissas of packed single-precision (32-bit) floating-point elements in `a`, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by `NORM` (interv) and the sign depends on `SIGN` (sc) and the source sign.\
/// The interval the mantissa is normalized to is chosen by `NORM`, which can take the following values:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
/// The sign is chosen by `SIGN`, which can take the following values:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1\
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_round_ps&expand=2888)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetmantps, NORM = 0, SIGN = 0, SAE = 4))]
#[rustc_legacy_const_generics(2, 3, 4)]
pub unsafe fn _mm512_maskz_getmant_round_ps<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    k: __mmask16,
    a: __m512,
) -> __m512 {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    static_assert_mantissas_sae!(SAE);
    // Zero-masking: a zero vector is supplied as the pass-through operand.
    let zeroed = _mm512_setzero_ps().as_f32x16();
    let mantissas = vgetmantps(a.as_f32x16(), (SIGN << 2) | NORM, zeroed, k, SAE);
    transmute(mantissas)
}

/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in `a`, and store the results in dst. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by `NORM` (interv) and the sign depends on `SIGN` (sc) and the source sign.\
/// The interval the mantissa is normalized to is chosen by `NORM`, which can take the following values:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
/// The sign is chosen by `SIGN`, which can take the following values:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1\
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_getmant_round_pd&expand=2883)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0, SAE = 4))]
#[rustc_legacy_const_generics(1, 2, 3)]
pub unsafe fn _mm512_getmant_round_pd<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    a: __m512d,
) -> __m512d {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    static_assert_mantissas_sae!(SAE);
    // The immediate is SIGN shifted left by two, OR-ed with NORM; all mask bits are set.
    transmute(vgetmantpd(
        a.as_f64x8(),
        (SIGN << 2) | NORM,
        _mm512_setzero_pd().as_f64x8(),
        0b11111111,
        SAE,
    ))
}

/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in `a`, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by `NORM` (interv) and the sign depends on `SIGN` (sc) and the source sign.\
/// The interval the mantissa is normalized to is chosen by `NORM`, which can take the following values:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
/// The sign is chosen by `SIGN`, which can take the following values:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1\
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_getmant_round_pd&expand=2884)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0, SAE = 4))]
#[rustc_legacy_const_generics(3, 4, 5)]
pub unsafe fn _mm512_mask_getmant_round_pd<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    src: __m512d,
    k: __mmask8,
    a: __m512d,
) -> __m512d {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    static_assert_mantissas_sae!(SAE);
    let (input, passthrough) = (a.as_f64x8(), src.as_f64x8());
    // The immediate is SIGN shifted left by two, OR-ed with NORM.
    let mantissas = vgetmantpd(input, (SIGN << 2) | NORM, passthrough, k, SAE);
    transmute(mantissas)
}

/// Normalize the mantissas of packed double-precision (64-bit) floating-point elements in `a`, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set). This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range defined by `NORM` (interv) and the sign depends on `SIGN` (sc) and the source sign.\
/// The interval the mantissa is normalized to is chosen by `NORM`, which can take the following values:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
/// The sign is chosen by `SIGN`, which can take the following values:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1\
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_getmant_round_pd&expand=2885)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetmantpd, NORM = 0, SIGN = 0, SAE = 4))]
#[rustc_legacy_const_generics(2, 3, 4)]
pub unsafe fn _mm512_maskz_getmant_round_pd<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    k: __mmask8,
    a: __m512d,
) -> __m512d {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    static_assert_mantissas_sae!(SAE);
    // Zero-masking: a zero vector is supplied as the pass-through operand.
    let zeroed = _mm512_setzero_pd().as_f64x8();
    let mantissas = vgetmantpd(a.as_f64x8(), (SIGN << 2) | NORM, zeroed, k, SAE);
    transmute(mantissas)
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtps_epi32&expand=1737)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2dq))]
pub unsafe fn _mm512_cvtps_epi32(a: __m512) -> __m512i {
    let input = a.as_f32x16();
    // Every mask bit is set, so the zero vector only fills the pass-through operand.
    let filler = _mm512_setzero_si512().as_i32x16();
    let ints = vcvtps2dq(input, filler, 0b11111111_11111111, _MM_FROUND_CUR_DIRECTION);
    transmute(ints)
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtps_epi32&expand=1738)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2dq))]
pub unsafe fn _mm512_mask_cvtps_epi32(src: __m512i, k: __mmask16, a: __m512) -> __m512i {
    let input = a.as_f32x16();
    let passthrough = src.as_i32x16();
    // The conversion follows the current rounding direction.
    let ints = vcvtps2dq(input, passthrough, k, _MM_FROUND_CUR_DIRECTION);
    transmute(ints)
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtps_epi32&expand=1739)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2dq))]
pub unsafe fn _mm512_maskz_cvtps_epi32(k: __mmask16, a: __m512) -> __m512i {
    let input = a.as_f32x16();
    // Zero-masking: a zero vector is supplied as the pass-through operand.
    let zeroed = _mm512_setzero_si512().as_i32x16();
    let ints = vcvtps2dq(input, zeroed, k, _MM_FROUND_CUR_DIRECTION);
    transmute(ints)
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtps_epi32&expand=1735)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2dq))]
pub unsafe fn _mm256_mask_cvtps_epi32(src: __m256i, k: __mmask8, a: __m256) -> __m256i {
    // Convert every lane with the unmasked intrinsic, then pick per lane by `k`.
    let converted = _mm256_cvtps_epi32(a).as_i32x8();
    let fallback = src.as_i32x8();
    transmute(simd_select_bitmask(k, converted, fallback))
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtps_epi32&expand=1736)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2dq))]
pub unsafe fn _mm256_maskz_cvtps_epi32(k: __mmask8, a: __m256) -> __m256i {
    // Convert every lane with the unmasked intrinsic, then pick per lane by `k`,
    // falling back to zero.
    let converted = _mm256_cvtps_epi32(a).as_i32x8();
    let zeroed = _mm256_setzero_si256().as_i32x8();
    transmute(simd_select_bitmask(k, converted, zeroed))
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtps_epi32&expand=1732)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2dq))]
pub unsafe fn _mm_mask_cvtps_epi32(src: __m128i, k: __mmask8, a: __m128) -> __m128i {
    // Convert every lane with the unmasked intrinsic, then pick per lane by `k`.
    let converted = _mm_cvtps_epi32(a).as_i32x4();
    let fallback = src.as_i32x4();
    transmute(simd_select_bitmask(k, converted, fallback))
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed 32-bit integers, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtps_epi32&expand=1733)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2dq))]
pub unsafe fn _mm_maskz_cvtps_epi32(k: __mmask8, a: __m128) -> __m128i {
    // Convert every lane with the unmasked intrinsic, then pick per lane by `k`,
    // falling back to zero.
    let converted = _mm_cvtps_epi32(a).as_i32x4();
    let zeroed = _mm_setzero_si128().as_i32x4();
    transmute(simd_select_bitmask(k, converted, zeroed))
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed unsigned 32-bit integers, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtps_epu32&expand=1755)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2udq))]
pub unsafe fn _mm512_cvtps_epu32(a: __m512) -> __m512i {
    let input = a.as_f32x16();
    // Every mask bit is set, so the zero vector only fills the pass-through operand.
    let filler = _mm512_setzero_si512().as_u32x16();
    let ints = vcvtps2udq(input, filler, 0b11111111_11111111, _MM_FROUND_CUR_DIRECTION);
    transmute(ints)
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed unsigned 32-bit integers, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtps_epu32&expand=1756)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2udq))]
pub unsafe fn _mm512_mask_cvtps_epu32(src: __m512i, k: __mmask16, a: __m512) -> __m512i {
    let input = a.as_f32x16();
    let passthrough = src.as_u32x16();
    // The conversion follows the current rounding direction.
    let ints = vcvtps2udq(input, passthrough, k, _MM_FROUND_CUR_DIRECTION);
    transmute(ints)
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed unsigned 32-bit integers, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtps_epu32&expand=1757)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2udq))]
pub unsafe fn _mm512_maskz_cvtps_epu32(k: __mmask16, a: __m512) -> __m512i {
    let input = a.as_f32x16();
    // Zero-masking: a zero vector is supplied as the pass-through operand.
    let zeroed = _mm512_setzero_si512().as_u32x16();
    let ints = vcvtps2udq(input, zeroed, k, _MM_FROUND_CUR_DIRECTION);
    transmute(ints)
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed unsigned 32-bit integers, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtps_epu32&expand=1752)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2udq))]
pub unsafe fn _mm256_cvtps_epu32(a: __m256) -> __m256i {
    // Every mask bit is set, so the zero vector only fills the pass-through operand.
    let filler = _mm256_setzero_si256().as_u32x8();
    let ints = vcvtps2udq256(a.as_f32x8(), filler, 0b11111111);
    transmute(ints)
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed unsigned 32-bit integers, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtps_epu32&expand=1753)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2udq))]
pub unsafe fn _mm256_mask_cvtps_epu32(src: __m256i, k: __mmask8, a: __m256) -> __m256i {
    let input = a.as_f32x8();
    let passthrough = src.as_u32x8();
    let ints = vcvtps2udq256(input, passthrough, k);
    transmute(ints)
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed unsigned 32-bit integers, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtps_epu32&expand=1754)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2udq))]
pub unsafe fn _mm256_maskz_cvtps_epu32(k: __mmask8, a: __m256) -> __m256i {
    // Zero-masking: a zero vector is supplied as the pass-through operand.
    let zeroed = _mm256_setzero_si256().as_u32x8();
    let ints = vcvtps2udq256(a.as_f32x8(), zeroed, k);
    transmute(ints)
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed unsigned 32-bit integers, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_epu32&expand=1749)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2udq))]
pub unsafe fn _mm_cvtps_epu32(a: __m128) -> __m128i {
    // Every mask bit is set, so the zero vector only fills the pass-through operand.
    let filler = _mm_setzero_si128().as_u32x4();
    let ints = vcvtps2udq128(a.as_f32x4(), filler, 0b11111111);
    transmute(ints)
}

/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed unsigned 32-bit integers, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtps_epu32&expand=1750)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2udq))]
pub unsafe fn _mm_mask_cvtps_epu32(src: __m128i, k: __mmask8, a: __m128) -> __m128i {
    let input = a.as_f32x4();
    let passthrough = src.as_u32x4();
    let ints = vcvtps2udq128(input, passthrough, k);
    transmute(ints)
}

/// Converts the packed single-precision (32-bit) floating-point elements of `a` to packed
/// unsigned 32-bit integers under zeromask `k`: a lane whose mask bit is clear is set to
/// zero instead of being converted.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtps_epu32&expand=1751)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2udq))]
pub unsafe fn _mm_maskz_cvtps_epu32(k: __mmask8, a: __m128) -> __m128i {
    // An all-zero pass-through vector turns the write-masked conversion into a zero-masked one.
    let zero = _mm_setzero_si128().as_u32x4();
    let converted = vcvtps2udq128(a.as_f32x4(), zero, k);
    transmute(converted)
}

/// Widens the packed single-precision (32-bit) floating-point elements of `a` to packed
/// double-precision (64-bit) floating-point elements and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtps_pd&expand=1769)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2pd))]
pub unsafe fn _mm512_cvtps_pd(a: __m256) -> __m512d {
    // Unmasked form: every mask bit is set, paired with an all-zero pass-through vector.
    let all_lanes = 0b11111111;
    let zero = _mm512_setzero_pd().as_f64x8();
    let widened = vcvtps2pd(a.as_f32x8(), zero, all_lanes, _MM_FROUND_CUR_DIRECTION);
    transmute(widened)
}

/// Widens the packed single-precision (32-bit) floating-point elements of `a` to packed
/// double-precision (64-bit) floating-point elements under writemask `k`: a lane whose mask
/// bit is clear is copied from `src` instead of being converted.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtps_pd&expand=1770)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2pd))]
pub unsafe fn _mm512_mask_cvtps_pd(src: __m512d, k: __mmask8, a: __m256) -> __m512d {
    // `src` supplies the lanes that the mask leaves untouched.
    let passthrough = src.as_f64x8();
    let widened = vcvtps2pd(a.as_f32x8(), passthrough, k, _MM_FROUND_CUR_DIRECTION);
    transmute(widened)
}

/// Widens the packed single-precision (32-bit) floating-point elements of `a` to packed
/// double-precision (64-bit) floating-point elements under zeromask `k`: a lane whose mask
/// bit is clear is set to zero instead of being converted.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtps_pd&expand=1771)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2pd))]
pub unsafe fn _mm512_maskz_cvtps_pd(k: __mmask8, a: __m256) -> __m512d {
    // An all-zero pass-through vector turns the write-masked conversion into a zero-masked one.
    let zero = _mm512_setzero_pd().as_f64x8();
    let widened = vcvtps2pd(a.as_f32x8(), zero, k, _MM_FROUND_CUR_DIRECTION);
    transmute(widened)
}

/// Converts, element by element, the lower half of the packed single-precision (32-bit)
/// floating-point elements of `v2` to packed double-precision (64-bit) floating-point
/// elements and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpslo_pd&expand=1784)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2pd))]
pub unsafe fn _mm512_cvtpslo_pd(v2: __m512) -> __m512d {
    // Only the low 256 bits (eight floats) of `v2` take part in the conversion.
    let lower_half = _mm512_castps512_ps256(v2).as_f32x8();
    let all_lanes = 0b11111111;
    let zero = _mm512_setzero_pd().as_f64x8();
    let widened = vcvtps2pd(lower_half, zero, all_lanes, _MM_FROUND_CUR_DIRECTION);
    transmute(widened)
}

/// Converts, element by element, the lower half of the packed single-precision (32-bit)
/// floating-point elements of `v2` to packed double-precision (64-bit) floating-point
/// elements under writemask `k`: a lane whose mask bit is clear is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpslo_pd&expand=1785)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2pd))]
pub unsafe fn _mm512_mask_cvtpslo_pd(src: __m512d, k: __mmask8, v2: __m512) -> __m512d {
    // Only the low 256 bits (eight floats) of `v2` take part in the conversion.
    let lower_half = _mm512_castps512_ps256(v2).as_f32x8();
    let passthrough = src.as_f64x8();
    let widened = vcvtps2pd(lower_half, passthrough, k, _MM_FROUND_CUR_DIRECTION);
    transmute(widened)
}

/// Narrows the packed double-precision (64-bit) floating-point elements of `a` to packed
/// single-precision (32-bit) floating-point elements and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_ps&expand=1712)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtpd2ps))]
pub unsafe fn _mm512_cvtpd_ps(a: __m512d) -> __m256 {
    // Unmasked form: every mask bit is set, paired with an all-zero pass-through vector.
    let all_lanes = 0b11111111;
    let zero = _mm256_setzero_ps().as_f32x8();
    let narrowed = vcvtpd2ps(a.as_f64x8(), zero, all_lanes, _MM_FROUND_CUR_DIRECTION);
    transmute(narrowed)
}

/// Narrows the packed double-precision (64-bit) floating-point elements of `a` to packed
/// single-precision (32-bit) floating-point elements under writemask `k`: a lane whose mask
/// bit is clear is copied from `src` instead of being converted.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_ps&expand=1713)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtpd2ps))]
pub unsafe fn _mm512_mask_cvtpd_ps(src: __m256, k: __mmask8, a: __m512d) -> __m256 {
    // `src` supplies the lanes that the mask leaves untouched.
    let passthrough = src.as_f32x8();
    let narrowed = vcvtpd2ps(a.as_f64x8(), passthrough, k, _MM_FROUND_CUR_DIRECTION);
    transmute(narrowed)
}

/// Narrows the packed double-precision (64-bit) floating-point elements of `a` to packed
/// single-precision (32-bit) floating-point elements under zeromask `k`: a lane whose mask
/// bit is clear is set to zero instead of being converted.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpd_ps&expand=1714)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtpd2ps))]
pub unsafe fn _mm512_maskz_cvtpd_ps(k: __mmask8, a: __m512d) -> __m256 {
    // An all-zero pass-through vector turns the write-masked conversion into a zero-masked one.
    let zero = _mm256_setzero_ps().as_f32x8();
    let narrowed = vcvtpd2ps(a.as_f64x8(), zero, k, _MM_FROUND_CUR_DIRECTION);
    transmute(narrowed)
}

/// Narrows the packed double-precision (64-bit) floating-point elements of `a` to packed
/// single-precision (32-bit) floating-point elements under writemask `k`: a lane whose mask
/// bit is clear is copied from `src` instead of being converted.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpd_ps&expand=1710)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2ps))]
pub unsafe fn _mm256_mask_cvtpd_ps(src: __m128, k: __mmask8, a: __m256d) -> __m128 {
    // Convert every lane first, then pick per lane between the result and `src`.
    let narrowed = _mm256_cvtpd_ps(a).as_f32x4();
    let fallback = src.as_f32x4();
    let blended = simd_select_bitmask(k, narrowed, fallback);
    transmute(blended)
}

/// Narrows the packed double-precision (64-bit) floating-point elements of `a` to packed
/// single-precision (32-bit) floating-point elements under zeromask `k`: a lane whose mask
/// bit is clear is set to zero instead of being converted.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpd_ps&expand=1711)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2ps))]
pub unsafe fn _mm256_maskz_cvtpd_ps(k: __mmask8, a: __m256d) -> __m128 {
    // Convert every lane first, then pick per lane between the result and zero.
    let narrowed = _mm256_cvtpd_ps(a).as_f32x4();
    let blended = simd_select_bitmask(k, narrowed, _mm_setzero_ps().as_f32x4());
    transmute(blended)
}

/// Convert packed double-precision (64-bit) floating-point elements in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// Only the two low lanes are governed by `k`; the upper two lanes of the result are always
/// zero, regardless of `k` and `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpd_ps&expand=1707)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2ps))]
pub unsafe fn _mm_mask_cvtpd_ps(src: __m128, k: __mmask8, a: __m128d) -> __m128 {
    // `_mm_cvtpd_ps` produces two converted lanes followed by two zero lanes.
    let convert = _mm_cvtpd_ps(a).as_f32x4();
    // Intel specifies `dst[MAX:64] := 0`. Force mask bits 2 and 3 on so the upper lanes are
    // always taken from `convert` (zero) and never leak the corresponding lanes of `src`.
    let lanes = k | 0b1100;
    transmute(simd_select_bitmask(lanes, convert, src.as_f32x4()))
}

/// Narrows the packed double-precision (64-bit) floating-point elements of `a` to packed
/// single-precision (32-bit) floating-point elements under zeromask `k`: a lane whose mask
/// bit is clear is set to zero instead of being converted.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpd_ps&expand=1708)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2ps))]
pub unsafe fn _mm_maskz_cvtpd_ps(k: __mmask8, a: __m128d) -> __m128 {
    // Convert first, then pick per lane between the result and zero.
    let narrowed = _mm_cvtpd_ps(a).as_f32x4();
    let blended = simd_select_bitmask(k, narrowed, _mm_setzero_ps().as_f32x4());
    transmute(blended)
}

/// Converts the packed double-precision (64-bit) floating-point elements of `a` to packed
/// 32-bit integers and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_epi32&expand=1675)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtpd2dq))]
pub unsafe fn _mm512_cvtpd_epi32(a: __m512d) -> __m256i {
    // Unmasked form: every mask bit is set, paired with an all-zero pass-through vector.
    let all_lanes = 0b11111111;
    let zero = _mm256_setzero_si256().as_i32x8();
    let converted = vcvtpd2dq(a.as_f64x8(), zero, all_lanes, _MM_FROUND_CUR_DIRECTION);
    transmute(converted)
}

/// Converts the packed double-precision (64-bit) floating-point elements of `a` to packed
/// 32-bit integers under writemask `k`: a lane whose mask bit is clear is copied from `src`
/// instead of being converted.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_epi32&expand=1676)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtpd2dq))]
pub unsafe fn _mm512_mask_cvtpd_epi32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i {
    // `src` supplies the lanes that the mask leaves untouched.
    let passthrough = src.as_i32x8();
    let converted = vcvtpd2dq(a.as_f64x8(), passthrough, k, _MM_FROUND_CUR_DIRECTION);
    transmute(converted)
}

/// Converts the packed double-precision (64-bit) floating-point elements of `a` to packed
/// 32-bit integers under zeromask `k`: a lane whose mask bit is clear is set to zero instead
/// of being converted.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpd_epi32&expand=1677)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtpd2dq))]
pub unsafe fn _mm512_maskz_cvtpd_epi32(k: __mmask8, a: __m512d) -> __m256i {
    // An all-zero pass-through vector turns the write-masked conversion into a zero-masked one.
    let zero = _mm256_setzero_si256().as_i32x8();
    let converted = vcvtpd2dq(a.as_f64x8(), zero, k, _MM_FROUND_CUR_DIRECTION);
    transmute(converted)
}

/// Converts the packed double-precision (64-bit) floating-point elements of `a` to packed
/// 32-bit integers under writemask `k`: a lane whose mask bit is clear is copied from `src`
/// instead of being converted.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpd_epi32&expand=1673)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2dq))]
pub unsafe fn _mm256_mask_cvtpd_epi32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i {
    // Convert every lane first, then pick per lane between the result and `src`.
    let converted = _mm256_cvtpd_epi32(a).as_i32x4();
    let fallback = src.as_i32x4();
    let blended = simd_select_bitmask(k, converted, fallback);
    transmute(blended)
}

/// Converts the packed double-precision (64-bit) floating-point elements of `a` to packed
/// 32-bit integers under zeromask `k`: a lane whose mask bit is clear is set to zero instead
/// of being converted.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpd_epi32&expand=1674)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2dq))]
pub unsafe fn _mm256_maskz_cvtpd_epi32(k: __mmask8, a: __m256d) -> __m128i {
    // Convert every lane first, then pick per lane between the result and zero.
    let converted = _mm256_cvtpd_epi32(a).as_i32x4();
    let zero = _mm_setzero_si128().as_i32x4();
    let blended = simd_select_bitmask(k, converted, zero);
    transmute(blended)
}

/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// Only the two low lanes are governed by `k`; the upper two lanes of the result are always
/// zero, regardless of `k` and `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpd_epi32&expand=1670)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2dq))]
pub unsafe fn _mm_mask_cvtpd_epi32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i {
    // `_mm_cvtpd_epi32` produces two converted lanes followed by two zero lanes.
    let convert = _mm_cvtpd_epi32(a).as_i32x4();
    // Intel specifies `dst[MAX:64] := 0`. Force mask bits 2 and 3 on so the upper lanes are
    // always taken from `convert` (zero) and never leak the corresponding lanes of `src`.
    let lanes = k | 0b1100;
    transmute(simd_select_bitmask(lanes, convert, src.as_i32x4()))
}

/// Converts the packed double-precision (64-bit) floating-point elements of `a` to packed
/// 32-bit integers under zeromask `k`: a lane whose mask bit is clear is set to zero instead
/// of being converted.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpd_epi32&expand=1671)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2dq))]
pub unsafe fn _mm_maskz_cvtpd_epi32(k: __mmask8, a: __m128d) -> __m128i {
    // Convert first, then pick per lane between the result and zero.
    let converted = _mm_cvtpd_epi32(a).as_i32x4();
    let zero = _mm_setzero_si128().as_i32x4();
    let blended = simd_select_bitmask(k, converted, zero);
    transmute(blended)
}

/// Converts the packed double-precision (64-bit) floating-point elements of `a` to packed
/// unsigned 32-bit integers and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_epu32&expand=1693)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtpd2udq))]
pub unsafe fn _mm512_cvtpd_epu32(a: __m512d) -> __m256i {
    // Unmasked form: every mask bit is set, paired with an all-zero pass-through vector.
    let all_lanes = 0b11111111;
    let zero = _mm256_setzero_si256().as_u32x8();
    let converted = vcvtpd2udq(a.as_f64x8(), zero, all_lanes, _MM_FROUND_CUR_DIRECTION);
    transmute(converted)
}

/// Converts the packed double-precision (64-bit) floating-point elements of `a` to packed
/// unsigned 32-bit integers under writemask `k`: a lane whose mask bit is clear is copied
/// from `src` instead of being converted.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_epu32&expand=1694)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtpd2udq))]
pub unsafe fn _mm512_mask_cvtpd_epu32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i {
    // `src` supplies the lanes that the mask leaves untouched.
    let passthrough = src.as_u32x8();
    let converted = vcvtpd2udq(a.as_f64x8(), passthrough, k, _MM_FROUND_CUR_DIRECTION);
    transmute(converted)
}

/// Converts the packed double-precision (64-bit) floating-point elements of `a` to packed
/// unsigned 32-bit integers under zeromask `k`: a lane whose mask bit is clear is set to
/// zero instead of being converted.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtpd_epu32&expand=1695)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtpd2udq))]
pub unsafe fn _mm512_maskz_cvtpd_epu32(k: __mmask8, a: __m512d) -> __m256i {
    // An all-zero pass-through vector turns the write-masked conversion into a zero-masked one.
    let zero = _mm256_setzero_si256().as_u32x8();
    let converted = vcvtpd2udq(a.as_f64x8(), zero, k, _MM_FROUND_CUR_DIRECTION);
    transmute(converted)
}

/// Converts the packed double-precision (64-bit) floating-point elements of `a` to packed
/// unsigned 32-bit integers and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtpd_epu32&expand=1690)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2udq))]
pub unsafe fn _mm256_cvtpd_epu32(a: __m256d) -> __m128i {
    // Unmasked form: every mask bit is set, paired with an all-zero pass-through vector.
    let all_lanes = 0b11111111;
    let zero = _mm_setzero_si128().as_u32x4();
    let converted = vcvtpd2udq256(a.as_f64x4(), zero, all_lanes);
    transmute(converted)
}

/// Converts the packed double-precision (64-bit) floating-point elements of `a` to packed
/// unsigned 32-bit integers under writemask `k`: a lane whose mask bit is clear is copied
/// from `src` instead of being converted.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtpd_epu32&expand=1691)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2udq))]
pub unsafe fn _mm256_mask_cvtpd_epu32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i {
    // `src` supplies the lanes that the mask leaves untouched.
    let passthrough = src.as_u32x4();
    let converted = vcvtpd2udq256(a.as_f64x4(), passthrough, k);
    transmute(converted)
}

/// Converts the packed double-precision (64-bit) floating-point elements of `a` to packed
/// unsigned 32-bit integers under zeromask `k`: a lane whose mask bit is clear is set to
/// zero instead of being converted.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtpd_epu32&expand=1692)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2udq))]
pub unsafe fn _mm256_maskz_cvtpd_epu32(k: __mmask8, a: __m256d) -> __m128i {
    // An all-zero pass-through vector turns the write-masked conversion into a zero-masked one.
    let zero = _mm_setzero_si128().as_u32x4();
    let converted = vcvtpd2udq256(a.as_f64x4(), zero, k);
    transmute(converted)
}

/// Converts the packed double-precision (64-bit) floating-point elements of `a` to packed
/// unsigned 32-bit integers and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_epu32&expand=1687)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2udq))]
pub unsafe fn _mm_cvtpd_epu32(a: __m128d) -> __m128i {
    // Unmasked form: every mask bit is set, paired with an all-zero pass-through vector.
    let all_lanes = 0b11111111;
    let zero = _mm_setzero_si128().as_u32x4();
    let converted = vcvtpd2udq128(a.as_f64x2(), zero, all_lanes);
    transmute(converted)
}

/// Converts the packed double-precision (64-bit) floating-point elements of `a` to packed
/// unsigned 32-bit integers under writemask `k`: a lane whose mask bit is clear is copied
/// from `src` instead of being converted.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtpd_epu32&expand=1688)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2udq))]
pub unsafe fn _mm_mask_cvtpd_epu32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i {
    // `src` is handed to the masked conversion as its pass-through vector.
    let passthrough = src.as_u32x4();
    let converted = vcvtpd2udq128(a.as_f64x2(), passthrough, k);
    transmute(converted)
}

/// Converts the packed double-precision (64-bit) floating-point elements of `a` to packed
/// unsigned 32-bit integers under zeromask `k`: a lane whose mask bit is clear is set to
/// zero instead of being converted.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtpd_epu32&expand=1689)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtpd2udq))]
pub unsafe fn _mm_maskz_cvtpd_epu32(k: __mmask8, a: __m128d) -> __m128i {
    // An all-zero pass-through vector turns the write-masked conversion into a zero-masked one.
    let zero = _mm_setzero_si128().as_u32x4();
    let converted = vcvtpd2udq128(a.as_f64x2(), zero, k);
    transmute(converted)
}

/// Converts, element by element, the packed double-precision (64-bit) floating-point
/// elements of `v2` to single-precision (32-bit) floating-point elements. The converted
/// values occupy the lower half of the returned vector; the upper half is set to 0.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtpd_pslo&expand=1715)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtpd2ps))]
pub unsafe fn _mm512_cvtpd_pslo(v2: __m512d) -> __m512 {
    let zero = _mm256_setzero_ps().as_f32x8();
    let all_lanes = 0b11111111;
    let low: f32x8 = vcvtpd2ps(v2.as_f64x8(), zero, all_lanes, _MM_FROUND_CUR_DIRECTION);
    // Indices 0..=7 select the converted lanes; index 8 is lane 0 of `zero`, which fills
    // the upper eight lanes with zero.
    simd_shuffle16!(
        low,
        zero,
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
    )
}

/// Performs an element-by-element conversion of packed double-precision (64-bit) floating-point elements in v2 to single-precision (32-bit) floating-point elements and stores them in dst using writemask k (elements are copied from src when the corresponding mask bit is not set). The elements are stored in the lower half of the results vector, while the remaining upper half locations are set to 0.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtpd_pslo&expand=1716)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtpd2ps))]
pub unsafe fn _mm512_mask_cvtpd_pslo(src: __m512, k: __mmask8, v2: __m512d) -> __m512 {
    let r: f32x8 = vcvtpd2ps(
        v2.as_f64x8(),
        _mm512_castps512_ps256(src).as_f32x8(),
        k,
        _MM_FROUND_CUR_DIRECTION,
    );
    simd_shuffle16!(
        r,
        _mm256_setzero_ps().as_f32x8(),
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
    )
}

/// Sign-extends each packed 8-bit integer of `a` to a 32-bit integer and returns the
/// packed results.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi8_epi32&expand=1535)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsxbd))]
pub unsafe fn _mm512_cvtepi8_epi32(a: __m128i) -> __m512i {
    // Casting from signed bytes makes the widening a sign extension.
    let bytes = a.as_i8x16();
    let widened: i32x16 = simd_cast(bytes);
    transmute(widened)
}

/// Sign-extends each packed 8-bit integer of `a` to a 32-bit integer under writemask `k`:
/// a lane whose mask bit is clear is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi8_epi32&expand=1536)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsxbd))]
pub unsafe fn _mm512_mask_cvtepi8_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i {
    // Widen every lane first, then pick per lane between the result and `src`.
    let widened = _mm512_cvtepi8_epi32(a).as_i32x16();
    let fallback = src.as_i32x16();
    let blended = simd_select_bitmask(k, widened, fallback);
    transmute(blended)
}

/// Sign-extends each packed 8-bit integer of `a` to a 32-bit integer under zeromask `k`:
/// a lane whose mask bit is clear is set to zero instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi8_epi32&expand=1537)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsxbd))]
pub unsafe fn _mm512_maskz_cvtepi8_epi32(k: __mmask16, a: __m128i) -> __m512i {
    // Widen every lane first, then pick per lane between the result and zero.
    let widened = _mm512_cvtepi8_epi32(a).as_i32x16();
    let blended = simd_select_bitmask(k, widened, _mm512_setzero_si512().as_i32x16());
    transmute(blended)
}

/// Sign-extends packed 8-bit integers of `a` to packed 32-bit integers under writemask `k`:
/// a lane whose mask bit is clear is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi8_epi32&expand=1533)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsxbd))]
pub unsafe fn _mm256_mask_cvtepi8_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
    // Widen first, then pick per lane between the result and `src`.
    let widened = _mm256_cvtepi8_epi32(a).as_i32x8();
    let fallback = src.as_i32x8();
    let blended = simd_select_bitmask(k, widened, fallback);
    transmute(blended)
}

/// Sign-extends packed 8-bit integers of `a` to packed 32-bit integers under zeromask `k`:
/// a lane whose mask bit is clear is set to zero instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi8_epi32&expand=1534)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsxbd))]
pub unsafe fn _mm256_maskz_cvtepi8_epi32(k: __mmask8, a: __m128i) -> __m256i {
    // Widen first, then pick per lane between the result and zero.
    let widened = _mm256_cvtepi8_epi32(a).as_i32x8();
    let blended = simd_select_bitmask(k, widened, _mm256_setzero_si256().as_i32x8());
    transmute(blended)
}

/// Sign-extends packed 8-bit integers of `a` to packed 32-bit integers under writemask `k`:
/// a lane whose mask bit is clear is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi8_epi32&expand=1530)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsxbd))]
pub unsafe fn _mm_mask_cvtepi8_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    // Widen first, then pick per lane between the result and `src`.
    let widened = _mm_cvtepi8_epi32(a).as_i32x4();
    let fallback = src.as_i32x4();
    let blended = simd_select_bitmask(k, widened, fallback);
    transmute(blended)
}

/// Sign-extends packed 8-bit integers of `a` to packed 32-bit integers under zeromask `k`:
/// a lane whose mask bit is clear is set to zero instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi8_epi32&expand=1531)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsxbd))]
pub unsafe fn _mm_maskz_cvtepi8_epi32(k: __mmask8, a: __m128i) -> __m128i {
    // Widen first, then pick per lane between the result and zero.
    let widened = _mm_cvtepi8_epi32(a).as_i32x4();
    let blended = simd_select_bitmask(k, widened, _mm_setzero_si128().as_i32x4());
    transmute(blended)
}

/// Sign-extends the packed 8-bit integers held in the low 8 bytes of `a` to packed 64-bit
/// integers and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi8_epi64&expand=1544)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsxbq))]
pub unsafe fn _mm512_cvtepi8_epi64(a: __m128i) -> __m512i {
    let bytes = a.as_i8x16();
    // Keep only lanes 0..=7; the upper eight bytes of `a` do not contribute.
    let low8: i8x8 = simd_shuffle8!(bytes, bytes, [0, 1, 2, 3, 4, 5, 6, 7]);
    let widened: i64x8 = simd_cast(low8);
    transmute(widened)
}

/// Sign-extends the packed 8-bit integers held in the low 8 bytes of `a` to packed 64-bit
/// integers under writemask `k`: a lane whose mask bit is clear is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi8_epi64&expand=1545)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsxbq))]
pub unsafe fn _mm512_mask_cvtepi8_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
    // Widen first, then pick per lane between the result and `src`.
    let widened = _mm512_cvtepi8_epi64(a).as_i64x8();
    let fallback = src.as_i64x8();
    let blended = simd_select_bitmask(k, widened, fallback);
    transmute(blended)
}

/// Sign-extends the packed 8-bit integers held in the low 8 bytes of `a` to packed 64-bit
/// integers under zeromask `k`: a lane whose mask bit is clear is set to zero instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi8_epi64&expand=1546)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsxbq))]
pub unsafe fn _mm512_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m512i {
    // Widen first, then pick per lane between the result and zero.
    let widened = _mm512_cvtepi8_epi64(a).as_i64x8();
    let blended = simd_select_bitmask(k, widened, _mm512_setzero_si512().as_i64x8());
    transmute(blended)
}

/// Sign-extends the packed 8-bit integers held in the low 4 bytes of `a` to packed 64-bit
/// integers under writemask `k`: a lane whose mask bit is clear is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi8_epi64&expand=1542)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsxbq))]
pub unsafe fn _mm256_mask_cvtepi8_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
    // Widen first, then pick per lane between the result and `src`.
    let widened = _mm256_cvtepi8_epi64(a).as_i64x4();
    let fallback = src.as_i64x4();
    let blended = simd_select_bitmask(k, widened, fallback);
    transmute(blended)
}

/// Sign-extends the packed 8-bit integers held in the low 4 bytes of `a` to packed 64-bit
/// integers under zeromask `k`: a lane whose mask bit is clear is set to zero instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi8_epi64&expand=1543)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsxbq))]
pub unsafe fn _mm256_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m256i {
    // Widen first, then pick per lane between the result and zero.
    let widened = _mm256_cvtepi8_epi64(a).as_i64x4();
    let blended = simd_select_bitmask(k, widened, _mm256_setzero_si256().as_i64x4());
    transmute(blended)
}

/// Sign-extends the packed 8-bit integers held in the low 2 bytes of `a` to packed 64-bit
/// integers under writemask `k`: a lane whose mask bit is clear is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi8_epi64&expand=1539)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsxbq))]
pub unsafe fn _mm_mask_cvtepi8_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    // Widen first, then pick per lane between the result and `src`.
    let widened = _mm_cvtepi8_epi64(a).as_i64x2();
    let fallback = src.as_i64x2();
    let blended = simd_select_bitmask(k, widened, fallback);
    transmute(blended)
}

/// Sign-extends the packed 8-bit integers held in the low 2 bytes of `a` to packed 64-bit
/// integers under zeromask `k`: a lane whose mask bit is clear is set to zero instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi8_epi64&expand=1540)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsxbq))]
pub unsafe fn _mm_maskz_cvtepi8_epi64(k: __mmask8, a: __m128i) -> __m128i {
    // Widen first, then pick per lane between the result and zero.
    let widened = _mm_cvtepi8_epi64(a).as_i64x2();
    let blended = simd_select_bitmask(k, widened, _mm_setzero_si128().as_i64x2());
    transmute(blended)
}

/// Zero-extends each packed unsigned 8-bit integer of `a` to a 32-bit integer and returns
/// the packed results.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu8_epi32&expand=1621)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovzxbd))]
pub unsafe fn _mm512_cvtepu8_epi32(a: __m128i) -> __m512i {
    // Casting from unsigned bytes makes the widening a zero extension.
    let bytes = a.as_u8x16();
    let widened: i32x16 = simd_cast(bytes);
    transmute(widened)
}

/// Zero-extends each packed unsigned 8-bit integer of `a` to a 32-bit integer under
/// writemask `k`: a lane whose mask bit is clear is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu8_epi32&expand=1622)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovzxbd))]
pub unsafe fn _mm512_mask_cvtepu8_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i {
    // Widen every lane first, then pick per lane between the result and `src`.
    let widened = _mm512_cvtepu8_epi32(a).as_i32x16();
    let fallback = src.as_i32x16();
    let blended = simd_select_bitmask(k, widened, fallback);
    transmute(blended)
}

/// Zero-extends each packed unsigned 8-bit integer of `a` to a 32-bit integer under
/// zeromask `k`: a lane whose mask bit is clear is set to zero instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu8_epi32&expand=1623)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovzxbd))]
pub unsafe fn _mm512_maskz_cvtepu8_epi32(k: __mmask16, a: __m128i) -> __m512i {
    // Widen every lane first, then pick per lane between the result and zero.
    let widened = _mm512_cvtepu8_epi32(a).as_i32x16();
    let blended = simd_select_bitmask(k, widened, _mm512_setzero_si512().as_i32x16());
    transmute(blended)
}

/// Zero-extends the packed unsigned 8-bit integers held in the low 8 bytes of `a` to packed
/// 32-bit integers under writemask `k`: a lane whose mask bit is clear is copied from `src`
/// instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu8_epi32&expand=1619)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovzxbd))]
pub unsafe fn _mm256_mask_cvtepu8_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
    // Widen first, then pick per lane between the result and `src`.
    let widened = _mm256_cvtepu8_epi32(a).as_i32x8();
    let fallback = src.as_i32x8();
    let blended = simd_select_bitmask(k, widened, fallback);
    transmute(blended)
}

/// Zero-extends the packed unsigned 8-bit integers in the low 8 bytes of `a` to packed 32-bit
/// integers, storing the results in `dst` under zeromask `k`; elements whose mask bit is clear
/// are zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu8_epi32&expand=1620)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovzxbd))]
pub unsafe fn _mm256_maskz_cvtepu8_epi32(k: __mmask8, a: __m128i) -> __m256i {
    let extended = _mm256_cvtepu8_epi32(a).as_i32x8();
    transmute(simd_select_bitmask(k, extended, _mm256_setzero_si256().as_i32x8()))
}

/// Zero-extends the packed unsigned 8-bit integers in the low 4 bytes of `a` to packed 32-bit
/// integers, storing the results in `dst` under writemask `k`; elements whose mask bit is clear
/// are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu8_epi32&expand=1616)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovzxbd))]
pub unsafe fn _mm_mask_cvtepu8_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let extended = _mm_cvtepu8_epi32(a).as_i32x4();
    let fallback = src.as_i32x4();
    transmute(simd_select_bitmask(k, extended, fallback))
}

/// Zero-extends the packed unsigned 8-bit integers in the low 4 bytes of `a` to packed 32-bit
/// integers, storing the results in `dst` under zeromask `k`; elements whose mask bit is clear
/// are zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu8_epi32&expand=1617)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovzxbd))]
pub unsafe fn _mm_maskz_cvtepu8_epi32(k: __mmask8, a: __m128i) -> __m128i {
    let extended = _mm_cvtepu8_epi32(a).as_i32x4();
    transmute(simd_select_bitmask(k, extended, _mm_setzero_si128().as_i32x4()))
}

/// Zero-extends the packed unsigned 8-bit integers in the low 8 bytes of `a` to packed 64-bit
/// integers, storing the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu8_epi64&expand=1630)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovzxbq))]
pub unsafe fn _mm512_cvtepu8_epi64(a: __m128i) -> __m512i {
    let bytes = a.as_u8x16();
    // Only the lowest eight byte lanes take part in the widening.
    let low_half: u8x8 = simd_shuffle8!(bytes, bytes, [0, 1, 2, 3, 4, 5, 6, 7]);
    let widened: i64x8 = simd_cast(low_half);
    transmute(widened)
}

/// Zero-extends the packed unsigned 8-bit integers in the low 8 bytes of `a` to packed 64-bit
/// integers, storing the results in `dst` under writemask `k`; elements whose mask bit is clear
/// are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu8_epi64&expand=1631)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovzxbq))]
pub unsafe fn _mm512_mask_cvtepu8_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
    let extended = _mm512_cvtepu8_epi64(a).as_i64x8();
    let fallback = src.as_i64x8();
    transmute(simd_select_bitmask(k, extended, fallback))
}

/// Zero-extends the packed unsigned 8-bit integers in the low 8 bytes of `a` to packed 64-bit
/// integers, storing the results in `dst` under zeromask `k`; elements whose mask bit is clear
/// are zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu8_epi64&expand=1632)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovzxbq))]
pub unsafe fn _mm512_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m512i {
    let extended = _mm512_cvtepu8_epi64(a).as_i64x8();
    transmute(simd_select_bitmask(k, extended, _mm512_setzero_si512().as_i64x8()))
}

/// Zero-extends the packed unsigned 8-bit integers in the low 4 bytes of `a` to packed 64-bit
/// integers, storing the results in `dst` under writemask `k`; elements whose mask bit is clear
/// are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu8_epi64&expand=1628)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovzxbq))]
pub unsafe fn _mm256_mask_cvtepu8_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
    let extended = _mm256_cvtepu8_epi64(a).as_i64x4();
    let fallback = src.as_i64x4();
    transmute(simd_select_bitmask(k, extended, fallback))
}

/// Zero-extends the packed unsigned 8-bit integers in the low 4 bytes of `a` to packed 64-bit
/// integers, storing the results in `dst` under zeromask `k`; elements whose mask bit is clear
/// are zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu8_epi64&expand=1629)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovzxbq))]
pub unsafe fn _mm256_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m256i {
    let extended = _mm256_cvtepu8_epi64(a).as_i64x4();
    transmute(simd_select_bitmask(k, extended, _mm256_setzero_si256().as_i64x4()))
}

/// Zero-extends the packed unsigned 8-bit integers in the low 2 bytes of `a` to packed 64-bit
/// integers, storing the results in `dst` under writemask `k`; elements whose mask bit is clear
/// are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu8_epi64&expand=1625)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovzxbq))]
pub unsafe fn _mm_mask_cvtepu8_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let extended = _mm_cvtepu8_epi64(a).as_i64x2();
    let fallback = src.as_i64x2();
    transmute(simd_select_bitmask(k, extended, fallback))
}

/// Zero-extends the packed unsigned 8-bit integers in the low 2 bytes of `a` to packed 64-bit
/// integers, storing the results in `dst` under zeromask `k`; elements whose mask bit is clear
/// are zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu8_epi64&expand=1626)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovzxbq))]
pub unsafe fn _mm_maskz_cvtepu8_epi64(k: __mmask8, a: __m128i) -> __m128i {
    let extended = _mm_cvtepu8_epi64(a).as_i64x2();
    transmute(simd_select_bitmask(k, extended, _mm_setzero_si128().as_i64x2()))
}

/// Sign-extends the packed 16-bit integers in `a` to packed 32-bit integers, storing the results
/// in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi16_epi32&expand=1389)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsxwd))]
pub unsafe fn _mm512_cvtepi16_epi32(a: __m256i) -> __m512i {
    // The signed source lanes make the lane-wise cast a sign extension.
    let widened: i32x16 = simd_cast(a.as_i16x16());
    transmute(widened)
}

/// Sign-extends the packed 16-bit integers in `a` to packed 32-bit integers, storing the results
/// in `dst` under writemask `k`; elements whose mask bit is clear are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi16_epi32&expand=1390)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsxwd))]
pub unsafe fn _mm512_mask_cvtepi16_epi32(src: __m512i, k: __mmask16, a: __m256i) -> __m512i {
    let extended = _mm512_cvtepi16_epi32(a).as_i32x16();
    let fallback = src.as_i32x16();
    transmute(simd_select_bitmask(k, extended, fallback))
}

/// Sign-extends the packed 16-bit integers in `a` to packed 32-bit integers, storing the results
/// in `dst` under zeromask `k`; elements whose mask bit is clear are zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi16_epi32&expand=1391)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsxwd))]
pub unsafe fn _mm512_maskz_cvtepi16_epi32(k: __mmask16, a: __m256i) -> __m512i {
    let extended = _mm512_cvtepi16_epi32(a).as_i32x16();
    transmute(simd_select_bitmask(k, extended, _mm512_setzero_si512().as_i32x16()))
}

/// Sign-extends the packed 16-bit integers in `a` to packed 32-bit integers, storing the results
/// in `dst` under writemask `k`; elements whose mask bit is clear are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi16_epi32&expand=1387)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsxwd))]
pub unsafe fn _mm256_mask_cvtepi16_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
    let extended = _mm256_cvtepi16_epi32(a).as_i32x8();
    let fallback = src.as_i32x8();
    transmute(simd_select_bitmask(k, extended, fallback))
}

/// Sign-extends the packed 16-bit integers in `a` to packed 32-bit integers, storing the results
/// in `dst` under zeromask `k`; elements whose mask bit is clear are zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi16_epi32&expand=1388)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsxwd))]
pub unsafe fn _mm256_maskz_cvtepi16_epi32(k: __mmask8, a: __m128i) -> __m256i {
    let extended = _mm256_cvtepi16_epi32(a).as_i32x8();
    transmute(simd_select_bitmask(k, extended, _mm256_setzero_si256().as_i32x8()))
}

/// Sign-extends the packed 16-bit integers in `a` to packed 32-bit integers, storing the results
/// in `dst` under writemask `k`; elements whose mask bit is clear are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi16_epi32&expand=1384)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsxwd))]
pub unsafe fn _mm_mask_cvtepi16_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let extended = _mm_cvtepi16_epi32(a).as_i32x4();
    let fallback = src.as_i32x4();
    transmute(simd_select_bitmask(k, extended, fallback))
}

/// Sign-extends the packed 16-bit integers in `a` to packed 32-bit integers, storing the results
/// in `dst` under zeromask `k`; elements whose mask bit is clear are zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi16_epi32&expand=1385)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsxwd))]
pub unsafe fn _mm_maskz_cvtepi16_epi32(k: __mmask8, a: __m128i) -> __m128i {
    let extended = _mm_cvtepi16_epi32(a).as_i32x4();
    transmute(simd_select_bitmask(k, extended, _mm_setzero_si128().as_i32x4()))
}

/// Sign-extends the packed 16-bit integers in `a` to packed 64-bit integers, storing the results
/// in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi16_epi64&expand=1398)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsxwq))]
pub unsafe fn _mm512_cvtepi16_epi64(a: __m128i) -> __m512i {
    // The signed source lanes make the lane-wise cast a sign extension.
    let widened: i64x8 = simd_cast(a.as_i16x8());
    transmute(widened)
}

/// Sign-extends the packed 16-bit integers in `a` to packed 64-bit integers, storing the results
/// in `dst` under writemask `k`; elements whose mask bit is clear are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi16_epi64&expand=1399)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsxwq))]
pub unsafe fn _mm512_mask_cvtepi16_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
    let extended = _mm512_cvtepi16_epi64(a).as_i64x8();
    let fallback = src.as_i64x8();
    transmute(simd_select_bitmask(k, extended, fallback))
}

/// Sign-extends the packed 16-bit integers in `a` to packed 64-bit integers, storing the results
/// in `dst` under zeromask `k`; elements whose mask bit is clear are zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi16_epi64&expand=1400)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsxwq))]
pub unsafe fn _mm512_maskz_cvtepi16_epi64(k: __mmask8, a: __m128i) -> __m512i {
    let extended = _mm512_cvtepi16_epi64(a).as_i64x8();
    transmute(simd_select_bitmask(k, extended, _mm512_setzero_si512().as_i64x8()))
}

/// Sign-extends the packed 16-bit integers in `a` to packed 64-bit integers, storing the results
/// in `dst` under writemask `k`; elements whose mask bit is clear are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi16_epi64&expand=1396)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsxwq))]
pub unsafe fn _mm256_mask_cvtepi16_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
    let extended = _mm256_cvtepi16_epi64(a).as_i64x4();
    let fallback = src.as_i64x4();
    transmute(simd_select_bitmask(k, extended, fallback))
}

/// Sign-extends the packed 16-bit integers in `a` to packed 64-bit integers, storing the results
/// in `dst` under zeromask `k`; elements whose mask bit is clear are zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi16_epi64&expand=1397)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsxwq))]
pub unsafe fn _mm256_maskz_cvtepi16_epi64(k: __mmask8, a: __m128i) -> __m256i {
    let extended = _mm256_cvtepi16_epi64(a).as_i64x4();
    transmute(simd_select_bitmask(k, extended, _mm256_setzero_si256().as_i64x4()))
}

/// Sign-extends the packed 16-bit integers in `a` to packed 64-bit integers, storing the results
/// in `dst` under writemask `k`; elements whose mask bit is clear are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi16_epi64&expand=1393)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsxwq))]
pub unsafe fn _mm_mask_cvtepi16_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let extended = _mm_cvtepi16_epi64(a).as_i64x2();
    let fallback = src.as_i64x2();
    transmute(simd_select_bitmask(k, extended, fallback))
}

/// Sign-extends the packed 16-bit integers in `a` to packed 64-bit integers, storing the results
/// in `dst` under zeromask `k`; elements whose mask bit is clear are zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi16_epi64&expand=1394)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsxwq))]
pub unsafe fn _mm_maskz_cvtepi16_epi64(k: __mmask8, a: __m128i) -> __m128i {
    let extended = _mm_cvtepi16_epi64(a).as_i64x2();
    transmute(simd_select_bitmask(k, extended, _mm_setzero_si128().as_i64x2()))
}

/// Zero-extends the packed unsigned 16-bit integers in `a` to packed 32-bit integers, storing the
/// results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu16_epi32&expand=1553)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovzxwd))]
pub unsafe fn _mm512_cvtepu16_epi32(a: __m256i) -> __m512i {
    // The unsigned source lanes make the lane-wise cast a zero extension.
    let widened: i32x16 = simd_cast(a.as_u16x16());
    transmute(widened)
}

/// Zero-extends the packed unsigned 16-bit integers in `a` to packed 32-bit integers, storing the
/// results in `dst` under writemask `k`; elements whose mask bit is clear are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu16_epi32&expand=1554)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovzxwd))]
pub unsafe fn _mm512_mask_cvtepu16_epi32(src: __m512i, k: __mmask16, a: __m256i) -> __m512i {
    let extended = _mm512_cvtepu16_epi32(a).as_i32x16();
    let fallback = src.as_i32x16();
    transmute(simd_select_bitmask(k, extended, fallback))
}

/// Zero-extends the packed unsigned 16-bit integers in `a` to packed 32-bit integers, storing the
/// results in `dst` under zeromask `k`; elements whose mask bit is clear are zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu16_epi32&expand=1555)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovzxwd))]
pub unsafe fn _mm512_maskz_cvtepu16_epi32(k: __mmask16, a: __m256i) -> __m512i {
    let extended = _mm512_cvtepu16_epi32(a).as_i32x16();
    transmute(simd_select_bitmask(k, extended, _mm512_setzero_si512().as_i32x16()))
}

/// Zero-extends the packed unsigned 16-bit integers in `a` to packed 32-bit integers, storing the
/// results in `dst` under writemask `k`; elements whose mask bit is clear are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu16_epi32&expand=1551)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovzxwd))]
pub unsafe fn _mm256_mask_cvtepu16_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
    let extended = _mm256_cvtepu16_epi32(a).as_i32x8();
    let fallback = src.as_i32x8();
    transmute(simd_select_bitmask(k, extended, fallback))
}

/// Zero-extends the packed unsigned 16-bit integers in `a` to packed 32-bit integers, storing the
/// results in `dst` under zeromask `k`; elements whose mask bit is clear are zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu16_epi32&expand=1552)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovzxwd))]
pub unsafe fn _mm256_maskz_cvtepu16_epi32(k: __mmask8, a: __m128i) -> __m256i {
    let extended = _mm256_cvtepu16_epi32(a).as_i32x8();
    transmute(simd_select_bitmask(k, extended, _mm256_setzero_si256().as_i32x8()))
}

/// Zero-extends the packed unsigned 16-bit integers in `a` to packed 32-bit integers, storing the
/// results in `dst` under writemask `k`; elements whose mask bit is clear are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu16_epi32&expand=1548)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovzxwd))]
pub unsafe fn _mm_mask_cvtepu16_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let extended = _mm_cvtepu16_epi32(a).as_i32x4();
    let fallback = src.as_i32x4();
    transmute(simd_select_bitmask(k, extended, fallback))
}

/// Zero-extends the packed unsigned 16-bit integers in `a` to packed 32-bit integers, storing the
/// results in `dst` under zeromask `k`; elements whose mask bit is clear are zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu16_epi32&expand=1549)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovzxwd))]
pub unsafe fn _mm_maskz_cvtepu16_epi32(k: __mmask8, a: __m128i) -> __m128i {
    let extended = _mm_cvtepu16_epi32(a).as_i32x4();
    transmute(simd_select_bitmask(k, extended, _mm_setzero_si128().as_i32x4()))
}

/// Zero-extends the packed unsigned 16-bit integers in `a` to packed 64-bit integers, storing the
/// results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu16_epi64&expand=1562)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovzxwq))]
pub unsafe fn _mm512_cvtepu16_epi64(a: __m128i) -> __m512i {
    // The unsigned source lanes make the lane-wise cast a zero extension.
    let widened: i64x8 = simd_cast(a.as_u16x8());
    transmute(widened)
}

/// Zero-extends the packed unsigned 16-bit integers in `a` to packed 64-bit integers, storing the
/// results in `dst` under writemask `k`; elements whose mask bit is clear are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu16_epi64&expand=1563)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovzxwq))]
pub unsafe fn _mm512_mask_cvtepu16_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
    let extended = _mm512_cvtepu16_epi64(a).as_i64x8();
    let fallback = src.as_i64x8();
    transmute(simd_select_bitmask(k, extended, fallback))
}

/// Zero-extends the packed unsigned 16-bit integers in `a` to packed 64-bit integers, storing the
/// results in `dst` under zeromask `k`; elements whose mask bit is clear are zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu16_epi64&expand=1564)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovzxwq))]
pub unsafe fn _mm512_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m512i {
    let extended = _mm512_cvtepu16_epi64(a).as_i64x8();
    transmute(simd_select_bitmask(k, extended, _mm512_setzero_si512().as_i64x8()))
}

/// Zero-extends the packed unsigned 16-bit integers in the low 8 bytes of `a` to packed 64-bit
/// integers, storing the results in `dst` under writemask `k`; elements whose mask bit is clear
/// are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu16_epi64&expand=1560)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovzxwq))]
pub unsafe fn _mm256_mask_cvtepu16_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
    let extended = _mm256_cvtepu16_epi64(a).as_i64x4();
    let fallback = src.as_i64x4();
    transmute(simd_select_bitmask(k, extended, fallback))
}

/// Zero-extends the packed unsigned 16-bit integers in the low 8 bytes of `a` to packed 64-bit
/// integers, storing the results in `dst` under zeromask `k`; elements whose mask bit is clear
/// are zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu16_epi64&expand=1561)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovzxwq))]
pub unsafe fn _mm256_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m256i {
    let extended = _mm256_cvtepu16_epi64(a).as_i64x4();
    transmute(simd_select_bitmask(k, extended, _mm256_setzero_si256().as_i64x4()))
}

/// Zero-extends the packed unsigned 16-bit integers in the low 4 bytes of `a` to packed 64-bit
/// integers, storing the results in `dst` under writemask `k`; elements whose mask bit is clear
/// are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu16_epi64&expand=1557)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovzxwq))]
pub unsafe fn _mm_mask_cvtepu16_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let extended = _mm_cvtepu16_epi64(a).as_i64x2();
    let fallback = src.as_i64x2();
    transmute(simd_select_bitmask(k, extended, fallback))
}

/// Zero-extends the packed unsigned 16-bit integers in the low 4 bytes of `a` to packed 64-bit
/// integers, storing the results in `dst` under zeromask `k`; elements whose mask bit is clear
/// are zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu16_epi64&expand=1558)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovzxwq))]
pub unsafe fn _mm_maskz_cvtepu16_epi64(k: __mmask8, a: __m128i) -> __m128i {
    let extended = _mm_cvtepu16_epi64(a).as_i64x2();
    transmute(simd_select_bitmask(k, extended, _mm_setzero_si128().as_i64x2()))
}

/// Sign-extends the packed 32-bit integers in `a` to packed 64-bit integers, storing the results
/// in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32_epi64&expand=1428)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsxdq))]
pub unsafe fn _mm512_cvtepi32_epi64(a: __m256i) -> __m512i {
    // The signed source lanes make the lane-wise cast a sign extension.
    let widened: i64x8 = simd_cast(a.as_i32x8());
    transmute(widened)
}

/// Sign-extends the packed 32-bit integers in `a` to packed 64-bit integers, storing the results
/// in `dst` under writemask `k`; elements whose mask bit is clear are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_epi64&expand=1429)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsxdq))]
pub unsafe fn _mm512_mask_cvtepi32_epi64(src: __m512i, k: __mmask8, a: __m256i) -> __m512i {
    let extended = _mm512_cvtepi32_epi64(a).as_i64x8();
    let fallback = src.as_i64x8();
    transmute(simd_select_bitmask(k, extended, fallback))
}

/// Sign-extends the packed 32-bit integers in `a` to packed 64-bit integers, storing the results
/// in `dst` under zeromask `k`; elements whose mask bit is clear are zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi32_epi64&expand=1430)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsxdq))]
pub unsafe fn _mm512_maskz_cvtepi32_epi64(k: __mmask8, a: __m256i) -> __m512i {
    let extended = _mm512_cvtepi32_epi64(a).as_i64x8();
    transmute(simd_select_bitmask(k, extended, _mm512_setzero_si512().as_i64x8()))
}

/// Sign-extends the packed 32-bit integers in `a` to packed 64-bit integers, storing the results
/// in `dst` under writemask `k`; elements whose mask bit is clear are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_epi64&expand=1426)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsxdq))]
pub unsafe fn _mm256_mask_cvtepi32_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
    let extended = _mm256_cvtepi32_epi64(a).as_i64x4();
    let fallback = src.as_i64x4();
    transmute(simd_select_bitmask(k, extended, fallback))
}

/// Sign-extends the packed 32-bit integers in `a` to packed 64-bit integers, storing the results
/// in `dst` under zeromask `k`; elements whose mask bit is clear are zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi32_epi64&expand=1427)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsxdq))]
pub unsafe fn _mm256_maskz_cvtepi32_epi64(k: __mmask8, a: __m128i) -> __m256i {
    let extended = _mm256_cvtepi32_epi64(a).as_i64x4();
    transmute(simd_select_bitmask(k, extended, _mm256_setzero_si256().as_i64x4()))
}

/// Sign-extends the packed 32-bit integers in `a` to packed 64-bit integers, storing the results
/// in `dst` under writemask `k`; elements whose mask bit is clear are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_epi64&expand=1423)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsxdq))]
pub unsafe fn _mm_mask_cvtepi32_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let extended = _mm_cvtepi32_epi64(a).as_i64x2();
    let fallback = src.as_i64x2();
    transmute(simd_select_bitmask(k, extended, fallback))
}

/// Sign-extends the packed 32-bit integers in `a` to packed 64-bit integers, storing the results
/// in `dst` under zeromask `k`; elements whose mask bit is clear are zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi32_epi64&expand=1424)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsxdq))]
pub unsafe fn _mm_maskz_cvtepi32_epi64(k: __mmask8, a: __m128i) -> __m128i {
    let extended = _mm_cvtepi32_epi64(a).as_i64x2();
    transmute(simd_select_bitmask(k, extended, _mm_setzero_si128().as_i64x2()))
}

/// Zero-extends the packed unsigned 32-bit integers in `a` to packed 64-bit integers, storing the
/// results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu32_epi64&expand=1571)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovzxdq))]
pub unsafe fn _mm512_cvtepu32_epi64(a: __m256i) -> __m512i {
    // The unsigned source lanes make the lane-wise cast a zero extension.
    let widened: i64x8 = simd_cast(a.as_u32x8());
    transmute(widened)
}

/// Zero-extends the packed unsigned 32-bit integers in `a` to packed 64-bit integers, storing the
/// results in `dst` under writemask `k`; elements whose mask bit is clear are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu32_epi64&expand=1572)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovzxdq))]
pub unsafe fn _mm512_mask_cvtepu32_epi64(src: __m512i, k: __mmask8, a: __m256i) -> __m512i {
    let extended = _mm512_cvtepu32_epi64(a).as_i64x8();
    let fallback = src.as_i64x8();
    transmute(simd_select_bitmask(k, extended, fallback))
}

/// Zero-extends the packed unsigned 32-bit integers in `a` to packed 64-bit integers, storing the
/// results in `dst` under zeromask `k`; elements whose mask bit is clear are zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu32_epi64&expand=1573)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovzxdq))]
pub unsafe fn _mm512_maskz_cvtepu32_epi64(k: __mmask8, a: __m256i) -> __m512i {
    let extended = _mm512_cvtepu32_epi64(a).as_i64x8();
    transmute(simd_select_bitmask(k, extended, _mm512_setzero_si512().as_i64x8()))
}

/// Zero-extends the packed unsigned 32-bit integers in `a` to packed 64-bit integers, storing the
/// results in `dst` under writemask `k`; elements whose mask bit is clear are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu32_epi64&expand=1569)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovzxdq))]
pub unsafe fn _mm256_mask_cvtepu32_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
    let extended = _mm256_cvtepu32_epi64(a).as_i64x4();
    let fallback = src.as_i64x4();
    transmute(simd_select_bitmask(k, extended, fallback))
}

/// Zero-extends the packed unsigned 32-bit integers in `a` to packed 64-bit integers, storing the
/// results in `dst` under zeromask `k`; elements whose mask bit is clear are zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu32_epi64&expand=1570)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovzxdq))]
pub unsafe fn _mm256_maskz_cvtepu32_epi64(k: __mmask8, a: __m128i) -> __m256i {
    let extended = _mm256_cvtepu32_epi64(a).as_i64x4();
    transmute(simd_select_bitmask(k, extended, _mm256_setzero_si256().as_i64x4()))
}

/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu32_epi64&expand=1566)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovzxdq))]
pub unsafe fn _mm_mask_cvtepu32_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let convert = _mm_cvtepu32_epi64(a).as_i64x2();
    transmute(simd_select_bitmask(k, convert, src.as_i64x2()))
}

/// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu32_epi64&expand=1567)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovzxdq))]
pub unsafe fn _mm_maskz_cvtepu32_epi64(k: __mmask8, a: __m128i) -> __m128i {
    let convert = _mm_cvtepu32_epi64(a).as_i64x2();
    let zero = _mm_setzero_si128().as_i64x2();
    transmute(simd_select_bitmask(k, convert, zero))
}

/// Converts the packed signed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32_ps&expand=1455)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtdq2ps))]
pub unsafe fn _mm512_cvtepi32_ps(a: __m512i) -> __m512 {
    // Lane-wise i32 -> f32 cast over all 16 lanes.
    let as_floats: f32x16 = simd_cast(a.as_i32x16());
    transmute(as_floats)
}

/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_ps&expand=1456)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtdq2ps))]
pub unsafe fn _mm512_mask_cvtepi32_ps(src: __m512, k: __mmask16, a: __m512i) -> __m512 {
    let convert = _mm512_cvtepi32_ps(a).as_f32x16();
    transmute(simd_select_bitmask(k, convert, src.as_f32x16()))
}

/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi32_ps&expand=1457)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtdq2ps))]
pub unsafe fn _mm512_maskz_cvtepi32_ps(k: __mmask16, a: __m512i) -> __m512 {
    let convert = _mm512_cvtepi32_ps(a).as_f32x16();
    let zero = _mm512_setzero_ps().as_f32x16();
    transmute(simd_select_bitmask(k, convert, zero))
}

/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_ps&expand=1453)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ps))]
pub unsafe fn _mm256_mask_cvtepi32_ps(src: __m256, k: __mmask8, a: __m256i) -> __m256 {
    let convert = _mm256_cvtepi32_ps(a).as_f32x8();
    transmute(simd_select_bitmask(k, convert, src.as_f32x8()))
}

/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi32_ps&expand=1454)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ps))]
pub unsafe fn _mm256_maskz_cvtepi32_ps(k: __mmask8, a: __m256i) -> __m256 {
    let convert = _mm256_cvtepi32_ps(a).as_f32x8();
    let zero = _mm256_setzero_ps().as_f32x8();
    transmute(simd_select_bitmask(k, convert, zero))
}

/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_ps&expand=1450)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ps))]
pub unsafe fn _mm_mask_cvtepi32_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 {
    let convert = _mm_cvtepi32_ps(a).as_f32x4();
    transmute(simd_select_bitmask(k, convert, src.as_f32x4()))
}

/// Convert packed signed 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi32_ps&expand=1451)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2ps))]
pub unsafe fn _mm_maskz_cvtepi32_ps(k: __mmask8, a: __m128i) -> __m128 {
    let convert = _mm_cvtepi32_ps(a).as_f32x4();
    let zero = _mm_setzero_ps().as_f32x4();
    transmute(simd_select_bitmask(k, convert, zero))
}

/// Converts the packed signed 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32_pd&expand=1446)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtdq2pd))]
pub unsafe fn _mm512_cvtepi32_pd(a: __m256i) -> __m512d {
    // Lane-wise i32 -> f64 cast over the 8 input lanes.
    let as_doubles: f64x8 = simd_cast(a.as_i32x8());
    transmute(as_doubles)
}

/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_pd&expand=1447)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtdq2pd))]
pub unsafe fn _mm512_mask_cvtepi32_pd(src: __m512d, k: __mmask8, a: __m256i) -> __m512d {
    let convert = _mm512_cvtepi32_pd(a).as_f64x8();
    transmute(simd_select_bitmask(k, convert, src.as_f64x8()))
}

/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi32_pd&expand=1448)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtdq2pd))]
pub unsafe fn _mm512_maskz_cvtepi32_pd(k: __mmask8, a: __m256i) -> __m512d {
    let convert = _mm512_cvtepi32_pd(a).as_f64x8();
    let zero = _mm512_setzero_pd().as_f64x8();
    transmute(simd_select_bitmask(k, convert, zero))
}

/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_pd&expand=1444)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2pd))]
pub unsafe fn _mm256_mask_cvtepi32_pd(src: __m256d, k: __mmask8, a: __m128i) -> __m256d {
    let convert = _mm256_cvtepi32_pd(a).as_f64x4();
    transmute(simd_select_bitmask(k, convert, src.as_f64x4()))
}

/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi32_pd&expand=1445)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2pd))]
pub unsafe fn _mm256_maskz_cvtepi32_pd(k: __mmask8, a: __m128i) -> __m256d {
    let convert = _mm256_cvtepi32_pd(a).as_f64x4();
    let zero = _mm256_setzero_pd().as_f64x4();
    transmute(simd_select_bitmask(k, convert, zero))
}

/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_pd&expand=1441)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2pd))]
pub unsafe fn _mm_mask_cvtepi32_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m128d {
    let convert = _mm_cvtepi32_pd(a).as_f64x2();
    transmute(simd_select_bitmask(k, convert, src.as_f64x2()))
}

/// Convert packed signed 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi32_pd&expand=1442)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtdq2pd))]
pub unsafe fn _mm_maskz_cvtepi32_pd(k: __mmask8, a: __m128i) -> __m128d {
    let convert = _mm_cvtepi32_pd(a).as_f64x2();
    let zero = _mm_setzero_pd().as_f64x2();
    transmute(simd_select_bitmask(k, convert, zero))
}

/// Converts the packed unsigned 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu32_ps&expand=1583)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtudq2ps))]
pub unsafe fn _mm512_cvtepu32_ps(a: __m512i) -> __m512 {
    // Lane-wise u32 -> f32 cast over all 16 lanes.
    let as_floats: f32x16 = simd_cast(a.as_u32x16());
    transmute(as_floats)
}

/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu32_ps&expand=1584)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtudq2ps))]
pub unsafe fn _mm512_mask_cvtepu32_ps(src: __m512, k: __mmask16, a: __m512i) -> __m512 {
    let convert = _mm512_cvtepu32_ps(a).as_f32x16();
    transmute(simd_select_bitmask(k, convert, src.as_f32x16()))
}

/// Convert packed unsigned 32-bit integers in a to packed single-precision (32-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu32_ps&expand=1585)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtudq2ps))]
pub unsafe fn _mm512_maskz_cvtepu32_ps(k: __mmask16, a: __m512i) -> __m512 {
    let convert = _mm512_cvtepu32_ps(a).as_f32x16();
    let zero = _mm512_setzero_ps().as_f32x16();
    transmute(simd_select_bitmask(k, convert, zero))
}

/// Converts the packed unsigned 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu32_pd&expand=1580)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtudq2pd))]
pub unsafe fn _mm512_cvtepu32_pd(a: __m256i) -> __m512d {
    // Lane-wise u32 -> f64 cast over the 8 input lanes.
    let as_doubles: f64x8 = simd_cast(a.as_u32x8());
    transmute(as_doubles)
}

/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu32_pd&expand=1581)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtudq2pd))]
pub unsafe fn _mm512_mask_cvtepu32_pd(src: __m512d, k: __mmask8, a: __m256i) -> __m512d {
    let convert = _mm512_cvtepu32_pd(a).as_f64x8();
    transmute(simd_select_bitmask(k, convert, src.as_f64x8()))
}

/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepu32_pd&expand=1582)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtudq2pd))]
pub unsafe fn _mm512_maskz_cvtepu32_pd(k: __mmask8, a: __m256i) -> __m512d {
    let convert = _mm512_cvtepu32_pd(a).as_f64x8();
    let zero = _mm512_setzero_pd().as_f64x8();
    transmute(simd_select_bitmask(k, convert, zero))
}

/// Converts the packed unsigned 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepu32_pd&expand=1577)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2pd))]
pub unsafe fn _mm256_cvtepu32_pd(a: __m128i) -> __m256d {
    // Lane-wise u32 -> f64 cast over the 4 input lanes.
    let as_doubles: f64x4 = simd_cast(a.as_u32x4());
    transmute(as_doubles)
}

/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepu32_pd&expand=1578)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2pd))]
pub unsafe fn _mm256_mask_cvtepu32_pd(src: __m256d, k: __mmask8, a: __m128i) -> __m256d {
    let convert = _mm256_cvtepu32_pd(a).as_f64x4();
    transmute(simd_select_bitmask(k, convert, src.as_f64x4()))
}

/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepu32_pd&expand=1579)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2pd))]
pub unsafe fn _mm256_maskz_cvtepu32_pd(k: __mmask8, a: __m128i) -> __m256d {
    let convert = _mm256_cvtepu32_pd(a).as_f64x4();
    let zero = _mm256_setzero_pd().as_f64x4();
    transmute(simd_select_bitmask(k, convert, zero))
}

/// Converts packed unsigned 32-bit integers in `a` to packed double-precision (64-bit)
/// floating-point elements and returns the result. Only lanes 0 and 1 of `a` are read,
/// since the 128-bit result holds two doubles.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepu32_pd&expand=1574)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2pd))]
pub unsafe fn _mm_cvtepu32_pd(a: __m128i) -> __m128d {
    let lanes = a.as_u32x4();
    // Keep the two lowest 32-bit lanes; they are the only ones that get converted.
    let low_pair: u32x2 = simd_shuffle2!(lanes, lanes, [0, 1]);
    let as_doubles: f64x2 = simd_cast(low_pair);
    transmute(as_doubles)
}

/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepu32_pd&expand=1575)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2pd))]
pub unsafe fn _mm_mask_cvtepu32_pd(src: __m128d, k: __mmask8, a: __m128i) -> __m128d {
    let convert = _mm_cvtepu32_pd(a).as_f64x2();
    transmute(simd_select_bitmask(k, convert, src.as_f64x2()))
}

/// Convert packed unsigned 32-bit integers in a to packed double-precision (64-bit) floating-point elements, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepu32_pd&expand=1576)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtudq2pd))]
pub unsafe fn _mm_maskz_cvtepu32_pd(k: __mmask8, a: __m128i) -> __m128d {
    let convert = _mm_cvtepu32_pd(a).as_f64x2();
    let zero = _mm_setzero_pd().as_f64x2();
    transmute(simd_select_bitmask(k, convert, zero))
}

/// Converts, element by element, the lower half of the packed 32-bit integer elements in
/// `v2` to packed double-precision (64-bit) floating-point elements and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32lo_pd&expand=1464)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtdq2pd))]
pub unsafe fn _mm512_cvtepi32lo_pd(v2: __m512i) -> __m512d {
    let lanes = v2.as_i32x16();
    // Lanes 0..=7 form the lower 256-bit half of the input.
    let low_half: i32x8 = simd_shuffle8!(lanes, lanes, [0, 1, 2, 3, 4, 5, 6, 7]);
    let as_doubles: f64x8 = simd_cast(low_half);
    transmute(as_doubles)
}

/// Performs element-by-element conversion of the lower half of packed 32-bit integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32lo_pd&expand=1465)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtdq2pd))]
pub unsafe fn _mm512_mask_cvtepi32lo_pd(src: __m512d, k: __mmask8, v2: __m512i) -> __m512d {
    let convert = _mm512_cvtepi32lo_pd(v2).as_f64x8();
    transmute(simd_select_bitmask(k, convert, src.as_f64x8()))
}

/// Converts, element by element, the lower half of the packed 32-bit unsigned integer
/// elements in `v2` to packed double-precision (64-bit) floating-point elements and returns
/// the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepu32lo_pd&expand=1586)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtudq2pd))]
pub unsafe fn _mm512_cvtepu32lo_pd(v2: __m512i) -> __m512d {
    let lanes = v2.as_u32x16();
    // Lanes 0..=7 form the lower 256-bit half of the input.
    let low_half: u32x8 = simd_shuffle8!(lanes, lanes, [0, 1, 2, 3, 4, 5, 6, 7]);
    let as_doubles: f64x8 = simd_cast(low_half);
    transmute(as_doubles)
}

/// Performs element-by-element conversion of the lower half of 32-bit unsigned integer elements in v2 to packed double-precision (64-bit) floating-point elements, storing the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepu32lo_pd&expand=1587)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtudq2pd))]
pub unsafe fn _mm512_mask_cvtepu32lo_pd(src: __m512d, k: __mmask8, v2: __m512i) -> __m512d {
    let convert = _mm512_cvtepu32lo_pd(v2).as_f64x8();
    transmute(simd_select_bitmask(k, convert, src.as_f64x8()))
}

/// Narrows the packed 32-bit integers in `a` to packed 16-bit integers with truncation and
/// returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32_epi16&expand=1419)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovdw))]
pub unsafe fn _mm512_cvtepi32_epi16(a: __m512i) -> __m256i {
    // Lane-wise i32 -> i16 cast over all 16 lanes.
    let narrowed: i16x16 = simd_cast(a.as_i32x16());
    transmute(narrowed)
}

/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_epi16&expand=1420)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovdw))]
pub unsafe fn _mm512_mask_cvtepi32_epi16(src: __m256i, k: __mmask16, a: __m512i) -> __m256i {
    let convert = _mm512_cvtepi32_epi16(a).as_i16x16();
    transmute(simd_select_bitmask(k, convert, src.as_i16x16()))
}

/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi32_epi16&expand=1421)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovdw))]
pub unsafe fn _mm512_maskz_cvtepi32_epi16(k: __mmask16, a: __m512i) -> __m256i {
    let convert = _mm512_cvtepi32_epi16(a).as_i16x16();
    let zero = _mm256_setzero_si256().as_i16x16();
    transmute(simd_select_bitmask(k, convert, zero))
}

/// Narrows the packed 32-bit integers in `a` to packed 16-bit integers with truncation and
/// returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_epi16&expand=1416)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovdw))]
pub unsafe fn _mm256_cvtepi32_epi16(a: __m256i) -> __m128i {
    // Lane-wise i32 -> i16 cast over all 8 lanes.
    let narrowed: i16x8 = simd_cast(a.as_i32x8());
    transmute(narrowed)
}

/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_epi16&expand=1417)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovdw))]
pub unsafe fn _mm256_mask_cvtepi32_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
    let convert = _mm256_cvtepi32_epi16(a).as_i16x8();
    transmute(simd_select_bitmask(k, convert, src.as_i16x8()))
}

/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi32_epi16&expand=1418)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovdw))]
pub unsafe fn _mm256_maskz_cvtepi32_epi16(k: __mmask8, a: __m256i) -> __m128i {
    let convert = _mm256_cvtepi32_epi16(a).as_i16x8();
    let zero = _mm_setzero_si128().as_i16x8();
    transmute(simd_select_bitmask(k, convert, zero))
}

/// Convert packed 32-bit integers in a to packed 16-bit integers with truncation, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_epi16&expand=1413)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovdw))]
pub unsafe fn _mm_cvtepi32_epi16(a: __m128i) -> __m128i {
    transmute(vpmovdw128(
        a.as_i32x4(),
        _mm_setzero_si128().as_i16x8(),
        0b11111111,
    ))
}

/// Narrows the packed 32-bit integers in `a` to packed 16-bit integers with truncation under
/// writemask `k`: a result element is copied from `src` when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_epi16&expand=1414)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovdw))]
pub unsafe fn _mm_mask_cvtepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    // The masked builtin receives `src` and `k` directly.
    let wide = a.as_i32x4();
    let passthrough = src.as_i16x8();
    let narrowed = vpmovdw128(wide, passthrough, k);
    transmute(narrowed)
}

/// Narrows the packed 32-bit integers in `a` to packed 16-bit integers with truncation under
/// zeromask `k`: a result element is zeroed when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi32_epi16&expand=1415)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovdw))]
pub unsafe fn _mm_maskz_cvtepi32_epi16(k: __mmask8, a: __m128i) -> __m128i {
    // Zero-masking: the masked builtin is given an all-zero source vector.
    let wide = a.as_i32x4();
    let zeroed = _mm_setzero_si128().as_i16x8();
    let narrowed = vpmovdw128(wide, zeroed, k);
    transmute(narrowed)
}

/// Narrows the packed 32-bit integers in `a` to packed 8-bit integers with truncation and
/// returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi32_epi8&expand=1437)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovdb))]
pub unsafe fn _mm512_cvtepi32_epi8(a: __m512i) -> __m128i {
    // Lane-wise i32 -> i8 cast over all 16 lanes.
    let narrowed: i8x16 = simd_cast(a.as_i32x16());
    transmute(narrowed)
}

/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_epi8&expand=1438)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovdb))]
pub unsafe fn _mm512_mask_cvtepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) -> __m128i {
    let convert = _mm512_cvtepi32_epi8(a).as_i8x16();
    transmute(simd_select_bitmask(k, convert, src.as_i8x16()))
}

/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi32_epi8&expand=1439)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovdb))]
pub unsafe fn _mm512_maskz_cvtepi32_epi8(k: __mmask16, a: __m512i) -> __m128i {
    let convert = _mm512_cvtepi32_epi8(a).as_i8x16();
    let zero = _mm_setzero_si128().as_i8x16();
    transmute(simd_select_bitmask(k, convert, zero))
}

/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi32_epi8&expand=1434)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovdb))]
pub unsafe fn _mm256_cvtepi32_epi8(a: __m256i) -> __m128i {
    transmute(vpmovdb256(
        a.as_i32x8(),
        _mm_setzero_si128().as_i8x16(),
        0b11111111,
    ))
}

/// Narrows the packed 32-bit integers in `a` to packed 8-bit integers with truncation under
/// writemask `k`: a result element is copied from `src` when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_epi8&expand=1435)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovdb))]
pub unsafe fn _mm256_mask_cvtepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
    // The masked builtin receives `src` and `k` directly.
    let wide = a.as_i32x8();
    let passthrough = src.as_i8x16();
    let narrowed = vpmovdb256(wide, passthrough, k);
    transmute(narrowed)
}

/// Narrows the packed 32-bit integers in `a` to packed 8-bit integers with truncation under
/// zeromask `k`: a result element is zeroed when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi32_epi8&expand=1436)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovdb))]
pub unsafe fn _mm256_maskz_cvtepi32_epi8(k: __mmask8, a: __m256i) -> __m128i {
    // Zero-masking: the masked builtin is given an all-zero source vector.
    let wide = a.as_i32x8();
    let zeroed = _mm_setzero_si128().as_i8x16();
    let narrowed = vpmovdb256(wide, zeroed, k);
    transmute(narrowed)
}

/// Convert packed 32-bit integers in a to packed 8-bit integers with truncation, and store the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_epi8&expand=1431)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovdb))]
pub unsafe fn _mm_cvtepi32_epi8(a: __m128i) -> __m128i {
    transmute(vpmovdb128(
        a.as_i32x4(),
        _mm_setzero_si128().as_i8x16(),
        0b11111111,
    ))
}

/// Narrows the packed 32-bit integers in `a` to packed 8-bit integers with truncation under
/// writemask `k`: a result element is copied from `src` when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_epi8&expand=1432)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovdb))]
pub unsafe fn _mm_mask_cvtepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    // The masked builtin receives `src` and `k` directly.
    let wide = a.as_i32x4();
    let passthrough = src.as_i8x16();
    let narrowed = vpmovdb128(wide, passthrough, k);
    transmute(narrowed)
}

/// Narrows the packed 32-bit integers in `a` to packed 8-bit integers with truncation under
/// zeromask `k`: a result element is zeroed when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi32_epi8&expand=1433)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovdb))]
pub unsafe fn _mm_maskz_cvtepi32_epi8(k: __mmask8, a: __m128i) -> __m128i {
    // Zero-masking: the masked builtin is given an all-zero source vector.
    let wide = a.as_i32x4();
    let zeroed = _mm_setzero_si128().as_i8x16();
    let narrowed = vpmovdb128(wide, zeroed, k);
    transmute(narrowed)
}

/// Narrows the packed 64-bit integers in `a` to packed 32-bit integers with truncation and
/// returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi64_epi32&expand=1481)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovqd))]
pub unsafe fn _mm512_cvtepi64_epi32(a: __m512i) -> __m256i {
    // Lane-wise i64 -> i32 cast over all 8 lanes.
    let narrowed: i32x8 = simd_cast(a.as_i64x8());
    transmute(narrowed)
}

/// Truncates the packed 64-bit integers in `a` to packed 32-bit integers, storing the results in
/// `dst` under writemask `k`; elements whose mask bit is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_epi32&expand=1482)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovqd))]
pub unsafe fn _mm512_mask_cvtepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) -> __m256i {
    // Convert every lane first, then pick per lane between the converted value and `src`.
    let narrowed = _mm512_cvtepi64_epi32(a).as_i32x8();
    let fallback = src.as_i32x8();
    transmute(simd_select_bitmask(k, narrowed, fallback))
}

/// Truncates the packed 64-bit integers in `a` to packed 32-bit integers, storing the results in
/// `dst` under zeromask `k`; elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi64_epi32&expand=1483)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovqd))]
pub unsafe fn _mm512_maskz_cvtepi64_epi32(k: __mmask8, a: __m512i) -> __m256i {
    // Convert every lane first, then pick per lane between the converted value and zero.
    let narrowed = _mm512_cvtepi64_epi32(a).as_i32x8();
    let zeroed = _mm256_setzero_si256().as_i32x8();
    transmute(simd_select_bitmask(k, narrowed, zeroed))
}

/// Truncates the packed 64-bit integers in `a` to packed 32-bit integers and stores the results
/// in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi64_epi32&expand=1478)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovqd))]
pub unsafe fn _mm256_cvtepi64_epi32(a: __m256i) -> __m128i {
    // Lane-wise cast from i64x4 down to i32x4.
    let narrowed: i32x4 = simd_cast(a.as_i64x4());
    transmute(narrowed)
}

/// Truncates the packed 64-bit integers in `a` to packed 32-bit integers, storing the results in
/// `dst` under writemask `k`; elements whose mask bit is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi64_epi32&expand=1479)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovqd))]
pub unsafe fn _mm256_mask_cvtepi64_epi32(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
    // Convert every lane first, then pick per lane between the converted value and `src`.
    let narrowed = _mm256_cvtepi64_epi32(a).as_i32x4();
    let fallback = src.as_i32x4();
    transmute(simd_select_bitmask(k, narrowed, fallback))
}

/// Truncates the packed 64-bit integers in `a` to packed 32-bit integers, storing the results in
/// `dst` under zeromask `k`; elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi64_epi32&expand=1480)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovqd))]
pub unsafe fn _mm256_maskz_cvtepi64_epi32(k: __mmask8, a: __m256i) -> __m128i {
    // Convert every lane first, then pick per lane between the converted value and zero.
    let narrowed = _mm256_cvtepi64_epi32(a).as_i32x4();
    let zeroed = _mm_setzero_si128().as_i32x4();
    transmute(simd_select_bitmask(k, narrowed, zeroed))
}

/// Truncates the packed 64-bit integers in `a` to packed 32-bit integers and stores the results
/// in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi64_epi32&expand=1475)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovqd))]
pub unsafe fn _mm_cvtepi64_epi32(a: __m128i) -> __m128i {
    // Unmasked form: all-zero passthrough operand and a mask with all eight bits set.
    let wide = a.as_i64x2();
    let passthrough = _mm_setzero_si128().as_i32x4();
    let all_bits = 0b11111111;
    transmute(vpmovqd128(wide, passthrough, all_bits))
}

/// Truncates the packed 64-bit integers in `a` to packed 32-bit integers, storing the results in
/// `dst` under writemask `k`; elements whose mask bit is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi64_epi32&expand=1476)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovqd))]
pub unsafe fn _mm_mask_cvtepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let wide = a.as_i64x2();
    let passthrough = src.as_i32x4();
    transmute(vpmovqd128(wide, passthrough, k))
}

/// Truncates the packed 64-bit integers in `a` to packed 32-bit integers, storing the results in
/// `dst` under zeromask `k`; elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi64_epi32&expand=1477)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovqd))]
pub unsafe fn _mm_maskz_cvtepi64_epi32(k: __mmask8, a: __m128i) -> __m128i {
    let wide = a.as_i64x2();
    let zeroed = _mm_setzero_si128().as_i32x4();
    transmute(vpmovqd128(wide, zeroed, k))
}

/// Truncates the packed 64-bit integers in `a` to packed 16-bit integers and stores the results
/// in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi64_epi16&expand=1472)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovqw))]
pub unsafe fn _mm512_cvtepi64_epi16(a: __m512i) -> __m128i {
    // Lane-wise cast from i64x8 down to i16x8.
    let narrowed: i16x8 = simd_cast(a.as_i64x8());
    transmute(narrowed)
}

/// Truncates the packed 64-bit integers in `a` to packed 16-bit integers, storing the results in
/// `dst` under writemask `k`; elements whose mask bit is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_epi16&expand=1473)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovqw))]
pub unsafe fn _mm512_mask_cvtepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) -> __m128i {
    // Convert every lane first, then pick per lane between the converted value and `src`.
    let narrowed = _mm512_cvtepi64_epi16(a).as_i16x8();
    let fallback = src.as_i16x8();
    transmute(simd_select_bitmask(k, narrowed, fallback))
}

/// Truncates the packed 64-bit integers in `a` to packed 16-bit integers, storing the results in
/// `dst` under zeromask `k`; elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi64_epi16&expand=1474)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovqw))]
pub unsafe fn _mm512_maskz_cvtepi64_epi16(k: __mmask8, a: __m512i) -> __m128i {
    // Convert every lane first, then pick per lane between the converted value and zero.
    let narrowed = _mm512_cvtepi64_epi16(a).as_i16x8();
    let zeroed = _mm_setzero_si128().as_i16x8();
    transmute(simd_select_bitmask(k, narrowed, zeroed))
}

/// Truncates the packed 64-bit integers in `a` to packed 16-bit integers and stores the results
/// in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi64_epi16&expand=1469)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovqw))]
pub unsafe fn _mm256_cvtepi64_epi16(a: __m256i) -> __m128i {
    // Unmasked form: all-zero passthrough operand and a mask with all eight bits set.
    let wide = a.as_i64x4();
    let passthrough = _mm_setzero_si128().as_i16x8();
    let all_bits = 0b11111111;
    transmute(vpmovqw256(wide, passthrough, all_bits))
}

/// Truncates the packed 64-bit integers in `a` to packed 16-bit integers, storing the results in
/// `dst` under writemask `k`; elements whose mask bit is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi64_epi16&expand=1470)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovqw))]
pub unsafe fn _mm256_mask_cvtepi64_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
    let wide = a.as_i64x4();
    let passthrough = src.as_i16x8();
    transmute(vpmovqw256(wide, passthrough, k))
}

/// Truncates the packed 64-bit integers in `a` to packed 16-bit integers, storing the results in
/// `dst` under zeromask `k`; elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi64_epi16&expand=1471)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovqw))]
pub unsafe fn _mm256_maskz_cvtepi64_epi16(k: __mmask8, a: __m256i) -> __m128i {
    let wide = a.as_i64x4();
    let zeroed = _mm_setzero_si128().as_i16x8();
    transmute(vpmovqw256(wide, zeroed, k))
}

/// Truncates the packed 64-bit integers in `a` to packed 16-bit integers and stores the results
/// in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi64_epi16&expand=1466)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovqw))]
pub unsafe fn _mm_cvtepi64_epi16(a: __m128i) -> __m128i {
    // Unmasked form: all-zero passthrough operand and a mask with all eight bits set.
    let wide = a.as_i64x2();
    let passthrough = _mm_setzero_si128().as_i16x8();
    let all_bits = 0b11111111;
    transmute(vpmovqw128(wide, passthrough, all_bits))
}

/// Truncates the packed 64-bit integers in `a` to packed 16-bit integers, storing the results in
/// `dst` under writemask `k`; elements whose mask bit is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi64_epi16&expand=1467)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovqw))]
pub unsafe fn _mm_mask_cvtepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let wide = a.as_i64x2();
    let passthrough = src.as_i16x8();
    transmute(vpmovqw128(wide, passthrough, k))
}

/// Truncates the packed 64-bit integers in `a` to packed 16-bit integers, storing the results in
/// `dst` under zeromask `k`; elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi64_epi16&expand=1468)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovqw))]
pub unsafe fn _mm_maskz_cvtepi64_epi16(k: __mmask8, a: __m128i) -> __m128i {
    let wide = a.as_i64x2();
    let zeroed = _mm_setzero_si128().as_i16x8();
    transmute(vpmovqw128(wide, zeroed, k))
}

/// Truncates the packed 64-bit integers in `a` to packed 8-bit integers and stores the results
/// in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtepi64_epi8&expand=1490)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovqb))]
pub unsafe fn _mm512_cvtepi64_epi8(a: __m512i) -> __m128i {
    // Unmasked form: all-zero passthrough operand and a mask with all eight bits set.
    let wide = a.as_i64x8();
    let passthrough = _mm_setzero_si128().as_i8x16();
    let all_bits = 0b11111111;
    transmute(vpmovqb(wide, passthrough, all_bits))
}

/// Truncates the packed 64-bit integers in `a` to packed 8-bit integers, storing the results in
/// `dst` under writemask `k`; elements whose mask bit is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_epi8&expand=1491)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovqb))]
pub unsafe fn _mm512_mask_cvtepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) -> __m128i {
    let wide = a.as_i64x8();
    let passthrough = src.as_i8x16();
    transmute(vpmovqb(wide, passthrough, k))
}

/// Truncates the packed 64-bit integers in `a` to packed 8-bit integers, storing the results in
/// `dst` under zeromask `k`; elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtepi64_epi8&expand=1492)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovqb))]
pub unsafe fn _mm512_maskz_cvtepi64_epi8(k: __mmask8, a: __m512i) -> __m128i {
    let wide = a.as_i64x8();
    let zeroed = _mm_setzero_si128().as_i8x16();
    transmute(vpmovqb(wide, zeroed, k))
}

/// Truncates the packed 64-bit integers in `a` to packed 8-bit integers and stores the results
/// in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtepi64_epi8&expand=1487)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovqb))]
pub unsafe fn _mm256_cvtepi64_epi8(a: __m256i) -> __m128i {
    // Unmasked form: all-zero passthrough operand and a mask with all eight bits set.
    let wide = a.as_i64x4();
    let passthrough = _mm_setzero_si128().as_i8x16();
    let all_bits = 0b11111111;
    transmute(vpmovqb256(wide, passthrough, all_bits))
}

/// Truncates the packed 64-bit integers in `a` to packed 8-bit integers, storing the results in
/// `dst` under writemask `k`; elements whose mask bit is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi64_epi8&expand=1488)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovqb))]
pub unsafe fn _mm256_mask_cvtepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
    let wide = a.as_i64x4();
    let passthrough = src.as_i8x16();
    transmute(vpmovqb256(wide, passthrough, k))
}

/// Truncates the packed 64-bit integers in `a` to packed 8-bit integers, storing the results in
/// `dst` under zeromask `k`; elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtepi64_epi8&expand=1489)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovqb))]
pub unsafe fn _mm256_maskz_cvtepi64_epi8(k: __mmask8, a: __m256i) -> __m128i {
    let wide = a.as_i64x4();
    let zeroed = _mm_setzero_si128().as_i8x16();
    transmute(vpmovqb256(wide, zeroed, k))
}

/// Truncates the packed 64-bit integers in `a` to packed 8-bit integers and stores the results
/// in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi64_epi8&expand=1484)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovqb))]
pub unsafe fn _mm_cvtepi64_epi8(a: __m128i) -> __m128i {
    // Unmasked form: all-zero passthrough operand and a mask with all eight bits set.
    let wide = a.as_i64x2();
    let passthrough = _mm_setzero_si128().as_i8x16();
    let all_bits = 0b11111111;
    transmute(vpmovqb128(wide, passthrough, all_bits))
}

/// Truncates the packed 64-bit integers in `a` to packed 8-bit integers, storing the results in
/// `dst` under writemask `k`; elements whose mask bit is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi64_epi8&expand=1485)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovqb))]
pub unsafe fn _mm_mask_cvtepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let wide = a.as_i64x2();
    let passthrough = src.as_i8x16();
    transmute(vpmovqb128(wide, passthrough, k))
}

/// Truncates the packed 64-bit integers in `a` to packed 8-bit integers, storing the results in
/// `dst` under zeromask `k`; elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtepi64_epi8&expand=1486)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovqb))]
pub unsafe fn _mm_maskz_cvtepi64_epi8(k: __mmask8, a: __m128i) -> __m128i {
    let wide = a.as_i64x2();
    let zeroed = _mm_setzero_si128().as_i8x16();
    transmute(vpmovqb128(wide, zeroed, k))
}

/// Narrows the packed signed 32-bit integers in `a` to packed 16-bit integers with signed
/// saturation and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsepi32_epi16&expand=1819)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsdw))]
pub unsafe fn _mm512_cvtsepi32_epi16(a: __m512i) -> __m256i {
    // Unmasked form: all-zero passthrough operand and a mask with all sixteen bits set.
    let wide = a.as_i32x16();
    let passthrough = _mm256_setzero_si256().as_i16x16();
    let all_bits = 0b11111111_11111111;
    transmute(vpmovsdw(wide, passthrough, all_bits))
}

/// Narrows the packed signed 32-bit integers in `a` to packed 16-bit integers with signed
/// saturation, storing the results in `dst` under writemask `k`; elements whose mask bit is not
/// set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi32_epi16&expand=1820)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsdw))]
pub unsafe fn _mm512_mask_cvtsepi32_epi16(src: __m256i, k: __mmask16, a: __m512i) -> __m256i {
    let wide = a.as_i32x16();
    let passthrough = src.as_i16x16();
    transmute(vpmovsdw(wide, passthrough, k))
}

/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtsepi32_epi16&expand=1821)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsdw))]
pub unsafe fn _mm512_maskz_cvtsepi32_epi16(k: __mmask16, a: __m512i) -> __m256i {
    // Zeromask form: the passthrough operand is all zeros, and `k` is forwarded as the mask.
    let wide = a.as_i32x16();
    let zeroed = _mm256_setzero_si256().as_i16x16();
    transmute(vpmovsdw(wide, zeroed, k))
}

/// Narrows the packed signed 32-bit integers in `a` to packed 16-bit integers with signed
/// saturation and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsepi32_epi16&expand=1816)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsdw))]
pub unsafe fn _mm256_cvtsepi32_epi16(a: __m256i) -> __m128i {
    // Unmasked form: all-zero passthrough operand and a mask with all eight bits set.
    let wide = a.as_i32x8();
    let passthrough = _mm_setzero_si128().as_i16x8();
    let all_bits = 0b11111111;
    transmute(vpmovsdw256(wide, passthrough, all_bits))
}

/// Narrows the packed signed 32-bit integers in `a` to packed 16-bit integers with signed
/// saturation, storing the results in `dst` under writemask `k`; elements whose mask bit is not
/// set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi32_epi16&expand=1817)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsdw))]
pub unsafe fn _mm256_mask_cvtsepi32_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
    let wide = a.as_i32x8();
    let passthrough = src.as_i16x8();
    transmute(vpmovsdw256(wide, passthrough, k))
}

/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtsepi32_epi16&expand=1818)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsdw))]
pub unsafe fn _mm256_maskz_cvtsepi32_epi16(k: __mmask8, a: __m256i) -> __m128i {
    // Zeromask form: the passthrough operand is all zeros, and `k` is forwarded as the mask.
    let wide = a.as_i32x8();
    let zeroed = _mm_setzero_si128().as_i16x8();
    transmute(vpmovsdw256(wide, zeroed, k))
}

/// Narrows the packed signed 32-bit integers in `a` to packed 16-bit integers with signed
/// saturation and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsepi32_epi16&expand=1813)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsdw))]
pub unsafe fn _mm_cvtsepi32_epi16(a: __m128i) -> __m128i {
    // Unmasked form: all-zero passthrough operand and a mask with all eight bits set.
    let wide = a.as_i32x4();
    let passthrough = _mm_setzero_si128().as_i16x8();
    let all_bits = 0b11111111;
    transmute(vpmovsdw128(wide, passthrough, all_bits))
}

/// Narrows the packed signed 32-bit integers in `a` to packed 16-bit integers with signed
/// saturation, storing the results in `dst` under writemask `k`; elements whose mask bit is not
/// set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi32_epi16&expand=1814)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsdw))]
pub unsafe fn _mm_mask_cvtsepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let wide = a.as_i32x4();
    let passthrough = src.as_i16x8();
    transmute(vpmovsdw128(wide, passthrough, k))
}

/// Convert packed signed 32-bit integers in a to packed 16-bit integers with signed saturation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsepi32_epi16&expand=1815)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsdw))]
pub unsafe fn _mm_maskz_cvtsepi32_epi16(k: __mmask8, a: __m128i) -> __m128i {
    // Zeromask form: the passthrough operand is all zeros, and `k` is forwarded as the mask.
    let wide = a.as_i32x4();
    let zeroed = _mm_setzero_si128().as_i16x8();
    transmute(vpmovsdw128(wide, zeroed, k))
}

/// Narrows the packed signed 32-bit integers in `a` to packed 8-bit integers with signed
/// saturation and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsepi32_epi8&expand=1828)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsdb))]
pub unsafe fn _mm512_cvtsepi32_epi8(a: __m512i) -> __m128i {
    // Unmasked form: all-zero passthrough operand and a mask with all sixteen bits set.
    let wide = a.as_i32x16();
    let passthrough = _mm_setzero_si128().as_i8x16();
    let all_bits = 0b11111111_11111111;
    transmute(vpmovsdb(wide, passthrough, all_bits))
}

/// Narrows the packed signed 32-bit integers in `a` to packed 8-bit integers with signed
/// saturation, storing the results in `dst` under writemask `k`; elements whose mask bit is not
/// set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi32_epi8&expand=1829)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsdb))]
pub unsafe fn _mm512_mask_cvtsepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) -> __m128i {
    let wide = a.as_i32x16();
    let passthrough = src.as_i8x16();
    transmute(vpmovsdb(wide, passthrough, k))
}

/// Narrows the packed signed 32-bit integers in `a` to packed 8-bit integers with signed
/// saturation, storing the results in `dst` under zeromask `k`; elements whose mask bit is not
/// set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtsepi32_epi8&expand=1830)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsdb))]
pub unsafe fn _mm512_maskz_cvtsepi32_epi8(k: __mmask16, a: __m512i) -> __m128i {
    let wide = a.as_i32x16();
    let zeroed = _mm_setzero_si128().as_i8x16();
    transmute(vpmovsdb(wide, zeroed, k))
}

/// Narrows the packed signed 32-bit integers in `a` to packed 8-bit integers with signed
/// saturation and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsepi32_epi8&expand=1825)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsdb))]
pub unsafe fn _mm256_cvtsepi32_epi8(a: __m256i) -> __m128i {
    // Unmasked form: all-zero passthrough operand and a mask with all eight bits set.
    let wide = a.as_i32x8();
    let passthrough = _mm_setzero_si128().as_i8x16();
    let all_bits = 0b11111111;
    transmute(vpmovsdb256(wide, passthrough, all_bits))
}

/// Narrows the packed signed 32-bit integers in `a` to packed 8-bit integers with signed
/// saturation, storing the results in `dst` under writemask `k`; elements whose mask bit is not
/// set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi32_epi8&expand=1826)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsdb))]
pub unsafe fn _mm256_mask_cvtsepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
    let wide = a.as_i32x8();
    let passthrough = src.as_i8x16();
    transmute(vpmovsdb256(wide, passthrough, k))
}

/// Narrows the packed signed 32-bit integers in `a` to packed 8-bit integers with signed
/// saturation, storing the results in `dst` under zeromask `k`; elements whose mask bit is not
/// set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtsepi32_epi8&expand=1827)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsdb))]
pub unsafe fn _mm256_maskz_cvtsepi32_epi8(k: __mmask8, a: __m256i) -> __m128i {
    let wide = a.as_i32x8();
    let zeroed = _mm_setzero_si128().as_i8x16();
    transmute(vpmovsdb256(wide, zeroed, k))
}

/// Narrows the packed signed 32-bit integers in `a` to packed 8-bit integers with signed
/// saturation and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsepi32_epi8&expand=1822)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsdb))]
pub unsafe fn _mm_cvtsepi32_epi8(a: __m128i) -> __m128i {
    // Unmasked form: all-zero passthrough operand and a mask with all eight bits set.
    let wide = a.as_i32x4();
    let passthrough = _mm_setzero_si128().as_i8x16();
    let all_bits = 0b11111111;
    transmute(vpmovsdb128(wide, passthrough, all_bits))
}

/// Narrows the packed signed 32-bit integers in `a` to packed 8-bit integers with signed
/// saturation, storing the results in `dst` under writemask `k`; elements whose mask bit is not
/// set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi32_epi8&expand=1823)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsdb))]
pub unsafe fn _mm_mask_cvtsepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let wide = a.as_i32x4();
    let passthrough = src.as_i8x16();
    transmute(vpmovsdb128(wide, passthrough, k))
}

/// Narrows the packed signed 32-bit integers in `a` to packed 8-bit integers with signed
/// saturation, storing the results in `dst` under zeromask `k`; elements whose mask bit is not
/// set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsepi32_epi8&expand=1824)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsdb))]
pub unsafe fn _mm_maskz_cvtsepi32_epi8(k: __mmask8, a: __m128i) -> __m128i {
    let wide = a.as_i32x4();
    let zeroed = _mm_setzero_si128().as_i8x16();
    transmute(vpmovsdb128(wide, zeroed, k))
}

/// Narrows the packed signed 64-bit integers in `a` to packed 32-bit integers with signed
/// saturation and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsepi64_epi32&expand=1852)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsqd))]
pub unsafe fn _mm512_cvtsepi64_epi32(a: __m512i) -> __m256i {
    // Unmasked form: all-zero passthrough operand and a mask with all eight bits set.
    let wide = a.as_i64x8();
    let passthrough = _mm256_setzero_si256().as_i32x8();
    let all_bits = 0b11111111;
    transmute(vpmovsqd(wide, passthrough, all_bits))
}

/// Narrows the packed signed 64-bit integers in `a` to packed 32-bit integers with signed
/// saturation, storing the results in `dst` under writemask `k`; elements whose mask bit is not
/// set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi64_epi32&expand=1853)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsqd))]
pub unsafe fn _mm512_mask_cvtsepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) -> __m256i {
    let wide = a.as_i64x8();
    let passthrough = src.as_i32x8();
    transmute(vpmovsqd(wide, passthrough, k))
}

/// Narrows the packed signed 64-bit integers in `a` to packed 32-bit integers with signed
/// saturation, storing the results in `dst` under zeromask `k`; elements whose mask bit is not
/// set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtsepi64_epi32&expand=1854)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsqd))]
pub unsafe fn _mm512_maskz_cvtsepi64_epi32(k: __mmask8, a: __m512i) -> __m256i {
    let wide = a.as_i64x8();
    let zeroed = _mm256_setzero_si256().as_i32x8();
    transmute(vpmovsqd(wide, zeroed, k))
}

/// Narrows the packed signed 64-bit integers in `a` to packed 32-bit integers with signed
/// saturation and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsepi64_epi32&expand=1849)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsqd))]
pub unsafe fn _mm256_cvtsepi64_epi32(a: __m256i) -> __m128i {
    // Unmasked form: all-zero passthrough operand and a mask with all eight bits set.
    let wide = a.as_i64x4();
    let passthrough = _mm_setzero_si128().as_i32x4();
    let all_bits = 0b11111111;
    transmute(vpmovsqd256(wide, passthrough, all_bits))
}

/// Narrows the packed signed 64-bit integers in `a` to packed 32-bit integers with signed
/// saturation, storing the results in `dst` under writemask `k`; elements whose mask bit is not
/// set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi64_epi32&expand=1850)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsqd))]
pub unsafe fn _mm256_mask_cvtsepi64_epi32(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
    let wide = a.as_i64x4();
    let passthrough = src.as_i32x4();
    transmute(vpmovsqd256(wide, passthrough, k))
}

/// Narrows the packed signed 64-bit integers in `a` to packed 32-bit integers with signed
/// saturation, storing the results in `dst` under zeromask `k`; elements whose mask bit is not
/// set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtsepi64_epi32&expand=1851)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsqd))]
pub unsafe fn _mm256_maskz_cvtsepi64_epi32(k: __mmask8, a: __m256i) -> __m128i {
    let wide = a.as_i64x4();
    let zeroed = _mm_setzero_si128().as_i32x4();
    transmute(vpmovsqd256(wide, zeroed, k))
}

/// Narrows the packed signed 64-bit integers in `a` to packed 32-bit integers with signed
/// saturation and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsepi64_epi32&expand=1846)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsqd))]
pub unsafe fn _mm_cvtsepi64_epi32(a: __m128i) -> __m128i {
    // Unmasked form: all-zero passthrough operand and a mask with all eight bits set.
    let wide = a.as_i64x2();
    let passthrough = _mm_setzero_si128().as_i32x4();
    let all_bits = 0b11111111;
    transmute(vpmovsqd128(wide, passthrough, all_bits))
}

/// Narrows the packed signed 64-bit integers in `a` to packed 32-bit integers with signed
/// saturation, storing the results in `dst` under writemask `k`; elements whose mask bit is not
/// set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi64_epi32&expand=1847)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsqd))]
pub unsafe fn _mm_mask_cvtsepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let wide = a.as_i64x2();
    let passthrough = src.as_i32x4();
    transmute(vpmovsqd128(wide, passthrough, k))
}

/// Narrows the packed signed 64-bit integers in `a` to packed 32-bit integers with signed
/// saturation, storing the results in `dst` under zeromask `k`; elements whose mask bit is not
/// set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsepi64_epi32&expand=1848)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsqd))]
pub unsafe fn _mm_maskz_cvtsepi64_epi32(k: __mmask8, a: __m128i) -> __m128i {
    let wide = a.as_i64x2();
    let zeroed = _mm_setzero_si128().as_i32x4();
    transmute(vpmovsqd128(wide, zeroed, k))
}

/// Narrows the packed signed 64-bit integers in `a` to packed 16-bit integers with signed
/// saturation and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsepi64_epi16&expand=1843)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsqw))]
pub unsafe fn _mm512_cvtsepi64_epi16(a: __m512i) -> __m128i {
    // Unmasked form: all-zero passthrough operand and a mask with all eight bits set.
    let wide = a.as_i64x8();
    let passthrough = _mm_setzero_si128().as_i16x8();
    let all_bits = 0b11111111;
    transmute(vpmovsqw(wide, passthrough, all_bits))
}

/// Narrows the packed signed 64-bit integers in `a` to packed 16-bit integers with signed
/// saturation, storing the results in `dst` under writemask `k`; elements whose mask bit is not
/// set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi64_epi16&expand=1844)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsqw))]
pub unsafe fn _mm512_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) -> __m128i {
    let wide = a.as_i64x8();
    let passthrough = src.as_i16x8();
    transmute(vpmovsqw(wide, passthrough, k))
}

/// Narrows the packed signed 64-bit integers in `a` to packed 16-bit integers with signed
/// saturation, storing the results in `dst` under zeromask `k`; elements whose mask bit is not
/// set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtsepi64_epi16&expand=1845)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsqw))]
pub unsafe fn _mm512_maskz_cvtsepi64_epi16(k: __mmask8, a: __m512i) -> __m128i {
    let wide = a.as_i64x8();
    let zeroed = _mm_setzero_si128().as_i16x8();
    transmute(vpmovsqw(wide, zeroed, k))
}

/// Converts the packed signed 64-bit integers in `a` into packed 16-bit integers using signed
/// saturation, storing the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsepi64_epi16&expand=1840)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsqw))]
pub unsafe fn _mm256_cvtsepi64_epi16(a: __m256i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_i16x8();
    // Every mask bit is set, so no element is masked off.
    let narrowed = vpmovsqw256(a.as_i64x4(), zeroed, 0b11111111);
    transmute(narrowed)
}

/// Converts the packed signed 64-bit integers in `a` into packed 16-bit integers using signed
/// saturation, storing the results in `dst` under writemask `k` (an element is copied from `src`
/// when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi64_epi16&expand=1841)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsqw))]
pub unsafe fn _mm256_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
    let wide = a.as_i64x4();
    let fallback = src.as_i16x8();
    transmute(vpmovsqw256(wide, fallback, k))
}

/// Converts the packed signed 64-bit integers in `a` into packed 16-bit integers using signed
/// saturation, storing the results in `dst` under zeromask `k` (an element is zeroed out when its
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtsepi64_epi16&expand=1842)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsqw))]
pub unsafe fn _mm256_maskz_cvtsepi64_epi16(k: __mmask8, a: __m256i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_i16x8();
    let narrowed = vpmovsqw256(a.as_i64x4(), zeroed, k);
    transmute(narrowed)
}

/// Converts the packed signed 64-bit integers in `a` into packed 16-bit integers using signed
/// saturation, storing the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsepi64_epi16&expand=1837)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsqw))]
pub unsafe fn _mm_cvtsepi64_epi16(a: __m128i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_i16x8();
    // Every mask bit is set, so no element is masked off.
    let narrowed = vpmovsqw128(a.as_i64x2(), zeroed, 0b11111111);
    transmute(narrowed)
}

/// Converts the packed signed 64-bit integers in `a` into packed 16-bit integers using signed
/// saturation, storing the results in `dst` under writemask `k` (an element is copied from `src`
/// when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi64_epi16&expand=1838)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsqw))]
pub unsafe fn _mm_mask_cvtsepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let wide = a.as_i64x2();
    let fallback = src.as_i16x8();
    transmute(vpmovsqw128(wide, fallback, k))
}

/// Converts the packed signed 64-bit integers in `a` into packed 16-bit integers using signed
/// saturation, storing the results in `dst` under zeromask `k` (an element is zeroed out when its
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsepi64_epi16&expand=1839)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsqw))]
pub unsafe fn _mm_maskz_cvtsepi64_epi16(k: __mmask8, a: __m128i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_i16x8();
    let narrowed = vpmovsqw128(a.as_i64x2(), zeroed, k);
    transmute(narrowed)
}

/// Converts the packed signed 64-bit integers in `a` into packed 8-bit integers using signed
/// saturation, storing the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsepi64_epi8&expand=1861)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsqb))]
pub unsafe fn _mm512_cvtsepi64_epi8(a: __m512i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_i8x16();
    // Every mask bit is set, so no element is masked off.
    let narrowed = vpmovsqb(a.as_i64x8(), zeroed, 0b11111111);
    transmute(narrowed)
}

/// Converts the packed signed 64-bit integers in `a` into packed 8-bit integers using signed
/// saturation, storing the results in `dst` under writemask `k` (an element is copied from `src`
/// when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi64_epi8&expand=1862)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsqb))]
pub unsafe fn _mm512_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) -> __m128i {
    let wide = a.as_i64x8();
    let fallback = src.as_i8x16();
    transmute(vpmovsqb(wide, fallback, k))
}

/// Converts the packed signed 64-bit integers in `a` into packed 8-bit integers using signed
/// saturation, storing the results in `dst` under zeromask `k` (an element is zeroed out when its
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtsepi64_epi8&expand=1863)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsqb))]
pub unsafe fn _mm512_maskz_cvtsepi64_epi8(k: __mmask8, a: __m512i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_i8x16();
    let narrowed = vpmovsqb(a.as_i64x8(), zeroed, k);
    transmute(narrowed)
}

/// Converts the packed signed 64-bit integers in `a` into packed 8-bit integers using signed
/// saturation, storing the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtsepi64_epi8&expand=1858)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsqb))]
pub unsafe fn _mm256_cvtsepi64_epi8(a: __m256i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_i8x16();
    // Every mask bit is set, so no element is masked off.
    let narrowed = vpmovsqb256(a.as_i64x4(), zeroed, 0b11111111);
    transmute(narrowed)
}

/// Converts the packed signed 64-bit integers in `a` into packed 8-bit integers using signed
/// saturation, storing the results in `dst` under writemask `k` (an element is copied from `src`
/// when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi64_epi8&expand=1859)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsqb))]
pub unsafe fn _mm256_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
    let wide = a.as_i64x4();
    let fallback = src.as_i8x16();
    transmute(vpmovsqb256(wide, fallback, k))
}

/// Converts the packed signed 64-bit integers in `a` into packed 8-bit integers using signed
/// saturation, storing the results in `dst` under zeromask `k` (an element is zeroed out when its
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtsepi64_epi8&expand=1860)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsqb))]
pub unsafe fn _mm256_maskz_cvtsepi64_epi8(k: __mmask8, a: __m256i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_i8x16();
    let narrowed = vpmovsqb256(a.as_i64x4(), zeroed, k);
    transmute(narrowed)
}

/// Converts the packed signed 64-bit integers in `a` into packed 8-bit integers using signed
/// saturation, storing the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsepi64_epi8&expand=1855)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsqb))]
pub unsafe fn _mm_cvtsepi64_epi8(a: __m128i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_i8x16();
    // Every mask bit is set, so no element is masked off.
    let narrowed = vpmovsqb128(a.as_i64x2(), zeroed, 0b11111111);
    transmute(narrowed)
}

/// Converts the packed signed 64-bit integers in `a` into packed 8-bit integers using signed
/// saturation, storing the results in `dst` under writemask `k` (an element is copied from `src`
/// when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi64_epi8&expand=1856)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsqb))]
pub unsafe fn _mm_mask_cvtsepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let wide = a.as_i64x2();
    let fallback = src.as_i8x16();
    transmute(vpmovsqb128(wide, fallback, k))
}

/// Converts the packed signed 64-bit integers in `a` into packed 8-bit integers using signed
/// saturation, storing the results in `dst` under zeromask `k` (an element is zeroed out when its
/// mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtsepi64_epi8&expand=1857)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsqb))]
pub unsafe fn _mm_maskz_cvtsepi64_epi8(k: __mmask8, a: __m128i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_i8x16();
    let narrowed = vpmovsqb128(a.as_i64x2(), zeroed, k);
    transmute(narrowed)
}

/// Converts the packed unsigned 32-bit integers in `a` into packed unsigned 16-bit integers using
/// unsigned saturation, storing the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtusepi32_epi16&expand=2054)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovusdw))]
pub unsafe fn _mm512_cvtusepi32_epi16(a: __m512i) -> __m256i {
    let zeroed = _mm256_setzero_si256().as_u16x16();
    // Every mask bit is set, so no element is masked off.
    let narrowed = vpmovusdw(a.as_u32x16(), zeroed, 0b11111111_11111111);
    transmute(narrowed)
}

/// Converts the packed unsigned 32-bit integers in `a` into packed unsigned 16-bit integers using
/// unsigned saturation, storing the results in `dst` under writemask `k` (an element is copied
/// from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi32_epi16&expand=2055)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovusdw))]
pub unsafe fn _mm512_mask_cvtusepi32_epi16(src: __m256i, k: __mmask16, a: __m512i) -> __m256i {
    let wide = a.as_u32x16();
    let fallback = src.as_u16x16();
    transmute(vpmovusdw(wide, fallback, k))
}

/// Converts the packed unsigned 32-bit integers in `a` into packed unsigned 16-bit integers using
/// unsigned saturation, storing the results in `dst` under zeromask `k` (an element is zeroed out
/// when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtusepi32_epi16&expand=2056)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovusdw))]
pub unsafe fn _mm512_maskz_cvtusepi32_epi16(k: __mmask16, a: __m512i) -> __m256i {
    let zeroed = _mm256_setzero_si256().as_u16x16();
    let narrowed = vpmovusdw(a.as_u32x16(), zeroed, k);
    transmute(narrowed)
}

/// Converts the packed unsigned 32-bit integers in `a` into packed unsigned 16-bit integers using
/// unsigned saturation, storing the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtusepi32_epi16&expand=2051)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusdw))]
pub unsafe fn _mm256_cvtusepi32_epi16(a: __m256i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_u16x8();
    // Every mask bit is set, so no element is masked off.
    let narrowed = vpmovusdw256(a.as_u32x8(), zeroed, 0b11111111);
    transmute(narrowed)
}

/// Converts the packed unsigned 32-bit integers in `a` into packed unsigned 16-bit integers using
/// unsigned saturation, storing the results in `dst` under writemask `k` (an element is copied
/// from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi32_epi16&expand=2052)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusdw))]
pub unsafe fn _mm256_mask_cvtusepi32_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
    let wide = a.as_u32x8();
    let fallback = src.as_u16x8();
    transmute(vpmovusdw256(wide, fallback, k))
}

/// Converts the packed unsigned 32-bit integers in `a` into packed unsigned 16-bit integers using
/// unsigned saturation, storing the results in `dst` under zeromask `k` (an element is zeroed out
/// when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtusepi32_epi16&expand=2053)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusdw))]
pub unsafe fn _mm256_maskz_cvtusepi32_epi16(k: __mmask8, a: __m256i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_u16x8();
    let narrowed = vpmovusdw256(a.as_u32x8(), zeroed, k);
    transmute(narrowed)
}

/// Converts the packed unsigned 32-bit integers in `a` into packed unsigned 16-bit integers using
/// unsigned saturation, storing the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtusepi32_epi16&expand=2048)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusdw))]
pub unsafe fn _mm_cvtusepi32_epi16(a: __m128i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_u16x8();
    // Every mask bit is set, so no element is masked off.
    let narrowed = vpmovusdw128(a.as_u32x4(), zeroed, 0b11111111);
    transmute(narrowed)
}

/// Converts the packed unsigned 32-bit integers in `a` into packed unsigned 16-bit integers using
/// unsigned saturation, storing the results in `dst` under writemask `k` (an element is copied
/// from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi32_epi16&expand=2049)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusdw))]
pub unsafe fn _mm_mask_cvtusepi32_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let wide = a.as_u32x4();
    let fallback = src.as_u16x8();
    transmute(vpmovusdw128(wide, fallback, k))
}

/// Converts the packed unsigned 32-bit integers in `a` into packed unsigned 16-bit integers using
/// unsigned saturation, storing the results in `dst` under zeromask `k` (an element is zeroed out
/// when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtusepi32_epi16&expand=2050)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusdw))]
pub unsafe fn _mm_maskz_cvtusepi32_epi16(k: __mmask8, a: __m128i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_u16x8();
    let narrowed = vpmovusdw128(a.as_u32x4(), zeroed, k);
    transmute(narrowed)
}

/// Converts the packed unsigned 32-bit integers in `a` into packed unsigned 8-bit integers using
/// unsigned saturation, storing the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtusepi32_epi8&expand=2063)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovusdb))]
pub unsafe fn _mm512_cvtusepi32_epi8(a: __m512i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_u8x16();
    // Every mask bit is set, so no element is masked off.
    let narrowed = vpmovusdb(a.as_u32x16(), zeroed, 0b11111111_11111111);
    transmute(narrowed)
}

/// Converts the packed unsigned 32-bit integers in `a` into packed unsigned 8-bit integers using
/// unsigned saturation, storing the results in `dst` under writemask `k` (an element is copied
/// from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi32_epi8&expand=2064)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovusdb))]
pub unsafe fn _mm512_mask_cvtusepi32_epi8(src: __m128i, k: __mmask16, a: __m512i) -> __m128i {
    let wide = a.as_u32x16();
    let fallback = src.as_u8x16();
    transmute(vpmovusdb(wide, fallback, k))
}

/// Converts the packed unsigned 32-bit integers in `a` into packed unsigned 8-bit integers using
/// unsigned saturation, storing the results in `dst` under zeromask `k` (an element is zeroed out
/// when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtusepi32_epi8&expand=2065)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovusdb))]
pub unsafe fn _mm512_maskz_cvtusepi32_epi8(k: __mmask16, a: __m512i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_u8x16();
    let narrowed = vpmovusdb(a.as_u32x16(), zeroed, k);
    transmute(narrowed)
}

/// Converts the packed unsigned 32-bit integers in `a` into packed unsigned 8-bit integers using
/// unsigned saturation, storing the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtusepi32_epi8&expand=2060)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusdb))]
pub unsafe fn _mm256_cvtusepi32_epi8(a: __m256i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_u8x16();
    // Every mask bit is set, so no element is masked off.
    let narrowed = vpmovusdb256(a.as_u32x8(), zeroed, 0b11111111);
    transmute(narrowed)
}

/// Converts the packed unsigned 32-bit integers in `a` into packed unsigned 8-bit integers using
/// unsigned saturation, storing the results in `dst` under writemask `k` (an element is copied
/// from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi32_epi8&expand=2061)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusdb))]
pub unsafe fn _mm256_mask_cvtusepi32_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
    let wide = a.as_u32x8();
    let fallback = src.as_u8x16();
    transmute(vpmovusdb256(wide, fallback, k))
}

/// Converts the packed unsigned 32-bit integers in `a` into packed unsigned 8-bit integers using
/// unsigned saturation, storing the results in `dst` under zeromask `k` (an element is zeroed out
/// when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtusepi32_epi8&expand=2062)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusdb))]
pub unsafe fn _mm256_maskz_cvtusepi32_epi8(k: __mmask8, a: __m256i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_u8x16();
    let narrowed = vpmovusdb256(a.as_u32x8(), zeroed, k);
    transmute(narrowed)
}

/// Converts the packed unsigned 32-bit integers in `a` into packed unsigned 8-bit integers using
/// unsigned saturation, storing the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtusepi32_epi8&expand=2057)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusdb))]
pub unsafe fn _mm_cvtusepi32_epi8(a: __m128i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_u8x16();
    // Every mask bit is set, so no element is masked off.
    let narrowed = vpmovusdb128(a.as_u32x4(), zeroed, 0b11111111);
    transmute(narrowed)
}

/// Converts the packed unsigned 32-bit integers in `a` into packed unsigned 8-bit integers using
/// unsigned saturation, storing the results in `dst` under writemask `k` (an element is copied
/// from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi32_epi8&expand=2058)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusdb))]
pub unsafe fn _mm_mask_cvtusepi32_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let wide = a.as_u32x4();
    let fallback = src.as_u8x16();
    transmute(vpmovusdb128(wide, fallback, k))
}

/// Converts the packed unsigned 32-bit integers in `a` into packed unsigned 8-bit integers using
/// unsigned saturation, storing the results in `dst` under zeromask `k` (an element is zeroed out
/// when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtusepi32_epi8&expand=2059)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusdb))]
pub unsafe fn _mm_maskz_cvtusepi32_epi8(k: __mmask8, a: __m128i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_u8x16();
    let narrowed = vpmovusdb128(a.as_u32x4(), zeroed, k);
    transmute(narrowed)
}

/// Converts the packed unsigned 64-bit integers in `a` into packed unsigned 32-bit integers using
/// unsigned saturation, storing the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtusepi64_epi32&expand=2087)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovusqd))]
pub unsafe fn _mm512_cvtusepi64_epi32(a: __m512i) -> __m256i {
    let zeroed = _mm256_setzero_si256().as_u32x8();
    // Every mask bit is set, so no element is masked off.
    let narrowed = vpmovusqd(a.as_u64x8(), zeroed, 0b11111111);
    transmute(narrowed)
}

/// Converts the packed unsigned 64-bit integers in `a` into packed unsigned 32-bit integers using
/// unsigned saturation, storing the results in `dst` under writemask `k` (an element is copied
/// from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi64_epi32&expand=2088)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovusqd))]
pub unsafe fn _mm512_mask_cvtusepi64_epi32(src: __m256i, k: __mmask8, a: __m512i) -> __m256i {
    let wide = a.as_u64x8();
    let fallback = src.as_u32x8();
    transmute(vpmovusqd(wide, fallback, k))
}

/// Converts the packed unsigned 64-bit integers in `a` into packed unsigned 32-bit integers using
/// unsigned saturation, storing the results in `dst` under zeromask `k` (an element is zeroed out
/// when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtusepi64_epi32&expand=2089)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovusqd))]
pub unsafe fn _mm512_maskz_cvtusepi64_epi32(k: __mmask8, a: __m512i) -> __m256i {
    let zeroed = _mm256_setzero_si256().as_u32x8();
    let narrowed = vpmovusqd(a.as_u64x8(), zeroed, k);
    transmute(narrowed)
}

/// Converts the packed unsigned 64-bit integers in `a` into packed unsigned 32-bit integers using
/// unsigned saturation, storing the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtusepi64_epi32&expand=2084)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusqd))]
pub unsafe fn _mm256_cvtusepi64_epi32(a: __m256i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_u32x4();
    // Every mask bit is set, so no element is masked off.
    let narrowed = vpmovusqd256(a.as_u64x4(), zeroed, 0b11111111);
    transmute(narrowed)
}

/// Converts the packed unsigned 64-bit integers in `a` into packed unsigned 32-bit integers using
/// unsigned saturation, storing the results in `dst` under writemask `k` (an element is copied
/// from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi64_epi32&expand=2085)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusqd))]
pub unsafe fn _mm256_mask_cvtusepi64_epi32(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
    let wide = a.as_u64x4();
    let fallback = src.as_u32x4();
    transmute(vpmovusqd256(wide, fallback, k))
}

/// Converts the packed unsigned 64-bit integers in `a` into packed unsigned 32-bit integers using
/// unsigned saturation, storing the results in `dst` under zeromask `k` (an element is zeroed out
/// when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtusepi64_epi32&expand=2086)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusqd))]
pub unsafe fn _mm256_maskz_cvtusepi64_epi32(k: __mmask8, a: __m256i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_u32x4();
    let narrowed = vpmovusqd256(a.as_u64x4(), zeroed, k);
    transmute(narrowed)
}

/// Converts the packed unsigned 64-bit integers in `a` into packed unsigned 32-bit integers using
/// unsigned saturation, storing the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtusepi64_epi32&expand=2081)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusqd))]
pub unsafe fn _mm_cvtusepi64_epi32(a: __m128i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_u32x4();
    // Every mask bit is set, so no element is masked off.
    let narrowed = vpmovusqd128(a.as_u64x2(), zeroed, 0b11111111);
    transmute(narrowed)
}

/// Converts the packed unsigned 64-bit integers in `a` into packed unsigned 32-bit integers using
/// unsigned saturation, storing the results in `dst` under writemask `k` (an element is copied
/// from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi64_epi32&expand=2082)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusqd))]
pub unsafe fn _mm_mask_cvtusepi64_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let wide = a.as_u64x2();
    let fallback = src.as_u32x4();
    transmute(vpmovusqd128(wide, fallback, k))
}

/// Converts the packed unsigned 64-bit integers in `a` into packed unsigned 32-bit integers using
/// unsigned saturation, storing the results in `dst` under zeromask `k` (an element is zeroed out
/// when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtusepi64_epi32&expand=2083)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusqd))]
pub unsafe fn _mm_maskz_cvtusepi64_epi32(k: __mmask8, a: __m128i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_u32x4();
    let narrowed = vpmovusqd128(a.as_u64x2(), zeroed, k);
    transmute(narrowed)
}

/// Converts the packed unsigned 64-bit integers in `a` into packed unsigned 16-bit integers using
/// unsigned saturation, storing the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtusepi64_epi16&expand=2078)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovusqw))]
pub unsafe fn _mm512_cvtusepi64_epi16(a: __m512i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_u16x8();
    // Every mask bit is set, so no element is masked off.
    let narrowed = vpmovusqw(a.as_u64x8(), zeroed, 0b11111111);
    transmute(narrowed)
}

/// Converts the packed unsigned 64-bit integers in `a` into packed unsigned 16-bit integers using
/// unsigned saturation, storing the results in `dst` under writemask `k` (an element is copied
/// from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi64_epi16&expand=2079)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovusqw))]
pub unsafe fn _mm512_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m512i) -> __m128i {
    let wide = a.as_u64x8();
    let fallback = src.as_u16x8();
    transmute(vpmovusqw(wide, fallback, k))
}

/// Converts the packed unsigned 64-bit integers in `a` into packed unsigned 16-bit integers using
/// unsigned saturation, storing the results in `dst` under zeromask `k` (an element is zeroed out
/// when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtusepi64_epi16&expand=2080)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovusqw))]
pub unsafe fn _mm512_maskz_cvtusepi64_epi16(k: __mmask8, a: __m512i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_u16x8();
    let narrowed = vpmovusqw(a.as_u64x8(), zeroed, k);
    transmute(narrowed)
}

/// Converts the packed unsigned 64-bit integers in `a` into packed unsigned 16-bit integers using
/// unsigned saturation, storing the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtusepi64_epi16&expand=2075)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusqw))]
pub unsafe fn _mm256_cvtusepi64_epi16(a: __m256i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_u16x8();
    // Every mask bit is set, so no element is masked off.
    let narrowed = vpmovusqw256(a.as_u64x4(), zeroed, 0b11111111);
    transmute(narrowed)
}

/// Converts the packed unsigned 64-bit integers in `a` into packed unsigned 16-bit integers using
/// unsigned saturation, storing the results in `dst` under writemask `k` (an element is copied
/// from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi64_epi16&expand=2076)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusqw))]
pub unsafe fn _mm256_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
    let wide = a.as_u64x4();
    let fallback = src.as_u16x8();
    transmute(vpmovusqw256(wide, fallback, k))
}

/// Converts the packed unsigned 64-bit integers in `a` into packed unsigned 16-bit integers using
/// unsigned saturation, storing the results in `dst` under zeromask `k` (an element is zeroed out
/// when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtusepi64_epi16&expand=2077)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusqw))]
pub unsafe fn _mm256_maskz_cvtusepi64_epi16(k: __mmask8, a: __m256i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_u16x8();
    let narrowed = vpmovusqw256(a.as_u64x4(), zeroed, k);
    transmute(narrowed)
}

/// Converts the packed unsigned 64-bit integers in `a` into packed unsigned 16-bit integers using
/// unsigned saturation, storing the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtusepi64_epi16&expand=2072)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusqw))]
pub unsafe fn _mm_cvtusepi64_epi16(a: __m128i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_u16x8();
    // Every mask bit is set, so no element is masked off.
    let narrowed = vpmovusqw128(a.as_u64x2(), zeroed, 0b11111111);
    transmute(narrowed)
}

/// Converts the packed unsigned 64-bit integers in `a` into packed unsigned 16-bit integers using
/// unsigned saturation, storing the results in `dst` under writemask `k` (an element is copied
/// from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi64_epi16&expand=2073)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusqw))]
pub unsafe fn _mm_mask_cvtusepi64_epi16(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let wide = a.as_u64x2();
    let fallback = src.as_u16x8();
    transmute(vpmovusqw128(wide, fallback, k))
}

/// Converts the packed unsigned 64-bit integers in `a` into packed unsigned 16-bit integers using
/// unsigned saturation, storing the results in `dst` under zeromask `k` (an element is zeroed out
/// when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtusepi64_epi16&expand=2074)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusqw))]
pub unsafe fn _mm_maskz_cvtusepi64_epi16(k: __mmask8, a: __m128i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_u16x8();
    let narrowed = vpmovusqw128(a.as_u64x2(), zeroed, k);
    transmute(narrowed)
}

/// Converts the packed unsigned 64-bit integers in `a` into packed unsigned 8-bit integers using
/// unsigned saturation, storing the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtusepi64_epi8&expand=2096)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovusqb))]
pub unsafe fn _mm512_cvtusepi64_epi8(a: __m512i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_u8x16();
    // Every mask bit is set, so no element is masked off.
    let narrowed = vpmovusqb(a.as_u64x8(), zeroed, 0b11111111);
    transmute(narrowed)
}

/// Converts the packed unsigned 64-bit integers in `a` into packed unsigned 8-bit integers using
/// unsigned saturation, storing the results in `dst` under writemask `k` (an element is copied
/// from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi64_epi8&expand=2097)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovusqb))]
pub unsafe fn _mm512_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m512i) -> __m128i {
    let wide = a.as_u64x8();
    let fallback = src.as_u8x16();
    transmute(vpmovusqb(wide, fallback, k))
}

/// Narrows the packed unsigned 64-bit integers in `a` to packed unsigned 8-bit integers using
/// unsigned saturation, and stores the results in dst under zeromask `k` (an element is zeroed
/// out when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtusepi64_epi8&expand=2098)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovusqb))]
pub unsafe fn _mm512_maskz_cvtusepi64_epi8(k: __mmask8, a: __m512i) -> __m128i {
    let wide = a.as_u64x8();
    // An all-zero vector is passed as the source operand of the masked builtin.
    let zeroed = _mm_setzero_si128().as_u8x16();
    let narrowed = vpmovusqb(wide, zeroed, k);
    transmute(narrowed)
}

/// Narrows the packed unsigned 64-bit integers in `a` to packed unsigned 8-bit integers using
/// unsigned saturation, and stores the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtusepi64_epi8&expand=2093)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusqb))]
pub unsafe fn _mm256_cvtusepi64_epi8(a: __m256i) -> __m128i {
    let wide = a.as_u64x4();
    let zeroed = _mm_setzero_si128().as_u8x16();
    // All mask bits set: this is the unmasked form of the conversion.
    let narrowed = vpmovusqb256(wide, zeroed, 0b11111111);
    transmute(narrowed)
}

/// Narrows the packed unsigned 64-bit integers in `a` to packed unsigned 8-bit integers using
/// unsigned saturation, and stores the results in dst under writemask `k` (an element is copied
/// from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi64_epi8&expand=2094)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusqb))]
pub unsafe fn _mm256_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m256i) -> __m128i {
    let wide = a.as_u64x4();
    // `src` supplies the elements for mask bits that are clear.
    let fallback = src.as_u8x16();
    transmute(vpmovusqb256(wide, fallback, k))
}

/// Narrows the packed unsigned 64-bit integers in `a` to packed unsigned 8-bit integers using
/// unsigned saturation, and stores the results in dst under zeromask `k` (an element is zeroed
/// out when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtusepi64_epi8&expand=2095)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusqb))]
pub unsafe fn _mm256_maskz_cvtusepi64_epi8(k: __mmask8, a: __m256i) -> __m128i {
    let wide = a.as_u64x4();
    // An all-zero vector is passed as the source operand of the masked builtin.
    let zeroed = _mm_setzero_si128().as_u8x16();
    let narrowed = vpmovusqb256(wide, zeroed, k);
    transmute(narrowed)
}

/// Narrows the packed unsigned 64-bit integers in `a` to packed unsigned 8-bit integers using
/// unsigned saturation, and stores the results in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtusepi64_epi8&expand=2090)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusqb))]
pub unsafe fn _mm_cvtusepi64_epi8(a: __m128i) -> __m128i {
    let wide = a.as_u64x2();
    let zeroed = _mm_setzero_si128().as_u8x16();
    // All mask bits set: this is the unmasked form of the conversion.
    let narrowed = vpmovusqb128(wide, zeroed, 0b11111111);
    transmute(narrowed)
}

/// Narrows the packed unsigned 64-bit integers in `a` to packed unsigned 8-bit integers using
/// unsigned saturation, and stores the results in dst under writemask `k` (an element is copied
/// from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi64_epi8&expand=2091)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusqb))]
pub unsafe fn _mm_mask_cvtusepi64_epi8(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let wide = a.as_u64x2();
    // `src` supplies the elements for mask bits that are clear.
    let fallback = src.as_u8x16();
    transmute(vpmovusqb128(wide, fallback, k))
}

/// Narrows the packed unsigned 64-bit integers in `a` to packed unsigned 8-bit integers using
/// unsigned saturation, and stores the results in dst under zeromask `k` (an element is zeroed
/// out when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtusepi64_epi8&expand=2092)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusqb))]
pub unsafe fn _mm_maskz_cvtusepi64_epi8(k: __mmask8, a: __m128i) -> __m128i {
    let wide = a.as_u64x2();
    // An all-zero vector is passed as the source operand of the masked builtin.
    let zeroed = _mm_setzero_si128().as_u8x16();
    let narrowed = vpmovusqb128(wide, zeroed, k);
    transmute(narrowed)
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// 32-bit integers, and stores the results in dst.
///
/// The rounding\[3:0\] parameter (`ROUNDING`) selects how rounding is done and can be one of:\
///    `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions\
///    `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions\
///    `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions\
///    `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions\
///    `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundps_epi32&expand=1335)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2dq, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_cvt_roundps_epi32<const ROUNDING: i32>(a: __m512) -> __m512i {
    static_assert_rounding!(ROUNDING);
    // All sixteen mask bits set: this is the unmasked form of the conversion.
    transmute(vcvtps2dq(
        a.as_f32x16(),
        _mm512_setzero_si512().as_i32x16(),
        0b11111111_11111111,
        ROUNDING,
    ))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// 32-bit integers, and stores the results in dst under writemask `k` (an element is copied
/// from `src` when its mask bit is not set).
///
/// The rounding\[3:0\] parameter (`ROUNDING`) selects how rounding is done and can be one of:\
///    `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions\
///    `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions\
///    `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions\
///    `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions\
///    `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundps_epi32&expand=1336)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2dq, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_cvt_roundps_epi32<const ROUNDING: i32>(
    src: __m512i,
    k: __mmask16,
    a: __m512,
) -> __m512i {
    static_assert_rounding!(ROUNDING);
    // `src` supplies the elements for mask bits that are clear.
    transmute(vcvtps2dq(a.as_f32x16(), src.as_i32x16(), k, ROUNDING))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// 32-bit integers, and stores the results in dst under zeromask `k` (an element is zeroed out
/// when its mask bit is not set).
///
/// The rounding\[3:0\] parameter (`ROUNDING`) selects how rounding is done and can be one of:\
///    `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions\
///    `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions\
///    `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions\
///    `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions\
///    `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundps_epi32&expand=1337)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2dq, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_cvt_roundps_epi32<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512,
) -> __m512i {
    static_assert_rounding!(ROUNDING);
    // An all-zero vector is passed as the source operand of the masked builtin.
    transmute(vcvtps2dq(
        a.as_f32x16(),
        _mm512_setzero_si512().as_i32x16(),
        k,
        ROUNDING,
    ))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers, and stores the results in dst.
///
/// The rounding\[3:0\] parameter (`ROUNDING`) selects how rounding is done and can be one of:\
///    `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions\
///    `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions\
///    `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions\
///    `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions\
///    `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundps_epu32&expand=1341)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2udq, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_cvt_roundps_epu32<const ROUNDING: i32>(a: __m512) -> __m512i {
    static_assert_rounding!(ROUNDING);
    // All sixteen mask bits set: this is the unmasked form of the conversion.
    transmute(vcvtps2udq(
        a.as_f32x16(),
        _mm512_setzero_si512().as_u32x16(),
        0b11111111_11111111,
        ROUNDING,
    ))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers, and stores the results in dst under writemask `k` (an element is
/// copied from `src` when its mask bit is not set).
///
/// The rounding\[3:0\] parameter (`ROUNDING`) selects how rounding is done and can be one of:\
///    `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions\
///    `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions\
///    `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions\
///    `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions\
///    `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundps_epu32&expand=1342)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2udq, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_cvt_roundps_epu32<const ROUNDING: i32>(
    src: __m512i,
    k: __mmask16,
    a: __m512,
) -> __m512i {
    static_assert_rounding!(ROUNDING);
    // `src` supplies the elements for mask bits that are clear.
    transmute(vcvtps2udq(a.as_f32x16(), src.as_u32x16(), k, ROUNDING))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers, and stores the results in dst under zeromask `k` (an element is
/// zeroed out when its mask bit is not set).
///
/// The rounding\[3:0\] parameter (`ROUNDING`) selects how rounding is done and can be one of:\
///    `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions\
///    `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions\
///    `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions\
///    `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions\
///    `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundps_epu32&expand=1343)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2udq, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_cvt_roundps_epu32<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512,
) -> __m512i {
    static_assert_rounding!(ROUNDING);
    // An all-zero vector is passed as the source operand of the masked builtin.
    transmute(vcvtps2udq(
        a.as_f32x16(),
        _mm512_setzero_si512().as_u32x16(),
        k,
        ROUNDING,
    ))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// double-precision (64-bit) floating-point elements, and stores the results in dst.\
/// Passing `_MM_FROUND_NO_EXC` in the sae parameter (`SAE`) suppresses exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundps_pd&expand=1347)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2pd, SAE = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_cvt_roundps_pd<const SAE: i32>(a: __m256) -> __m512d {
    static_assert_sae!(SAE);
    // All eight mask bits set: this is the unmasked form of the conversion.
    transmute(vcvtps2pd(
        a.as_f32x8(),
        _mm512_setzero_pd().as_f64x8(),
        0b11111111,
        SAE,
    ))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// double-precision (64-bit) floating-point elements, and stores the results in dst under
/// writemask `k` (an element is copied from `src` when its mask bit is not set).\
/// Passing `_MM_FROUND_NO_EXC` in the sae parameter (`SAE`) suppresses exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundps_pd&expand=1348)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2pd, SAE = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_cvt_roundps_pd<const SAE: i32>(
    src: __m512d,
    k: __mmask8,
    a: __m256,
) -> __m512d {
    static_assert_sae!(SAE);
    // `src` supplies the elements for mask bits that are clear.
    transmute(vcvtps2pd(a.as_f32x8(), src.as_f64x8(), k, SAE))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// double-precision (64-bit) floating-point elements, and stores the results in dst under
/// zeromask `k` (an element is zeroed out when its mask bit is not set).\
/// Passing `_MM_FROUND_NO_EXC` in the sae parameter (`SAE`) suppresses exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundps_pd&expand=1349)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2pd, SAE = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_cvt_roundps_pd<const SAE: i32>(k: __mmask8, a: __m256) -> __m512d {
    static_assert_sae!(SAE);
    // An all-zero vector is passed as the source operand of the masked builtin.
    transmute(vcvtps2pd(
        a.as_f32x8(),
        _mm512_setzero_pd().as_f64x8(),
        k,
        SAE,
    ))
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// 32-bit integers, and stores the results in dst.
///
/// The rounding\[3:0\] parameter (`ROUNDING`) selects how rounding is done and can be one of:\
///    `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions\
///    `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions\
///    `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions\
///    `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions\
///    `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundpd_epi32&expand=1315)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtpd2dq, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_cvt_roundpd_epi32<const ROUNDING: i32>(a: __m512d) -> __m256i {
    static_assert_rounding!(ROUNDING);
    // All eight mask bits set: this is the unmasked form of the conversion.
    transmute(vcvtpd2dq(
        a.as_f64x8(),
        _mm256_setzero_si256().as_i32x8(),
        0b11111111,
        ROUNDING,
    ))
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// 32-bit integers, and stores the results in dst under writemask `k` (an element is copied
/// from `src` when its mask bit is not set).
///
/// The rounding\[3:0\] parameter (`ROUNDING`) selects how rounding is done and can be one of:\
///    `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions\
///    `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions\
///    `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions\
///    `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions\
///    `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundpd_epi32&expand=1316)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtpd2dq, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_cvt_roundpd_epi32<const ROUNDING: i32>(
    src: __m256i,
    k: __mmask8,
    a: __m512d,
) -> __m256i {
    static_assert_rounding!(ROUNDING);
    // `src` supplies the elements for mask bits that are clear.
    transmute(vcvtpd2dq(a.as_f64x8(), src.as_i32x8(), k, ROUNDING))
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// 32-bit integers, and stores the results in dst under zeromask `k` (an element is zeroed out
/// when its mask bit is not set).
///
/// The rounding\[3:0\] parameter (`ROUNDING`) selects how rounding is done and can be one of:\
///    `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions\
///    `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions\
///    `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions\
///    `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions\
///    `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_epi32&expand=1317)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtpd2dq, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_cvt_roundpd_epi32<const ROUNDING: i32>(
    k: __mmask8,
    a: __m512d,
) -> __m256i {
    static_assert_rounding!(ROUNDING);
    // An all-zero vector is passed as the source operand of the masked builtin.
    transmute(vcvtpd2dq(
        a.as_f64x8(),
        _mm256_setzero_si256().as_i32x8(),
        k,
        ROUNDING,
    ))
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers, and stores the results in dst.
///
/// The rounding\[3:0\] parameter (`ROUNDING`) selects how rounding is done and can be one of:\
///    `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions\
///    `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions\
///    `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions\
///    `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions\
///    `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundpd_epu32&expand=1321)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtpd2udq, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_cvt_roundpd_epu32<const ROUNDING: i32>(a: __m512d) -> __m256i {
    static_assert_rounding!(ROUNDING);
    // All eight mask bits set: this is the unmasked form of the conversion.
    transmute(vcvtpd2udq(
        a.as_f64x8(),
        _mm256_setzero_si256().as_u32x8(),
        0b11111111,
        ROUNDING,
    ))
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers, and stores the results in dst under writemask `k` (an element is
/// copied from `src` when its mask bit is not set).
///
/// The rounding\[3:0\] parameter (`ROUNDING`) selects how rounding is done and can be one of:\
///    `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions\
///    `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions\
///    `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions\
///    `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions\
///    `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundpd_epu32&expand=1322)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtpd2udq, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_cvt_roundpd_epu32<const ROUNDING: i32>(
    src: __m256i,
    k: __mmask8,
    a: __m512d,
) -> __m256i {
    static_assert_rounding!(ROUNDING);
    // `src` supplies the elements for mask bits that are clear.
    transmute(vcvtpd2udq(a.as_f64x8(), src.as_u32x8(), k, ROUNDING))
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers, and stores the results in dst under zeromask `k` (an element is
/// zeroed out when its mask bit is not set).
///
/// The rounding\[3:0\] parameter (`ROUNDING`) selects how rounding is done and can be one of:\
///    `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions\
///    `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions\
///    `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions\
///    `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions\
///    `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_epu32&expand=1323)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtpd2udq, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_cvt_roundpd_epu32<const ROUNDING: i32>(
    k: __mmask8,
    a: __m512d,
) -> __m256i {
    static_assert_rounding!(ROUNDING);
    // An all-zero vector is passed as the source operand of the masked builtin.
    transmute(vcvtpd2udq(
        a.as_f64x8(),
        _mm256_setzero_si256().as_u32x8(),
        k,
        ROUNDING,
    ))
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// single-precision (32-bit) floating-point elements, and stores the results in dst.
///
/// The rounding\[3:0\] parameter (`ROUNDING`) selects how rounding is done and can be one of:\
///    `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions\
///    `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions\
///    `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions\
///    `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions\
///    `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundpd_ps&expand=1327)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtpd2ps, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_cvt_roundpd_ps<const ROUNDING: i32>(a: __m512d) -> __m256 {
    static_assert_rounding!(ROUNDING);
    // All eight mask bits set: this is the unmasked form of the conversion.
    transmute(vcvtpd2ps(
        a.as_f64x8(),
        _mm256_setzero_ps().as_f32x8(),
        0b11111111,
        ROUNDING,
    ))
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// single-precision (32-bit) floating-point elements, and stores the results in dst under
/// writemask `k` (an element is copied from `src` when its mask bit is not set).
///
/// The rounding\[3:0\] parameter (`ROUNDING`) selects how rounding is done and can be one of:\
///    `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions\
///    `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions\
///    `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions\
///    `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions\
///    `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundpd_ps&expand=1328)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtpd2ps, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_cvt_roundpd_ps<const ROUNDING: i32>(
    src: __m256,
    k: __mmask8,
    a: __m512d,
) -> __m256 {
    static_assert_rounding!(ROUNDING);
    // `src` supplies the elements for mask bits that are clear.
    transmute(vcvtpd2ps(a.as_f64x8(), src.as_f32x8(), k, ROUNDING))
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// single-precision (32-bit) floating-point elements, and stores the results in dst under
/// zeromask `k` (an element is zeroed out when its mask bit is not set).
///
/// The rounding\[3:0\] parameter (`ROUNDING`) selects how rounding is done and can be one of:\
///    `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions\
///    `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions\
///    `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions\
///    `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions\
///    `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundpd_ps&expand=1329)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtpd2ps, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_cvt_roundpd_ps<const ROUNDING: i32>(k: __mmask8, a: __m512d) -> __m256 {
    static_assert_rounding!(ROUNDING);
    // An all-zero vector is passed as the source operand of the masked builtin.
    transmute(vcvtpd2ps(
        a.as_f64x8(),
        _mm256_setzero_ps().as_f32x8(),
        k,
        ROUNDING,
    ))
}

/// Converts the packed signed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements, and stores the results in dst.
///
/// The rounding\[3:0\] parameter (`ROUNDING`) selects how rounding is done and can be one of:\
///    `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions\
///    `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions\
///    `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions\
///    `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions\
///    `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepi32_ps&expand=1294)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtdq2ps, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_cvt_roundepi32_ps<const ROUNDING: i32>(a: __m512i) -> __m512 {
    static_assert_rounding!(ROUNDING);
    transmute(vcvtdq2ps(a.as_i32x16(), ROUNDING))
}

/// Converts the packed signed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements, and stores the results in dst under writemask `k` (an element is
/// copied from `src` when its mask bit is not set).
///
/// The rounding\[3:0\] parameter (`ROUNDING`) selects how rounding is done and can be one of:\
///    `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions\
///    `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions\
///    `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions\
///    `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions\
///    `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepi32_ps&expand=1295)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtdq2ps, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_cvt_roundepi32_ps<const ROUNDING: i32>(
    src: __m512,
    k: __mmask16,
    a: __m512i,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    // The builtin takes no mask, so every element is converted and `k` is applied afterwards.
    let converted = vcvtdq2ps(a.as_i32x16(), ROUNDING);
    let fallback = src.as_f32x16();
    transmute(simd_select_bitmask(k, converted, fallback))
}

/// Converts the packed signed 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements, and stores the results in dst under zeromask `k` (an element is
/// zeroed out when its mask bit is not set).
///
/// The rounding\[3:0\] parameter (`ROUNDING`) selects how rounding is done and can be one of:\
///    `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions\
///    `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions\
///    `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions\
///    `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions\
///    `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepi32_ps&expand=1296)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtdq2ps, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_cvt_roundepi32_ps<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512i,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    // The builtin takes no mask, so every element is converted and `k` is applied afterwards.
    let converted = vcvtdq2ps(a.as_i32x16(), ROUNDING);
    transmute(simd_select_bitmask(
        k,
        converted,
        _mm512_setzero_ps().as_f32x16(),
    ))
}

/// Converts the packed unsigned 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements, and stores the results in dst.
///
/// The rounding\[3:0\] parameter (`ROUNDING`) selects how rounding is done and can be one of:\
///    `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions\
///    `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions\
///    `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions\
///    `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions\
///    `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundepu32_ps&expand=1303)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtudq2ps, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_cvt_roundepu32_ps<const ROUNDING: i32>(a: __m512i) -> __m512 {
    static_assert_rounding!(ROUNDING);
    transmute(vcvtudq2ps(a.as_u32x16(), ROUNDING))
}

/// Converts the packed unsigned 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements, and stores the results in dst under writemask `k` (an element is
/// copied from `src` when its mask bit is not set).
///
/// The rounding\[3:0\] parameter (`ROUNDING`) selects how rounding is done and can be one of:\
///    `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions\
///    `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions\
///    `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions\
///    `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions\
///    `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundepu32_ps&expand=1304)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtudq2ps, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_cvt_roundepu32_ps<const ROUNDING: i32>(
    src: __m512,
    k: __mmask16,
    a: __m512i,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    // The builtin takes no mask, so every element is converted and `k` is applied afterwards.
    let converted = vcvtudq2ps(a.as_u32x16(), ROUNDING);
    let fallback = src.as_f32x16();
    transmute(simd_select_bitmask(k, converted, fallback))
}

/// Converts the packed unsigned 32-bit integers in `a` to packed single-precision (32-bit)
/// floating-point elements, using zeromask `k`: elements are zeroed out when the
/// corresponding mask bit is not set.
///
/// Rounding follows the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundepu32_ps&expand=1305)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtudq2ps, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_cvt_roundepu32_ps<const ROUNDING: i32>(
    k: __mmask16,
    a: __m512i,
) -> __m512 {
    static_assert_rounding!(ROUNDING);
    // Convert every lane, then blend with an all-zero vector according to `k`.
    let converted = vcvtudq2ps(a.as_u32x16(), ROUNDING);
    let zeroed = _mm512_setzero_ps().as_f32x16();
    transmute(simd_select_bitmask(k, converted, zeroed))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// half-precision (16-bit) floating-point elements and returns the result.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundps_ph&expand=1354)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_cvt_roundps_ph<const SAE: i32>(a: __m512) -> __m256i {
    static_assert_sae!(SAE);
    let passthrough = _mm256_setzero_si256().as_i16x16();
    // All sixteen mask bits are set, so the conversion applies to every lane.
    transmute(vcvtps2ph(a.as_f32x16(), SAE, passthrough, 0xFFFF))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// half-precision (16-bit) floating-point elements, using writemask `k`: elements are copied
/// from `src` when the corresponding mask bit is not set.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundps_ph&expand=1355)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_cvt_roundps_ph<const SAE: i32>(
    src: __m256i,
    k: __mmask16,
    a: __m512,
) -> __m256i {
    static_assert_sae!(SAE);
    // The underlying intrinsic receives both the source vector and the mask.
    transmute(vcvtps2ph(a.as_f32x16(), SAE, src.as_i16x16(), k))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// half-precision (16-bit) floating-point elements, using zeromask `k`: elements are zeroed
/// out when the corresponding mask bit is not set.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundps_ph&expand=1356)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_cvt_roundps_ph<const SAE: i32>(k: __mmask16, a: __m512) -> __m256i {
    static_assert_sae!(SAE);
    // An all-zero source vector is passed alongside the mask `k`.
    let zeroed = _mm256_setzero_si256().as_i16x16();
    transmute(vcvtps2ph(a.as_f32x16(), SAE, zeroed, k))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// half-precision (16-bit) floating-point elements, using writemask `k`: elements are copied
/// from `src` when the corresponding mask bit is not set.
///
/// Rounding follows the imm8\[2:0\] parameter (`IMM8`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvt_roundps_ph&expand=1352)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_mask_cvt_roundps_ph<const IMM8: i32>(
    src: __m128i,
    k: __mmask8,
    a: __m256,
) -> __m128i {
    static_assert_imm8!(IMM8);
    // The underlying intrinsic receives both the source vector and the mask.
    transmute(vcvtps2ph256(a.as_f32x8(), IMM8, src.as_i16x8(), k))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// half-precision (16-bit) floating-point elements, using zeromask `k`: elements are zeroed
/// out when the corresponding mask bit is not set.
///
/// Rounding follows the imm8\[2:0\] parameter (`IMM8`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvt_roundps_ph&expand=1353)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_maskz_cvt_roundps_ph<const IMM8: i32>(k: __mmask8, a: __m256) -> __m128i {
    static_assert_imm8!(IMM8);
    // An all-zero source vector is passed alongside the mask `k`.
    let zeroed = _mm_setzero_si128().as_i16x8();
    transmute(vcvtps2ph256(a.as_f32x8(), IMM8, zeroed, k))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// half-precision (16-bit) floating-point elements, using writemask `k`: elements are copied
/// from `src` when the corresponding mask bit is not set.
///
/// Rounding follows the imm8\[2:0\] parameter (`IMM8`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvt_roundps_ph&expand=1350)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_mask_cvt_roundps_ph<const IMM8: i32>(
    src: __m128i,
    k: __mmask8,
    a: __m128,
) -> __m128i {
    static_assert_imm8!(IMM8);
    // The underlying intrinsic receives both the source vector and the mask.
    transmute(vcvtps2ph128(a.as_f32x4(), IMM8, src.as_i16x8(), k))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// half-precision (16-bit) floating-point elements, using zeromask `k`: elements are zeroed
/// out when the corresponding mask bit is not set.
///
/// Rounding follows the imm8\[2:0\] parameter (`IMM8`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvt_roundps_ph&expand=1351)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_maskz_cvt_roundps_ph<const IMM8: i32>(k: __mmask8, a: __m128) -> __m128i {
    static_assert_imm8!(IMM8);
    // An all-zero source vector is passed alongside the mask `k`.
    let zeroed = _mm_setzero_si128().as_i16x8();
    transmute(vcvtps2ph128(a.as_f32x4(), IMM8, zeroed, k))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// half-precision (16-bit) floating-point elements and returns the result.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtps_ph&expand=1778)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_cvtps_ph<const SAE: i32>(a: __m512) -> __m256i {
    static_assert_sae!(SAE);
    let passthrough = _mm256_setzero_si256().as_i16x16();
    // All sixteen mask bits are set, so the conversion applies to every lane.
    transmute(vcvtps2ph(a.as_f32x16(), SAE, passthrough, 0xFFFF))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// half-precision (16-bit) floating-point elements, using writemask `k`: elements are copied
/// from `src` when the corresponding mask bit is not set.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtps_ph&expand=1779)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_cvtps_ph<const SAE: i32>(
    src: __m256i,
    k: __mmask16,
    a: __m512,
) -> __m256i {
    static_assert_sae!(SAE);
    // The underlying intrinsic receives both the source vector and the mask.
    transmute(vcvtps2ph(a.as_f32x16(), SAE, src.as_i16x16(), k))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// half-precision (16-bit) floating-point elements, using zeromask `k`: elements are zeroed
/// out when the corresponding mask bit is not set.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtps_ph&expand=1780)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtps2ph, SAE = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_cvtps_ph<const SAE: i32>(k: __mmask16, a: __m512) -> __m256i {
    static_assert_sae!(SAE);
    // An all-zero source vector is passed alongside the mask `k`.
    let zeroed = _mm256_setzero_si256().as_i16x16();
    transmute(vcvtps2ph(a.as_f32x16(), SAE, zeroed, k))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// half-precision (16-bit) floating-point elements, using writemask `k`: elements are copied
/// from `src` when the corresponding mask bit is not set.
///
/// Rounding follows the imm8\[2:0\] parameter (`IMM8`), which can be one of:\
///    _MM_FROUND_TO_NEAREST_INT // round to nearest\
///    _MM_FROUND_TO_NEG_INF     // round down\
///    _MM_FROUND_TO_POS_INF     // round up\
///    _MM_FROUND_TO_ZERO        // truncate\
///    _MM_FROUND_CUR_DIRECTION  // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtps_ph&expand=1776)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_mask_cvtps_ph<const IMM8: i32>(
    src: __m128i,
    k: __mmask8,
    a: __m256,
) -> __m128i {
    static_assert_imm8!(IMM8);
    // The underlying intrinsic receives both the source vector and the mask.
    transmute(vcvtps2ph256(a.as_f32x8(), IMM8, src.as_i16x8(), k))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// half-precision (16-bit) floating-point elements, using zeromask `k`: elements are zeroed
/// out when the corresponding mask bit is not set.
///
/// Rounding follows the imm8\[2:0\] parameter (`IMM8`), which can be one of:\
///    _MM_FROUND_TO_NEAREST_INT // round to nearest\
///    _MM_FROUND_TO_NEG_INF     // round down\
///    _MM_FROUND_TO_POS_INF     // round up\
///    _MM_FROUND_TO_ZERO        // truncate\
///    _MM_FROUND_CUR_DIRECTION  // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtps_ph&expand=1777)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_maskz_cvtps_ph<const IMM8: i32>(k: __mmask8, a: __m256) -> __m128i {
    static_assert_imm8!(IMM8);
    // An all-zero source vector is passed alongside the mask `k`.
    let zeroed = _mm_setzero_si128().as_i16x8();
    transmute(vcvtps2ph256(a.as_f32x8(), IMM8, zeroed, k))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// half-precision (16-bit) floating-point elements, using writemask `k`: elements are copied
/// from `src` when the corresponding mask bit is not set.
///
/// Rounding follows the imm8\[2:0\] parameter (`IMM8`), which can be one of:\
///    _MM_FROUND_TO_NEAREST_INT // round to nearest\
///    _MM_FROUND_TO_NEG_INF     // round down\
///    _MM_FROUND_TO_POS_INF     // round up\
///    _MM_FROUND_TO_ZERO        // truncate\
///    _MM_FROUND_CUR_DIRECTION  // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtps_ph&expand=1773)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_mask_cvtps_ph<const IMM8: i32>(src: __m128i, k: __mmask8, a: __m128) -> __m128i {
    static_assert_imm8!(IMM8);
    // The underlying intrinsic receives both the source vector and the mask.
    transmute(vcvtps2ph128(a.as_f32x4(), IMM8, src.as_i16x8(), k))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// half-precision (16-bit) floating-point elements, using zeromask `k`: elements are zeroed
/// out when the corresponding mask bit is not set.
///
/// Rounding follows the imm8\[2:0\] parameter (`IMM8`), which can be one of:\
///    _MM_FROUND_TO_NEAREST_INT // round to nearest\
///    _MM_FROUND_TO_NEG_INF     // round down\
///    _MM_FROUND_TO_POS_INF     // round up\
///    _MM_FROUND_TO_ZERO        // truncate\
///    _MM_FROUND_CUR_DIRECTION  // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtps_ph&expand=1774)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtps2ph, IMM8 = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_maskz_cvtps_ph<const IMM8: i32>(k: __mmask8, a: __m128) -> __m128i {
    static_assert_imm8!(IMM8);
    // An all-zero source vector is passed alongside the mask `k`.
    let zeroed = _mm_setzero_si128().as_i16x8();
    transmute(vcvtps2ph128(a.as_f32x4(), IMM8, zeroed, k))
}

/// Converts the packed half-precision (16-bit) floating-point elements in `a` to packed
/// single-precision (32-bit) floating-point elements and returns the result.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvt_roundph_ps&expand=1332)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtph2ps, SAE = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_cvt_roundph_ps<const SAE: i32>(a: __m256i) -> __m512 {
    static_assert_sae!(SAE);
    let passthrough = _mm512_setzero_ps().as_f32x16();
    // All sixteen mask bits are set, so the conversion applies to every lane.
    transmute(vcvtph2ps(a.as_i16x16(), passthrough, 0xFFFF, SAE))
}

/// Converts the packed half-precision (16-bit) floating-point elements in `a` to packed
/// single-precision (32-bit) floating-point elements, using writemask `k`: elements are
/// copied from `src` when the corresponding mask bit is not set.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvt_roundph_ps&expand=1333)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtph2ps, SAE = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_cvt_roundph_ps<const SAE: i32>(
    src: __m512,
    k: __mmask16,
    a: __m256i,
) -> __m512 {
    static_assert_sae!(SAE);
    // The underlying intrinsic receives both the source vector and the mask.
    transmute(vcvtph2ps(a.as_i16x16(), src.as_f32x16(), k, SAE))
}

/// Converts the packed half-precision (16-bit) floating-point elements in `a` to packed
/// single-precision (32-bit) floating-point elements, using zeromask `k`: elements are
/// zeroed out when the corresponding mask bit is not set.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvt_roundph_ps&expand=1334)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtph2ps, SAE = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_cvt_roundph_ps<const SAE: i32>(k: __mmask16, a: __m256i) -> __m512 {
    static_assert_sae!(SAE);
    // An all-zero source vector is passed alongside the mask `k`.
    let zeroed = _mm512_setzero_ps().as_f32x16();
    transmute(vcvtph2ps(a.as_i16x16(), zeroed, k, SAE))
}

/// Converts the packed half-precision (16-bit) floating-point elements in `a` to packed
/// single-precision (32-bit) floating-point elements and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtph_ps&expand=1723)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtph2ps))]
pub unsafe fn _mm512_cvtph_ps(a: __m256i) -> __m512 {
    let halves = a.as_i16x16();
    let passthrough = _mm512_setzero_ps().as_f32x16();
    // All sixteen mask bits are set; `_MM_FROUND_NO_EXC` is passed as the final argument.
    let widened = vcvtph2ps(halves, passthrough, 0xFFFF, _MM_FROUND_NO_EXC);
    transmute(widened)
}

/// Converts the packed half-precision (16-bit) floating-point elements in `a` to packed
/// single-precision (32-bit) floating-point elements, using writemask `k`: elements are
/// copied from `src` when the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtph_ps&expand=1724)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtph2ps))]
pub unsafe fn _mm512_mask_cvtph_ps(src: __m512, k: __mmask16, a: __m256i) -> __m512 {
    let halves = a.as_i16x16();
    let fallback = src.as_f32x16();
    // `_MM_FROUND_NO_EXC` is passed as the final argument of the underlying conversion.
    let widened = vcvtph2ps(halves, fallback, k, _MM_FROUND_NO_EXC);
    transmute(widened)
}

/// Converts the packed half-precision (16-bit) floating-point elements in `a` to packed
/// single-precision (32-bit) floating-point elements, using zeromask `k`: elements are
/// zeroed out when the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtph_ps&expand=1725)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtph2ps))]
pub unsafe fn _mm512_maskz_cvtph_ps(k: __mmask16, a: __m256i) -> __m512 {
    let halves = a.as_i16x16();
    let zeroed = _mm512_setzero_ps().as_f32x16();
    // `_MM_FROUND_NO_EXC` is passed as the final argument of the underlying conversion.
    let widened = vcvtph2ps(halves, zeroed, k, _MM_FROUND_NO_EXC);
    transmute(widened)
}

/// Converts the packed half-precision (16-bit) floating-point elements in `a` to packed
/// single-precision (32-bit) floating-point elements, using writemask `k`: elements are
/// copied from `src` when the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtph_ps&expand=1721)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2ps))]
pub unsafe fn _mm256_mask_cvtph_ps(src: __m256, k: __mmask8, a: __m128i) -> __m256 {
    // Run the unmasked conversion, then blend with `src` according to `k`.
    let widened = _mm256_cvtph_ps(a).as_f32x8();
    let fallback = src.as_f32x8();
    transmute(simd_select_bitmask(k, widened, fallback))
}

/// Converts the packed half-precision (16-bit) floating-point elements in `a` to packed
/// single-precision (32-bit) floating-point elements, using zeromask `k`: elements are
/// zeroed out when the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvtph_ps&expand=1722)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2ps))]
pub unsafe fn _mm256_maskz_cvtph_ps(k: __mmask8, a: __m128i) -> __m256 {
    // Run the unmasked conversion, then blend with an all-zero vector according to `k`.
    let widened = _mm256_cvtph_ps(a).as_f32x8();
    let zeroed = _mm256_setzero_ps().as_f32x8();
    transmute(simd_select_bitmask(k, widened, zeroed))
}

/// Converts the packed half-precision (16-bit) floating-point elements in `a` to packed
/// single-precision (32-bit) floating-point elements, using writemask `k`: elements are
/// copied from `src` when the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtph_ps&expand=1718)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2ps))]
pub unsafe fn _mm_mask_cvtph_ps(src: __m128, k: __mmask8, a: __m128i) -> __m128 {
    // Run the unmasked conversion, then blend with `src` according to `k`.
    let widened = _mm_cvtph_ps(a).as_f32x4();
    let fallback = src.as_f32x4();
    transmute(simd_select_bitmask(k, widened, fallback))
}

/// Converts the packed half-precision (16-bit) floating-point elements in `a` to packed
/// single-precision (32-bit) floating-point elements, using zeromask `k`: elements are
/// zeroed out when the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvtph_ps&expand=1719)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvtph2ps))]
pub unsafe fn _mm_maskz_cvtph_ps(k: __mmask8, a: __m128i) -> __m128 {
    // Run the unmasked conversion, then blend with an all-zero vector according to `k`.
    let widened = _mm_cvtph_ps(a).as_f32x4();
    let zeroed = _mm_setzero_ps().as_f32x4();
    transmute(simd_select_bitmask(k, widened, zeroed))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// 32-bit integers with truncation and returns the result.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundps_epi32&expand=1916)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvttps2dq, SAE = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_cvtt_roundps_epi32<const SAE: i32>(a: __m512) -> __m512i {
    static_assert_sae!(SAE);
    let passthrough = _mm512_setzero_si512().as_i32x16();
    // All sixteen mask bits are set, so the conversion applies to every lane.
    transmute(vcvttps2dq(a.as_f32x16(), passthrough, 0xFFFF, SAE))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// 32-bit integers with truncation, using writemask `k`: elements are copied from `src` when
/// the corresponding mask bit is not set.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundps_epi32&expand=1917)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvttps2dq, SAE = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_cvtt_roundps_epi32<const SAE: i32>(
    src: __m512i,
    k: __mmask16,
    a: __m512,
) -> __m512i {
    static_assert_sae!(SAE);
    // The underlying intrinsic receives both the source vector and the mask.
    transmute(vcvttps2dq(a.as_f32x16(), src.as_i32x16(), k, SAE))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// 32-bit integers with truncation, using zeromask `k`: elements are zeroed out when the
/// corresponding mask bit is not set.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundps_epi32&expand=1918)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvttps2dq, SAE = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_cvtt_roundps_epi32<const SAE: i32>(k: __mmask16, a: __m512) -> __m512i {
    static_assert_sae!(SAE);
    // An all-zero source vector is passed alongside the mask `k`.
    let zeroed = _mm512_setzero_si512().as_i32x16();
    transmute(vcvttps2dq(a.as_f32x16(), zeroed, k, SAE))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers with truncation and returns the result.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundps_epu32&expand=1922)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvttps2udq, SAE = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_cvtt_roundps_epu32<const SAE: i32>(a: __m512) -> __m512i {
    static_assert_sae!(SAE);
    let passthrough = _mm512_setzero_si512().as_u32x16();
    // All sixteen mask bits are set, so the conversion applies to every lane.
    transmute(vcvttps2udq(a.as_f32x16(), passthrough, 0xFFFF, SAE))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers with truncation, using writemask `k`: elements are copied from
/// `src` when the corresponding mask bit is not set.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundps_epu32&expand=1923)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvttps2udq, SAE = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_cvtt_roundps_epu32<const SAE: i32>(
    src: __m512i,
    k: __mmask16,
    a: __m512,
) -> __m512i {
    static_assert_sae!(SAE);
    // The underlying intrinsic receives both the source vector and the mask.
    transmute(vcvttps2udq(a.as_f32x16(), src.as_u32x16(), k, SAE))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers with truncation, using zeromask `k`: elements are zeroed out when
/// the corresponding mask bit is not set.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundps_epu32&expand=1924)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvttps2udq, SAE = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_cvtt_roundps_epu32<const SAE: i32>(k: __mmask16, a: __m512) -> __m512i {
    static_assert_sae!(SAE);
    // An all-zero source vector is passed alongside the mask `k`.
    let zeroed = _mm512_setzero_si512().as_u32x16();
    transmute(vcvttps2udq(a.as_f32x16(), zeroed, k, SAE))
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// 32-bit integers with truncation and returns the result.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundpd_epi32&expand=1904)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvttpd2dq, SAE = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_cvtt_roundpd_epi32<const SAE: i32>(a: __m512d) -> __m256i {
    static_assert_sae!(SAE);
    let passthrough = _mm256_setzero_si256().as_i32x8();
    // All eight mask bits are set, so the conversion applies to every lane.
    transmute(vcvttpd2dq(a.as_f64x8(), passthrough, 0xFF, SAE))
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// 32-bit integers with truncation, using writemask `k`: elements are copied from `src` when
/// the corresponding mask bit is not set.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundpd_epi32&expand=1905)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvttpd2dq, SAE = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_cvtt_roundpd_epi32<const SAE: i32>(
    src: __m256i,
    k: __mmask8,
    a: __m512d,
) -> __m256i {
    static_assert_sae!(SAE);
    // The underlying intrinsic receives both the source vector and the mask.
    transmute(vcvttpd2dq(a.as_f64x8(), src.as_i32x8(), k, SAE))
}

/// Convert packed double-precision (64-bit) floating-point elements in a to packed 32-bit integers with truncation, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).\
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundpd_epi32&expand=1906)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvttpd2dq, SAE = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_cvtt_roundpd_epi32<const SAE: i32>(k: __mmask8, a: __m512d) -> __m256i {
    static_assert_sae!(SAE);
    // The input is eight f64 lanes; the result is eight i32 lanes in a 256-bit vector.
    let a = a.as_f64x8();
    // An all-zero source vector is passed alongside the mask `k`.
    let zero = _mm256_setzero_si256().as_i32x8();
    let r = vcvttpd2dq(a, zero, k, SAE);
    transmute(r)
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers with truncation, and returns the results.\
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtt_roundpd_epu32&expand=1910)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvttpd2udq, SAE = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_cvtt_roundpd_epu32<const SAE: i32>(a: __m512d) -> __m256i {
    static_assert_sae!(SAE);
    // Unmasked form: all eight mask bits are set and the source operand is a zero vector.
    transmute(vcvttpd2udq(
        a.as_f64x8(),
        _mm256_setzero_si256().as_i32x8(),
        0b11111111,
        SAE,
    ))
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers with truncation, using writemask `k` (elements are copied from
/// `src` when the corresponding mask bit is not set).\
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtt_roundpd_epu32&expand=1911)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvttpd2udq, SAE = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_cvtt_roundpd_epu32<const SAE: i32>(
    src: __m256i,
    k: __mmask8,
    a: __m512d,
) -> __m256i {
    static_assert_sae!(SAE);
    // `src` is forwarded as the pass-through operand and `k` as the lane mask.
    transmute(vcvttpd2udq(a.as_f64x8(), src.as_i32x8(), k, SAE))
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// 32-bit integers with truncation, and returns the results.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttps_epi32&expand=1984)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvttps2dq))]
pub unsafe fn _mm512_cvttps_epi32(a: __m512) -> __m512i {
    let a = a.as_f32x16();
    let zero = _mm512_setzero_si512().as_i32x16();
    // Unmasked form: all sixteen mask bits are set.
    let r = vcvttps2dq(a, zero, 0b11111111_11111111, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// 32-bit integers with truncation, using writemask `k` (elements are copied from `src` when
/// the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttps_epi32&expand=1985)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvttps2dq))]
pub unsafe fn _mm512_mask_cvttps_epi32(src: __m512i, k: __mmask16, a: __m512) -> __m512i {
    let a = a.as_f32x16();
    let src = src.as_i32x16();
    let r = vcvttps2dq(a, src, k, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// 32-bit integers with truncation, using zeromask `k` (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttps_epi32&expand=1986)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvttps2dq))]
pub unsafe fn _mm512_maskz_cvttps_epi32(k: __mmask16, a: __m512) -> __m512i {
    let a = a.as_f32x16();
    // Zero-masking is expressed by supplying a zero vector as the pass-through operand.
    let zero = _mm512_setzero_si512().as_i32x16();
    let r = vcvttps2dq(a, zero, k, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// 32-bit integers with truncation, using writemask `k` (elements are copied from `src` when
/// the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttps_epi32&expand=1982)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttps2dq))]
pub unsafe fn _mm256_mask_cvttps_epi32(src: __m256i, k: __mmask8, a: __m256) -> __m256i {
    let a = a.as_f32x8();
    let src = src.as_i32x8();
    let r = vcvttps2dq256(a, src, k);
    transmute(r)
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// 32-bit integers with truncation, using zeromask `k` (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttps_epi32&expand=1983)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttps2dq))]
pub unsafe fn _mm256_maskz_cvttps_epi32(k: __mmask8, a: __m256) -> __m256i {
    let a = a.as_f32x8();
    // Zero-masking is expressed by supplying a zero vector as the pass-through operand.
    let zero = _mm256_setzero_si256().as_i32x8();
    let r = vcvttps2dq256(a, zero, k);
    transmute(r)
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// 32-bit integers with truncation, using writemask `k` (elements are copied from `src` when
/// the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttps_epi32&expand=1979)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttps2dq))]
pub unsafe fn _mm_mask_cvttps_epi32(src: __m128i, k: __mmask8, a: __m128) -> __m128i {
    let a = a.as_f32x4();
    let src = src.as_i32x4();
    let r = vcvttps2dq128(a, src, k);
    transmute(r)
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// 32-bit integers with truncation, using zeromask `k` (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttps_epi32&expand=1980)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttps2dq))]
pub unsafe fn _mm_maskz_cvttps_epi32(k: __mmask8, a: __m128) -> __m128i {
    let a = a.as_f32x4();
    // Zero-masking is expressed by supplying a zero vector as the pass-through operand.
    let zero = _mm_setzero_si128().as_i32x4();
    let r = vcvttps2dq128(a, zero, k);
    transmute(r)
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers with truncation, and returns the results.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttps_epu32&expand=2002)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvttps2udq))]
pub unsafe fn _mm512_cvttps_epu32(a: __m512) -> __m512i {
    let a = a.as_f32x16();
    let zero = _mm512_setzero_si512().as_u32x16();
    // Unmasked form: all sixteen mask bits are set.
    let r = vcvttps2udq(a, zero, 0b11111111_11111111, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers with truncation, using writemask `k` (elements are copied from
/// `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttps_epu32&expand=2003)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvttps2udq))]
pub unsafe fn _mm512_mask_cvttps_epu32(src: __m512i, k: __mmask16, a: __m512) -> __m512i {
    let a = a.as_f32x16();
    let src = src.as_u32x16();
    let r = vcvttps2udq(a, src, k, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers with truncation, using zeromask `k` (elements are zeroed out when
/// the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttps_epu32&expand=2004)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvttps2udq))]
pub unsafe fn _mm512_maskz_cvttps_epu32(k: __mmask16, a: __m512) -> __m512i {
    let a = a.as_f32x16();
    // Zero-masking is expressed by supplying a zero vector as the pass-through operand.
    let zero = _mm512_setzero_si512().as_u32x16();
    let r = vcvttps2udq(a, zero, k, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers with truncation, and returns the results.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttps_epu32&expand=1999)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttps2udq))]
pub unsafe fn _mm256_cvttps_epu32(a: __m256) -> __m256i {
    let a = a.as_f32x8();
    let zero = _mm256_setzero_si256().as_u32x8();
    // Unmasked form: all eight mask bits are set.
    let r = vcvttps2udq256(a, zero, 0b11111111);
    transmute(r)
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers with truncation, using writemask `k` (elements are copied from
/// `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttps_epu32&expand=2000)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttps2udq))]
pub unsafe fn _mm256_mask_cvttps_epu32(src: __m256i, k: __mmask8, a: __m256) -> __m256i {
    let a = a.as_f32x8();
    let src = src.as_u32x8();
    let r = vcvttps2udq256(a, src, k);
    transmute(r)
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers with truncation, using zeromask `k` (elements are zeroed out when
/// the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttps_epu32&expand=2001)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttps2udq))]
pub unsafe fn _mm256_maskz_cvttps_epu32(k: __mmask8, a: __m256) -> __m256i {
    let a = a.as_f32x8();
    // Zero-masking is expressed by supplying a zero vector as the pass-through operand.
    let zero = _mm256_setzero_si256().as_u32x8();
    let r = vcvttps2udq256(a, zero, k);
    transmute(r)
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers with truncation, and returns the results.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_epu32&expand=1996)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttps2udq))]
pub unsafe fn _mm_cvttps_epu32(a: __m128) -> __m128i {
    let a = a.as_f32x4();
    let zero = _mm_setzero_si128().as_u32x4();
    // Unmasked form: every bit of the mask argument is set.
    let r = vcvttps2udq128(a, zero, 0b11111111);
    transmute(r)
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers with truncation, using writemask `k` (elements are copied from
/// `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttps_epu32&expand=1997)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttps2udq))]
pub unsafe fn _mm_mask_cvttps_epu32(src: __m128i, k: __mmask8, a: __m128) -> __m128i {
    let a = a.as_f32x4();
    let src = src.as_u32x4();
    let r = vcvttps2udq128(a, src, k);
    transmute(r)
}

/// Converts the packed single-precision (32-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers with truncation, using zeromask `k` (elements are zeroed out when
/// the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttps_epu32&expand=1998)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttps2udq))]
pub unsafe fn _mm_maskz_cvttps_epu32(k: __mmask8, a: __m128) -> __m128i {
    let a = a.as_f32x4();
    // Zero-masking is expressed by supplying a zero vector as the pass-through operand.
    let zero = _mm_setzero_si128().as_u32x4();
    let r = vcvttps2udq128(a, zero, k);
    transmute(r)
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers with truncation, using zeromask `k` (elements are zeroed out when
/// the corresponding mask bit is not set).\
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvtt_roundpd_epu32&expand=1912)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvttpd2udq, SAE = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_cvtt_roundpd_epu32<const SAE: i32>(k: __mmask8, a: __m512d) -> __m256i {
    static_assert_sae!(SAE);
    // Zero-masking is expressed by supplying a zero vector as the pass-through operand.
    transmute(vcvttpd2udq(
        a.as_f64x8(),
        _mm256_setzero_si256().as_i32x8(),
        k,
        SAE,
    ))
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// 32-bit integers with truncation, and returns the results.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttpd_epi32&expand=1947)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvttpd2dq))]
pub unsafe fn _mm512_cvttpd_epi32(a: __m512d) -> __m256i {
    let a = a.as_f64x8();
    let zero = _mm256_setzero_si256().as_i32x8();
    // Unmasked form: all eight mask bits are set.
    let r = vcvttpd2dq(a, zero, 0b11111111, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// 32-bit integers with truncation, using writemask `k` (elements are copied from `src` when
/// the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttpd_epi32&expand=1948)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvttpd2dq))]
pub unsafe fn _mm512_mask_cvttpd_epi32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i {
    let a = a.as_f64x8();
    let src = src.as_i32x8();
    let r = vcvttpd2dq(a, src, k, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// 32-bit integers with truncation, using zeromask `k` (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttpd_epi32&expand=1949)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvttpd2dq))]
pub unsafe fn _mm512_maskz_cvttpd_epi32(k: __mmask8, a: __m512d) -> __m256i {
    let a = a.as_f64x8();
    // Zero-masking is expressed by supplying a zero vector as the pass-through operand.
    let zero = _mm256_setzero_si256().as_i32x8();
    let r = vcvttpd2dq(a, zero, k, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// 32-bit integers with truncation, using writemask `k` (elements are copied from `src` when
/// the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttpd_epi32&expand=1945)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttpd2dq))]
pub unsafe fn _mm256_mask_cvttpd_epi32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i {
    let a = a.as_f64x4();
    let src = src.as_i32x4();
    let r = vcvttpd2dq256(a, src, k);
    transmute(r)
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// 32-bit integers with truncation, using zeromask `k` (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttpd_epi32&expand=1946)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttpd2dq))]
pub unsafe fn _mm256_maskz_cvttpd_epi32(k: __mmask8, a: __m256d) -> __m128i {
    let a = a.as_f64x4();
    // Zero-masking is expressed by supplying a zero vector as the pass-through operand.
    let zero = _mm_setzero_si128().as_i32x4();
    let r = vcvttpd2dq256(a, zero, k);
    transmute(r)
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// 32-bit integers with truncation, using writemask `k` (elements are copied from `src` when
/// the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttpd_epi32&expand=1942)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttpd2dq))]
pub unsafe fn _mm_mask_cvttpd_epi32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i {
    let a = a.as_f64x2();
    let src = src.as_i32x4();
    let r = vcvttpd2dq128(a, src, k);
    transmute(r)
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// 32-bit integers with truncation, using zeromask `k` (elements are zeroed out when the
/// corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttpd_epi32&expand=1943)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttpd2dq))]
pub unsafe fn _mm_maskz_cvttpd_epi32(k: __mmask8, a: __m128d) -> __m128i {
    let a = a.as_f64x2();
    // Zero-masking is expressed by supplying a zero vector as the pass-through operand.
    let zero = _mm_setzero_si128().as_i32x4();
    let r = vcvttpd2dq128(a, zero, k);
    transmute(r)
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers with truncation, and returns the results.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvttpd_epu32&expand=1965)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvttpd2udq))]
pub unsafe fn _mm512_cvttpd_epu32(a: __m512d) -> __m256i {
    let a = a.as_f64x8();
    let zero = _mm256_setzero_si256().as_i32x8();
    // Unmasked form: all eight mask bits are set.
    let r = vcvttpd2udq(a, zero, 0b11111111, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers with truncation, using writemask `k` (elements are copied from
/// `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvttpd_epu32&expand=1966)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvttpd2udq))]
pub unsafe fn _mm512_mask_cvttpd_epu32(src: __m256i, k: __mmask8, a: __m512d) -> __m256i {
    let a = a.as_f64x8();
    let src = src.as_i32x8();
    let r = vcvttpd2udq(a, src, k, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers with truncation, using zeromask `k` (elements are zeroed out when
/// the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_cvttpd_epu32&expand=1967)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvttpd2udq))]
pub unsafe fn _mm512_maskz_cvttpd_epu32(k: __mmask8, a: __m512d) -> __m256i {
    let a = a.as_f64x8();
    // Zero-masking is expressed by supplying a zero vector as the pass-through operand.
    let zero = _mm256_setzero_si256().as_i32x8();
    let r = vcvttpd2udq(a, zero, k, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers with truncation, and returns the results.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvttpd_epu32&expand=1962)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttpd2udq))]
pub unsafe fn _mm256_cvttpd_epu32(a: __m256d) -> __m128i {
    let a = a.as_f64x4();
    let zero = _mm_setzero_si128().as_i32x4();
    // Unmasked form: every bit of the mask argument is set.
    let r = vcvttpd2udq256(a, zero, 0b11111111);
    transmute(r)
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers with truncation, using writemask `k` (elements are copied from
/// `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvttpd_epu32&expand=1963)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttpd2udq))]
pub unsafe fn _mm256_mask_cvttpd_epu32(src: __m128i, k: __mmask8, a: __m256d) -> __m128i {
    let a = a.as_f64x4();
    let src = src.as_i32x4();
    let r = vcvttpd2udq256(a, src, k);
    transmute(r)
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers with truncation, using zeromask `k` (elements are zeroed out when
/// the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_cvttpd_epu32&expand=1964)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttpd2udq))]
pub unsafe fn _mm256_maskz_cvttpd_epu32(k: __mmask8, a: __m256d) -> __m128i {
    let a = a.as_f64x4();
    // Zero-masking is expressed by supplying a zero vector as the pass-through operand.
    let zero = _mm_setzero_si128().as_i32x4();
    let r = vcvttpd2udq256(a, zero, k);
    transmute(r)
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers with truncation, and returns the results.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttpd_epu32&expand=1959)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttpd2udq))]
pub unsafe fn _mm_cvttpd_epu32(a: __m128d) -> __m128i {
    let a = a.as_f64x2();
    let zero = _mm_setzero_si128().as_i32x4();
    // Unmasked form: every bit of the mask argument is set.
    let r = vcvttpd2udq128(a, zero, 0b11111111);
    transmute(r)
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers with truncation, using writemask `k` (elements are copied from
/// `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvttpd_epu32&expand=1960)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttpd2udq))]
pub unsafe fn _mm_mask_cvttpd_epu32(src: __m128i, k: __mmask8, a: __m128d) -> __m128i {
    let a = a.as_f64x2();
    let src = src.as_i32x4();
    let r = vcvttpd2udq128(a, src, k);
    transmute(r)
}

/// Converts the packed double-precision (64-bit) floating-point elements in `a` to packed
/// unsigned 32-bit integers with truncation, using zeromask `k` (elements are zeroed out when
/// the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_cvttpd_epu32&expand=1961)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcvttpd2udq))]
pub unsafe fn _mm_maskz_cvttpd_epu32(k: __mmask8, a: __m128d) -> __m128i {
    let a = a.as_f64x2();
    // Zero-masking is expressed by supplying a zero vector as the pass-through operand.
    let zero = _mm_setzero_si128().as_i32x4();
    let r = vcvttpd2udq128(a, zero, k);
    transmute(r)
}

/// Returns a vector of type `__m512d` with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_pd&expand=5018)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vxorps))]
pub unsafe fn _mm512_setzero_pd() -> __m512d {
    // The all-zero bit pattern is a properly initialized `__m512d`.
    mem::zeroed::<__m512d>()
}

/// Returns a vector of type `__m512` with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_ps&expand=5021)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vxorps))]
pub unsafe fn _mm512_setzero_ps() -> __m512 {
    // The all-zero bit pattern is a properly initialized `__m512`.
    mem::zeroed::<__m512>()
}

/// Returns a vector of type `__m512` with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero&expand=5014)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vxorps))]
pub unsafe fn _mm512_setzero() -> __m512 {
    // The all-zero bit pattern is a properly initialized `__m512`.
    mem::zeroed::<__m512>()
}

/// Returns a vector of type `__m512i` with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_si512&expand=5024)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vxorps))]
pub unsafe fn _mm512_setzero_si512() -> __m512i {
    // The all-zero bit pattern is a properly initialized `__m512i`.
    mem::zeroed::<__m512i>()
}

/// Returns a vector of type `__m512i` with all elements set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setzero_epi32&expand=5015)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vxorps))]
pub unsafe fn _mm512_setzero_epi32() -> __m512i {
    // The all-zero bit pattern is a properly initialized `__m512i`.
    mem::zeroed::<__m512i>()
}

/// Sets packed 32-bit integers in `dst` with the supplied values in reverse
/// order.
///
/// The arguments are stored in the order they are written: the first argument
/// becomes the first field of the `i32x16` and the last argument the last
/// field. Note that the parameter names count down from `e15` even though the
/// first parameter lands in the first field.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_epi32&expand=4991)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_setr_epi32(
    e15: i32,
    e14: i32,
    e13: i32,
    e12: i32,
    e11: i32,
    e10: i32,
    e9: i32,
    e8: i32,
    e7: i32,
    e6: i32,
    e5: i32,
    e4: i32,
    e3: i32,
    e2: i32,
    e1: i32,
    e0: i32,
) -> __m512i {
    // Fields are filled in argument order; no reordering happens here.
    transmute(i32x16(
        e15, e14, e13, e12, //
        e11, e10, e9, e8, //
        e7, e6, e5, e4, //
        e3, e2, e1, e0,
    ))
}

/// Sets packed 8-bit integers in `dst` with the supplied values.
///
/// Arguments are listed from `e63` down to `e0`; they are handed to `i8x64`
/// in ascending order, so `e0` (the last argument) becomes the first field and
/// `e63` (the first argument) the last.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_epi8&expand=4915)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_set_epi8(
    e63: i8,
    e62: i8,
    e61: i8,
    e60: i8,
    e59: i8,
    e58: i8,
    e57: i8,
    e56: i8,
    e55: i8,
    e54: i8,
    e53: i8,
    e52: i8,
    e51: i8,
    e50: i8,
    e49: i8,
    e48: i8,
    e47: i8,
    e46: i8,
    e45: i8,
    e44: i8,
    e43: i8,
    e42: i8,
    e41: i8,
    e40: i8,
    e39: i8,
    e38: i8,
    e37: i8,
    e36: i8,
    e35: i8,
    e34: i8,
    e33: i8,
    e32: i8,
    e31: i8,
    e30: i8,
    e29: i8,
    e28: i8,
    e27: i8,
    e26: i8,
    e25: i8,
    e24: i8,
    e23: i8,
    e22: i8,
    e21: i8,
    e20: i8,
    e19: i8,
    e18: i8,
    e17: i8,
    e16: i8,
    e15: i8,
    e14: i8,
    e13: i8,
    e12: i8,
    e11: i8,
    e10: i8,
    e9: i8,
    e8: i8,
    e7: i8,
    e6: i8,
    e5: i8,
    e4: i8,
    e3: i8,
    e2: i8,
    e1: i8,
    e0: i8,
) -> __m512i {
    // Eight elements per row, lowest-numbered element first.
    transmute(i8x64(
        e0, e1, e2, e3, e4, e5, e6, e7, //
        e8, e9, e10, e11, e12, e13, e14, e15, //
        e16, e17, e18, e19, e20, e21, e22, e23, //
        e24, e25, e26, e27, e28, e29, e30, e31, //
        e32, e33, e34, e35, e36, e37, e38, e39, //
        e40, e41, e42, e43, e44, e45, e46, e47, //
        e48, e49, e50, e51, e52, e53, e54, e55, //
        e56, e57, e58, e59, e60, e61, e62, e63,
    ))
}

/// Sets packed 16-bit integers in `dst` with the supplied values.
///
/// Arguments are listed from `e31` down to `e0`; they are handed to `i16x32`
/// in ascending order, so `e0` (the last argument) becomes the first field and
/// `e31` (the first argument) the last.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_epi16&expand=4905)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_set_epi16(
    e31: i16,
    e30: i16,
    e29: i16,
    e28: i16,
    e27: i16,
    e26: i16,
    e25: i16,
    e24: i16,
    e23: i16,
    e22: i16,
    e21: i16,
    e20: i16,
    e19: i16,
    e18: i16,
    e17: i16,
    e16: i16,
    e15: i16,
    e14: i16,
    e13: i16,
    e12: i16,
    e11: i16,
    e10: i16,
    e9: i16,
    e8: i16,
    e7: i16,
    e6: i16,
    e5: i16,
    e4: i16,
    e3: i16,
    e2: i16,
    e1: i16,
    e0: i16,
) -> __m512i {
    // Eight elements per row, lowest-numbered element first.
    transmute(i16x32(
        e0, e1, e2, e3, e4, e5, e6, e7, //
        e8, e9, e10, e11, e12, e13, e14, e15, //
        e16, e17, e18, e19, e20, e21, e22, e23, //
        e24, e25, e26, e27, e28, e29, e30, e31,
    ))
}

/// Sets packed 32-bit integers in `dst` with the repeated 4 element sequence.
///
/// The group `(d, c, b, a)` is passed to `_mm512_set_epi32` four times in a row.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set4_epi32&expand=4982)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_set4_epi32(d: i32, c: i32, b: i32, a: i32) -> __m512i {
    // One row per repetition of the 4-element group.
    _mm512_set_epi32(
        d, c, b, a, //
        d, c, b, a, //
        d, c, b, a, //
        d, c, b, a,
    )
}

/// Sets packed single-precision (32-bit) floating-point elements in `dst` with the repeated
/// 4 element sequence.
///
/// The group `(d, c, b, a)` is passed to `_mm512_set_ps` four times in a row.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set4_ps&expand=4985)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_set4_ps(d: f32, c: f32, b: f32, a: f32) -> __m512 {
    // One row per repetition of the 4-element group.
    _mm512_set_ps(
        d, c, b, a, //
        d, c, b, a, //
        d, c, b, a, //
        d, c, b, a,
    )
}

/// Sets packed double-precision (64-bit) floating-point elements in `dst` with the repeated
/// 4 element sequence.
///
/// The group `(d, c, b, a)` is passed to `_mm512_set_pd` twice in a row.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set4_pd&expand=4984)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_set4_pd(d: f64, c: f64, b: f64, a: f64) -> __m512d {
    // One row per repetition of the 4-element group.
    _mm512_set_pd(
        d, c, b, a, //
        d, c, b, a,
    )
}

/// Sets packed 32-bit integers in `dst` with the repeated 4 element sequence in reverse order.
///
/// The group `(a, b, c, d)` is passed to `_mm512_set_epi32` four times in a row, i.e. the
/// arguments of this function are forwarded in reversed order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr4_epi32&expand=5009)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_setr4_epi32(d: i32, c: i32, b: i32, a: i32) -> __m512i {
    // One row per repetition of the reversed 4-element group.
    _mm512_set_epi32(
        a, b, c, d, //
        a, b, c, d, //
        a, b, c, d, //
        a, b, c, d,
    )
}

/// Sets packed single-precision (32-bit) floating-point elements in `dst` with the repeated
/// 4 element sequence in reverse order.
///
/// The group `(a, b, c, d)` is passed to `_mm512_set_ps` four times in a row, i.e. the
/// arguments of this function are forwarded in reversed order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr4_ps&expand=5012)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_setr4_ps(d: f32, c: f32, b: f32, a: f32) -> __m512 {
    // One row per repetition of the reversed 4-element group.
    _mm512_set_ps(
        a, b, c, d, //
        a, b, c, d, //
        a, b, c, d, //
        a, b, c, d,
    )
}

/// Sets packed double-precision (64-bit) floating-point elements in `dst` with the repeated
/// 4 element sequence in reverse order.
///
/// The group `(a, b, c, d)` is passed to `_mm512_set_pd` twice in a row, i.e. the arguments
/// of this function are forwarded in reversed order.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr4_pd&expand=5011)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_setr4_pd(d: f64, c: f64, b: f64, a: f64) -> __m512d {
    // One row per repetition of the reversed 4-element group.
    _mm512_set_pd(
        a, b, c, d, //
        a, b, c, d,
    )
}

/// Set packed 64-bit integers in dst with the supplied values.
///
/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_epi64&expand=4910)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_set_epi64(
    e0: i64,
    e1: i64,
    e2: i64,
    e3: i64,
    e4: i64,
    e5: i64,
    e6: i64,
    e7: i64,
) -> __m512i {
    _mm512_setr_epi64(e7, e6, e5, e4, e3, e2, e1, e0)
}

/// Sets packed 64-bit integers in `dst` with the supplied values in reverse order.
///
/// The arguments are stored in the order they are written: `e0` becomes the
/// first field of the `i64x8` and `e7` the last.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_epi64&expand=4993)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_setr_epi64(
    e0: i64,
    e1: i64,
    e2: i64,
    e3: i64,
    e4: i64,
    e5: i64,
    e6: i64,
    e7: i64,
) -> __m512i {
    // Fields are filled in argument order; no reordering happens here.
    transmute(i64x8::new(e0, e1, e2, e3, e4, e5, e6, e7))
}

/// Gathers double-precision (64-bit) floating-point elements from memory using the 32-bit
/// indices in `offsets`. Each 64-bit element is loaded from the address starting at `slice` and
/// offset by the corresponding index, with each index scaled by `SCALE`. `SCALE` should be 1, 2,
/// 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_pd&expand=3002)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_i32gather_pd<const SCALE: i32>(offsets: __m256i, slice: *const u8) -> __m512d {
    static_assert_imm8_scale!(SCALE);
    // An all-ones mask selects every lane, so the zeroed pass-through vector is never observed.
    let all_lanes = -1;
    let gathered = vgatherdpd(
        _mm512_setzero_pd().as_f64x8(),
        slice as *const i8,
        offsets.as_i32x8(),
        all_lanes,
        SCALE,
    );
    transmute(gathered)
}

/// Gathers double-precision (64-bit) floating-point elements from memory using the 32-bit
/// indices in `offsets`. Each 64-bit element is loaded from the address starting at `slice` and
/// offset by the corresponding index, with each index scaled by `SCALE`. Elements whose bit in
/// writemask `mask` is not set are copied from `src` instead. `SCALE` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_pd&expand=3003)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_i32gather_pd<const SCALE: i32>(
    src: __m512d,
    mask: __mmask8,
    offsets: __m256i,
    slice: *const u8,
) -> __m512d {
    static_assert_imm8_scale!(SCALE);
    let base = slice as *const i8;
    // The intrinsic takes the writemask as a signed integer.
    let lane_mask = mask as i8;
    let gathered = vgatherdpd(src.as_f64x8(), base, offsets.as_i32x8(), lane_mask, SCALE);
    transmute(gathered)
}

/// Gathers double-precision (64-bit) floating-point elements from memory using the 64-bit
/// indices in `offsets`. Each 64-bit element is loaded from the address starting at `slice` and
/// offset by the corresponding index, with each index scaled by `SCALE`. `SCALE` should be 1, 2,
/// 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_pd&expand=3092)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_i64gather_pd<const SCALE: i32>(offsets: __m512i, slice: *const u8) -> __m512d {
    static_assert_imm8_scale!(SCALE);
    // An all-ones mask selects every lane, so the zeroed pass-through vector is never observed.
    let all_lanes = -1;
    let gathered = vgatherqpd(
        _mm512_setzero_pd().as_f64x8(),
        slice as *const i8,
        offsets.as_i64x8(),
        all_lanes,
        SCALE,
    );
    transmute(gathered)
}

/// Gathers double-precision (64-bit) floating-point elements from memory using the 64-bit
/// indices in `offsets`. Each 64-bit element is loaded from the address starting at `slice` and
/// offset by the corresponding index, with each index scaled by `SCALE`. Elements whose bit in
/// writemask `mask` is not set are copied from `src` instead. `SCALE` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_pd&expand=3093)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_i64gather_pd<const SCALE: i32>(
    src: __m512d,
    mask: __mmask8,
    offsets: __m512i,
    slice: *const u8,
) -> __m512d {
    static_assert_imm8_scale!(SCALE);
    let base = slice as *const i8;
    // The intrinsic takes the writemask as a signed integer.
    let lane_mask = mask as i8;
    let gathered = vgatherqpd(src.as_f64x8(), base, offsets.as_i64x8(), lane_mask, SCALE);
    transmute(gathered)
}

/// Gathers single-precision (32-bit) floating-point elements from memory using the 64-bit
/// indices in `offsets`. Each 32-bit element is loaded from the address starting at `slice` and
/// offset by the corresponding index, with each index scaled by `SCALE`. `SCALE` should be 1, 2,
/// 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_ps&expand=3100)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_i64gather_ps<const SCALE: i32>(offsets: __m512i, slice: *const u8) -> __m256 {
    static_assert_imm8_scale!(SCALE);
    // An all-ones mask selects every lane, so the zeroed pass-through vector is never observed.
    let all_lanes = -1;
    let gathered = vgatherqps(
        _mm256_setzero_ps().as_f32x8(),
        slice as *const i8,
        offsets.as_i64x8(),
        all_lanes,
        SCALE,
    );
    transmute(gathered)
}

/// Gathers single-precision (32-bit) floating-point elements from memory using the 64-bit
/// indices in `offsets`. Each 32-bit element is loaded from the address starting at `slice` and
/// offset by the corresponding index, with each index scaled by `SCALE`. Elements whose bit in
/// writemask `mask` is not set are copied from `src` instead. `SCALE` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_ps&expand=3101)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_i64gather_ps<const SCALE: i32>(
    src: __m256,
    mask: __mmask8,
    offsets: __m512i,
    slice: *const u8,
) -> __m256 {
    static_assert_imm8_scale!(SCALE);
    let base = slice as *const i8;
    // The intrinsic takes the writemask as a signed integer.
    let lane_mask = mask as i8;
    let gathered = vgatherqps(src.as_f32x8(), base, offsets.as_i64x8(), lane_mask, SCALE);
    transmute(gathered)
}

/// Gathers single-precision (32-bit) floating-point elements from memory using the 32-bit
/// indices in `offsets`. Each 32-bit element is loaded from the address starting at `slice` and
/// offset by the corresponding index, with each index scaled by `SCALE`. `SCALE` should be 1, 2,
/// 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_ps&expand=3010)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_i32gather_ps<const SCALE: i32>(offsets: __m512i, slice: *const u8) -> __m512 {
    static_assert_imm8_scale!(SCALE);
    // An all-ones mask selects every lane, so the zeroed pass-through vector is never observed.
    let all_lanes = -1;
    let gathered = vgatherdps(
        _mm512_setzero_ps().as_f32x16(),
        slice as *const i8,
        offsets.as_i32x16(),
        all_lanes,
        SCALE,
    );
    transmute(gathered)
}

/// Gathers single-precision (32-bit) floating-point elements from memory using the 32-bit
/// indices in `offsets`. Each 32-bit element is loaded from the address starting at `slice` and
/// offset by the corresponding index, with each index scaled by `SCALE`. Elements whose bit in
/// writemask `mask` is not set are copied from `src` instead. `SCALE` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_ps&expand=3011)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_i32gather_ps<const SCALE: i32>(
    src: __m512,
    mask: __mmask16,
    offsets: __m512i,
    slice: *const u8,
) -> __m512 {
    static_assert_imm8_scale!(SCALE);
    let base = slice as *const i8;
    // Sixteen lanes, hence a 16-bit mask; the intrinsic takes it as a signed integer.
    let lane_mask = mask as i16;
    let gathered = vgatherdps(src.as_f32x16(), base, offsets.as_i32x16(), lane_mask, SCALE);
    transmute(gathered)
}

/// Gathers 32-bit integers from memory using the 32-bit indices in `offsets`. Each 32-bit
/// element is loaded from the address starting at `slice` and offset by the corresponding index,
/// with each index scaled by `SCALE`. `SCALE` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi32&expand=2986)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_i32gather_epi32<const SCALE: i32>(
    offsets: __m512i,
    slice: *const u8,
) -> __m512i {
    static_assert_imm8_scale!(SCALE);
    // An all-ones mask selects every lane, so the zeroed pass-through vector is never observed.
    let all_lanes = -1;
    let gathered = vpgatherdd(
        _mm512_setzero_si512().as_i32x16(),
        slice as *const i8,
        offsets.as_i32x16(),
        all_lanes,
        SCALE,
    );
    transmute(gathered)
}

/// Gathers 32-bit integers from memory using the 32-bit indices in `offsets`. Each 32-bit
/// element is loaded from the address starting at `slice` and offset by the corresponding index,
/// with each index scaled by `SCALE`. Elements whose bit in writemask `mask` is not set are
/// copied from `src` instead. `SCALE` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_epi32&expand=2987)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_i32gather_epi32<const SCALE: i32>(
    src: __m512i,
    mask: __mmask16,
    offsets: __m512i,
    slice: *const u8,
) -> __m512i {
    static_assert_imm8_scale!(SCALE);
    let base = slice as *const i8;
    // Sixteen lanes, hence a 16-bit mask; the intrinsic takes it as a signed integer.
    let lane_mask = mask as i16;
    let gathered = vpgatherdd(src.as_i32x16(), base, offsets.as_i32x16(), lane_mask, SCALE);
    transmute(gathered)
}

/// Gathers 64-bit integers from memory using the 32-bit indices in `offsets`. Each 64-bit
/// element is loaded from the address starting at `slice` and offset by the corresponding index,
/// with each index scaled by `SCALE`. `SCALE` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32gather_epi64&expand=2994)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_i32gather_epi64<const SCALE: i32>(
    offsets: __m256i,
    slice: *const u8,
) -> __m512i {
    static_assert_imm8_scale!(SCALE);
    // An all-ones mask selects every lane, so the zeroed pass-through vector is never observed.
    let all_lanes = -1;
    let gathered = vpgatherdq(
        _mm512_setzero_si512().as_i64x8(),
        slice as *const i8,
        offsets.as_i32x8(),
        all_lanes,
        SCALE,
    );
    transmute(gathered)
}

/// Gathers 64-bit integers from memory using the 32-bit indices in `offsets`. Each 64-bit
/// element is loaded from the address starting at `slice` and offset by the corresponding index,
/// with each index scaled by `SCALE`. Elements whose bit in writemask `mask` is not set are
/// copied from `src` instead. `SCALE` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32gather_epi64&expand=2995)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_i32gather_epi64<const SCALE: i32>(
    src: __m512i,
    mask: __mmask8,
    offsets: __m256i,
    slice: *const u8,
) -> __m512i {
    static_assert_imm8_scale!(SCALE);
    let base = slice as *const i8;
    // The intrinsic takes the writemask as a signed integer.
    let lane_mask = mask as i8;
    let gathered = vpgatherdq(src.as_i64x8(), base, offsets.as_i32x8(), lane_mask, SCALE);
    transmute(gathered)
}

/// Gathers 64-bit integers from memory using the 64-bit indices in `offsets`. Each 64-bit
/// element is loaded from the address starting at `slice` and offset by the corresponding index,
/// with each index scaled by `SCALE`. `SCALE` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_epi64&expand=3084)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_i64gather_epi64<const SCALE: i32>(
    offsets: __m512i,
    slice: *const u8,
) -> __m512i {
    static_assert_imm8_scale!(SCALE);
    // An all-ones mask selects every lane, so the zeroed pass-through vector is never observed.
    let all_lanes = -1;
    let gathered = vpgatherqq(
        _mm512_setzero_si512().as_i64x8(),
        slice as *const i8,
        offsets.as_i64x8(),
        all_lanes,
        SCALE,
    );
    transmute(gathered)
}

/// Gathers 64-bit integers from memory using the 64-bit indices in `offsets`. Each 64-bit
/// element is loaded from the address starting at `slice` and offset by the corresponding index,
/// with each index scaled by `SCALE`. Elements whose bit in writemask `mask` is not set are
/// copied from `src` instead. `SCALE` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_epi64&expand=3085)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_i64gather_epi64<const SCALE: i32>(
    src: __m512i,
    mask: __mmask8,
    offsets: __m512i,
    slice: *const u8,
) -> __m512i {
    static_assert_imm8_scale!(SCALE);
    let base = slice as *const i8;
    // The intrinsic takes the writemask as a signed integer.
    let lane_mask = mask as i8;
    let gathered = vpgatherqq(src.as_i64x8(), base, offsets.as_i64x8(), lane_mask, SCALE);
    transmute(gathered)
}

/// Gathers 32-bit integers from memory using the 64-bit indices in `offsets`. Each 32-bit
/// element is loaded from the address starting at `slice` and offset by the corresponding index,
/// with each index scaled by `SCALE`. `SCALE` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64gather_epi32&expand=3074)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_i64gather_epi32<const SCALE: i32>(
    offsets: __m512i,
    slice: *const u8,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    // An all-ones mask selects every lane, so the zeroed pass-through vector is never observed.
    let all_lanes = -1;
    let gathered = vpgatherqd(
        _mm256_setzero_si256().as_i32x8(),
        slice as *const i8,
        offsets.as_i64x8(),
        all_lanes,
        SCALE,
    );
    transmute(gathered)
}

/// Gathers 32-bit integers from memory using the 64-bit indices in `offsets`. Each 32-bit
/// element is loaded from the address starting at `slice` and offset by the corresponding index,
/// with each index scaled by `SCALE`. Elements whose bit in writemask `mask` is not set are
/// copied from `src` instead. `SCALE` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64gather_epi32&expand=3075)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_i64gather_epi32<const SCALE: i32>(
    src: __m256i,
    mask: __mmask8,
    offsets: __m512i,
    slice: *const u8,
) -> __m256i {
    static_assert_imm8_scale!(SCALE);
    let base = slice as *const i8;
    // The intrinsic takes the writemask as a signed integer.
    let lane_mask = mask as i8;
    let gathered = vpgatherqd(src.as_i32x8(), base, offsets.as_i64x8(), lane_mask, SCALE);
    transmute(gathered)
}

/// Scatters the double-precision (64-bit) floating-point elements of `src` into memory using the
/// 32-bit indices in `offsets`. Each 64-bit element is stored at the address starting at `slice`
/// and offset by the corresponding index, with each index scaled by `SCALE`. `SCALE` should be
/// 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32scatter_pd&expand=3044)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_i32scatter_pd<const SCALE: i32>(
    slice: *mut u8,
    offsets: __m256i,
    src: __m512d,
) {
    static_assert_imm8_scale!(SCALE);
    // An all-ones mask means every lane is stored.
    let all_lanes = -1;
    vscatterdpd(slice as *mut i8, all_lanes, offsets.as_i32x8(), src.as_f64x8(), SCALE);
}

/// Scatters the double-precision (64-bit) floating-point elements of `src` into memory using the
/// 32-bit indices in `offsets`. Each 64-bit element is stored at the address starting at `slice`
/// and offset by the corresponding index, with each index scaled by `SCALE`. Elements whose bit
/// in `mask` is not set are not stored. `SCALE` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32scatter_pd&expand=3045)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscatterdpd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_i32scatter_pd<const SCALE: i32>(
    slice: *mut u8,
    mask: __mmask8,
    offsets: __m256i,
    src: __m512d,
) {
    static_assert_imm8_scale!(SCALE);
    let base = slice as *mut i8;
    // The intrinsic takes the mask as a signed integer.
    let lane_mask = mask as i8;
    vscatterdpd(base, lane_mask, offsets.as_i32x8(), src.as_f64x8(), SCALE);
}

/// Scatters the double-precision (64-bit) floating-point elements of `src` into memory using the
/// 64-bit indices in `offsets`. Each 64-bit element is stored at the address starting at `slice`
/// and offset by the corresponding index, with each index scaled by `SCALE`. `SCALE` should be
/// 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_pd&expand=3122)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_i64scatter_pd<const SCALE: i32>(
    slice: *mut u8,
    offsets: __m512i,
    src: __m512d,
) {
    static_assert_imm8_scale!(SCALE);
    // An all-ones mask means every lane is stored.
    let all_lanes = -1;
    vscatterqpd(slice as *mut i8, all_lanes, offsets.as_i64x8(), src.as_f64x8(), SCALE);
}

/// Scatters the double-precision (64-bit) floating-point elements of `src` into memory using the
/// 64-bit indices in `offsets`. Each 64-bit element is stored at the address starting at `slice`
/// and offset by the corresponding index, with each index scaled by `SCALE`. Elements whose bit
/// in `mask` is not set are not stored. `SCALE` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_pd&expand=3123)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscatterqpd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_i64scatter_pd<const SCALE: i32>(
    slice: *mut u8,
    mask: __mmask8,
    offsets: __m512i,
    src: __m512d,
) {
    static_assert_imm8_scale!(SCALE);
    let base = slice as *mut i8;
    // The intrinsic takes the mask as a signed integer.
    let lane_mask = mask as i8;
    vscatterqpd(base, lane_mask, offsets.as_i64x8(), src.as_f64x8(), SCALE);
}

/// Scatters the single-precision (32-bit) floating-point elements of `src` into memory using the
/// 32-bit indices in `offsets`. Each 32-bit element is stored at the address starting at `slice`
/// and offset by the corresponding index, with each index scaled by `SCALE`. `SCALE` should be
/// 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32scatter_ps&expand=3050)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_i32scatter_ps<const SCALE: i32>(
    slice: *mut u8,
    offsets: __m512i,
    src: __m512,
) {
    static_assert_imm8_scale!(SCALE);
    // An all-ones mask means every lane is stored.
    let all_lanes = -1;
    vscatterdps(slice as *mut i8, all_lanes, offsets.as_i32x16(), src.as_f32x16(), SCALE);
}

/// Scatters the single-precision (32-bit) floating-point elements of `src` into memory using the
/// 32-bit indices in `offsets`. Each 32-bit element is stored at the address starting at `slice`
/// and offset by the corresponding index, with each index scaled by `SCALE`. Elements whose bit
/// in `mask` is not set are not stored. `SCALE` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32scatter_ps&expand=3051)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscatterdps, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_i32scatter_ps<const SCALE: i32>(
    slice: *mut u8,
    mask: __mmask16,
    offsets: __m512i,
    src: __m512,
) {
    static_assert_imm8_scale!(SCALE);
    let base = slice as *mut i8;
    // Sixteen lanes, hence a 16-bit mask; the intrinsic takes it as a signed integer.
    let lane_mask = mask as i16;
    vscatterdps(base, lane_mask, offsets.as_i32x16(), src.as_f32x16(), SCALE);
}

/// Scatters the single-precision (32-bit) floating-point elements of `src` into memory using the
/// 64-bit indices in `offsets`. Each 32-bit element is stored at the address starting at `slice`
/// and offset by the corresponding index, with each index scaled by `SCALE`. `SCALE` should be
/// 1, 2, 4 or 8.
///
/// This is the unmasked variant: it takes no mask argument and passes an all-ones mask to the
/// underlying intrinsic. See `_mm512_mask_i64scatter_ps` for the masked form.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_ps&expand=3128)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_i64scatter_ps<const SCALE: i32>(
    slice: *mut u8,
    offsets: __m512i,
    src: __m256,
) {
    static_assert_imm8_scale!(SCALE);
    // An all-ones mask means every lane is stored.
    let all_lanes = -1;
    vscatterqps(slice as *mut i8, all_lanes, offsets.as_i64x8(), src.as_f32x8(), SCALE);
}

/// Scatters the single-precision (32-bit) floating-point elements of `src` into memory using the
/// 64-bit indices in `offsets`. Each 32-bit element is stored at the address starting at `slice`
/// and offset by the corresponding index, with each index scaled by `SCALE`. Elements whose bit
/// in `mask` is not set are not stored. `SCALE` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_ps&expand=3129)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscatterqps, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_i64scatter_ps<const SCALE: i32>(
    slice: *mut u8,
    mask: __mmask8,
    offsets: __m512i,
    src: __m256,
) {
    static_assert_imm8_scale!(SCALE);
    let base = slice as *mut i8;
    // The intrinsic takes the mask as a signed integer.
    let lane_mask = mask as i8;
    vscatterqps(base, lane_mask, offsets.as_i64x8(), src.as_f32x8(), SCALE);
}

/// Scatters the 64-bit integers of `src` into memory using the 32-bit indices in `offsets`. Each
/// 64-bit element is stored at the address starting at `slice` and offset by the corresponding
/// index, with each index scaled by `SCALE`. `SCALE` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32scatter_epi64&expand=3038)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_i32scatter_epi64<const SCALE: i32>(
    slice: *mut u8,
    offsets: __m256i,
    src: __m512i,
) {
    static_assert_imm8_scale!(SCALE);
    // An all-ones mask means every lane is stored.
    let all_lanes = -1;
    vpscatterdq(slice as *mut i8, all_lanes, offsets.as_i32x8(), src.as_i64x8(), SCALE);
}

/// Scatters the 64-bit integers of `src` into memory using the 32-bit indices in `offsets`. Each
/// 64-bit element is stored at the address starting at `slice` and offset by the corresponding
/// index, with each index scaled by `SCALE`. Elements whose bit in `mask` is not set are not
/// stored. `SCALE` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32scatter_epi64&expand=3039)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpscatterdq, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_i32scatter_epi64<const SCALE: i32>(
    slice: *mut u8,
    mask: __mmask8,
    offsets: __m256i,
    src: __m512i,
) {
    static_assert_imm8_scale!(SCALE);
    let base = slice as *mut i8;
    // The intrinsic takes the mask as a signed integer.
    let lane_mask = mask as i8;
    vpscatterdq(base, lane_mask, offsets.as_i32x8(), src.as_i64x8(), SCALE);
}

/// Scatters the 64-bit integers of `src` into memory using the 64-bit indices in `offsets`. Each
/// 64-bit element is stored at the address starting at `slice` and offset by the corresponding
/// index, with each index scaled by `SCALE`. `SCALE` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_epi64&expand=3116)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_i64scatter_epi64<const SCALE: i32>(
    slice: *mut u8,
    offsets: __m512i,
    src: __m512i,
) {
    static_assert_imm8_scale!(SCALE);
    // An all-ones mask means every lane is stored.
    let all_lanes = -1;
    vpscatterqq(slice as *mut i8, all_lanes, offsets.as_i64x8(), src.as_i64x8(), SCALE);
}

/// Scatters the 64-bit integers of `src` into memory using the 64-bit indices in `offsets`. Each
/// 64-bit element is stored at the address starting at `slice` and offset by the corresponding
/// index, with each index scaled by `SCALE`. Elements whose bit in `mask` is not set are not
/// stored. `SCALE` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_epi64&expand=3117)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpscatterqq, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_i64scatter_epi64<const SCALE: i32>(
    slice: *mut u8,
    mask: __mmask8,
    offsets: __m512i,
    src: __m512i,
) {
    static_assert_imm8_scale!(SCALE);
    let base = slice as *mut i8;
    // The intrinsic takes the mask as a signed integer.
    let lane_mask = mask as i8;
    vpscatterqq(base, lane_mask, offsets.as_i64x8(), src.as_i64x8(), SCALE);
}

/// Scatters the 32-bit integers of `src` into memory using the 32-bit indices in `offsets`. Each
/// 32-bit element is stored at the address starting at `slice` and offset by the corresponding
/// index, with each index scaled by `SCALE`. `SCALE` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i32scatter_epi32&expand=3032)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_i32scatter_epi32<const SCALE: i32>(
    slice: *mut u8,
    offsets: __m512i,
    src: __m512i,
) {
    static_assert_imm8_scale!(SCALE);
    // An all-ones mask means every lane is stored.
    let all_lanes = -1;
    vpscatterdd(slice as *mut i8, all_lanes, offsets.as_i32x16(), src.as_i32x16(), SCALE);
}

/// Scatters the 32-bit integers of `src` into memory using the 32-bit indices in `offsets`. Each
/// 32-bit element is stored at the address starting at `slice` and offset by the corresponding
/// index, with each index scaled by `SCALE`. Elements whose bit in `mask` is not set are not
/// stored. `SCALE` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i32scatter_epi32&expand=3033)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpscatterdd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_i32scatter_epi32<const SCALE: i32>(
    slice: *mut u8,
    mask: __mmask16,
    offsets: __m512i,
    src: __m512i,
) {
    static_assert_imm8_scale!(SCALE);
    let base = slice as *mut i8;
    // Sixteen lanes, hence a 16-bit mask; the intrinsic takes it as a signed integer.
    let lane_mask = mask as i16;
    vpscatterdd(base, lane_mask, offsets.as_i32x16(), src.as_i32x16(), SCALE);
}

/// Scatters the 32-bit integers of `src` into memory using the 64-bit indices in `offsets`. Each
/// 32-bit element is stored at the address starting at `slice` and offset by the corresponding
/// index, with each index scaled by `SCALE`. `SCALE` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_i64scatter_epi32&expand=3108)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_i64scatter_epi32<const SCALE: i32>(
    slice: *mut u8,
    offsets: __m512i,
    src: __m256i,
) {
    static_assert_imm8_scale!(SCALE);
    // An all-ones mask means every lane is stored.
    let all_lanes = -1;
    vpscatterqd(slice as *mut i8, all_lanes, offsets.as_i64x8(), src.as_i32x8(), SCALE);
}

/// Scatters the 32-bit integers of `src` into memory using the 64-bit indices in `offsets`. Each
/// 32-bit element is stored at the address starting at `slice` and offset by the corresponding
/// index, with each index scaled by `SCALE`. Elements whose bit in `mask` is not set are not
/// stored. `SCALE` should be 1, 2, 4 or 8.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_i64scatter_epi32&expand=3109)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpscatterqd, SCALE = 1))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_i64scatter_epi32<const SCALE: i32>(
    slice: *mut u8,
    mask: __mmask8,
    offsets: __m512i,
    src: __m256i,
) {
    static_assert_imm8_scale!(SCALE);
    let base = slice as *mut i8;
    // The intrinsic takes the mask as a signed integer.
    let lane_mask = mask as i8;
    vpscatterqd(base, lane_mask, offsets.as_i64x8(), src.as_i32x8(), SCALE);
}

/// Contiguously stores the active 32-bit integers of `a` (those whose bit in writemask `k` is
/// set) into the result, and passes through the remaining elements from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compress_epi32&expand=1198)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcompressd))]
pub unsafe fn _mm512_mask_compress_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
    let values = a.as_i32x16();
    let passthrough = src.as_i32x16();
    let compressed = vpcompressd(values, passthrough, k);
    transmute(compressed)
}

/// Packs the active 32-bit integers of `a` (the ones whose bit is set in zeromask `k`)
/// contiguously into dst; the remaining elements of dst are set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_compress_epi32&expand=1199)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcompressd))]
pub unsafe fn _mm512_maskz_compress_epi32(k: __mmask16, a: __m512i) -> __m512i {
    let values = a.as_i32x16();
    // An all-zero vector is the pass-through operand, so non-packed elements come out as zero.
    let zeros = _mm512_setzero_si512().as_i32x16();
    let packed = vpcompressd(values, zeros, k);
    transmute(packed)
}

/// Packs the active 32-bit integers of `a` (the ones whose bit is set in writemask `k`)
/// contiguously into dst; the remaining elements of dst are passed through from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compress_epi32&expand=1196)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressd))]
pub unsafe fn _mm256_mask_compress_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
    let values = a.as_i32x8();
    let passthrough = src.as_i32x8();
    let packed = vpcompressd256(values, passthrough, k);
    transmute(packed)
}

/// Packs the active 32-bit integers of `a` (the ones whose bit is set in zeromask `k`)
/// contiguously into dst; the remaining elements of dst are set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_compress_epi32&expand=1197)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressd))]
pub unsafe fn _mm256_maskz_compress_epi32(k: __mmask8, a: __m256i) -> __m256i {
    let values = a.as_i32x8();
    // An all-zero vector is the pass-through operand, so non-packed elements come out as zero.
    let zeros = _mm256_setzero_si256().as_i32x8();
    let packed = vpcompressd256(values, zeros, k);
    transmute(packed)
}

/// Packs the active 32-bit integers of `a` (the ones whose bit is set in writemask `k`)
/// contiguously into dst; the remaining elements of dst are passed through from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compress_epi32&expand=1194)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressd))]
pub unsafe fn _mm_mask_compress_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let values = a.as_i32x4();
    let passthrough = src.as_i32x4();
    let packed = vpcompressd128(values, passthrough, k);
    transmute(packed)
}

/// Packs the active 32-bit integers of `a` (the ones whose bit is set in zeromask `k`)
/// contiguously into dst; the remaining elements of dst are set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_compress_epi32&expand=1195)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressd))]
pub unsafe fn _mm_maskz_compress_epi32(k: __mmask8, a: __m128i) -> __m128i {
    let values = a.as_i32x4();
    // An all-zero vector is the pass-through operand, so non-packed elements come out as zero.
    let zeros = _mm_setzero_si128().as_i32x4();
    let packed = vpcompressd128(values, zeros, k);
    transmute(packed)
}

/// Packs the active 64-bit integers of `a` (the ones whose bit is set in writemask `k`)
/// contiguously into dst; the remaining elements of dst are passed through from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compress_epi64&expand=1204)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcompressq))]
pub unsafe fn _mm512_mask_compress_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
    let values = a.as_i64x8();
    let passthrough = src.as_i64x8();
    let packed = vpcompressq(values, passthrough, k);
    transmute(packed)
}

/// Packs the active 64-bit integers of `a` (the ones whose bit is set in zeromask `k`)
/// contiguously into dst; the remaining elements of dst are set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_compress_epi64&expand=1205)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcompressq))]
pub unsafe fn _mm512_maskz_compress_epi64(k: __mmask8, a: __m512i) -> __m512i {
    let values = a.as_i64x8();
    // An all-zero vector is the pass-through operand, so non-packed elements come out as zero.
    let zeros = _mm512_setzero_si512().as_i64x8();
    let packed = vpcompressq(values, zeros, k);
    transmute(packed)
}

/// Packs the active 64-bit integers of `a` (the ones whose bit is set in writemask `k`)
/// contiguously into dst; the remaining elements of dst are passed through from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compress_epi64&expand=1202)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressq))]
pub unsafe fn _mm256_mask_compress_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
    let values = a.as_i64x4();
    let passthrough = src.as_i64x4();
    let packed = vpcompressq256(values, passthrough, k);
    transmute(packed)
}

/// Packs the active 64-bit integers of `a` (the ones whose bit is set in zeromask `k`)
/// contiguously into dst; the remaining elements of dst are set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_compress_epi64&expand=1203)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressq))]
pub unsafe fn _mm256_maskz_compress_epi64(k: __mmask8, a: __m256i) -> __m256i {
    let values = a.as_i64x4();
    // An all-zero vector is the pass-through operand, so non-packed elements come out as zero.
    let zeros = _mm256_setzero_si256().as_i64x4();
    let packed = vpcompressq256(values, zeros, k);
    transmute(packed)
}

/// Packs the active 64-bit integers of `a` (the ones whose bit is set in writemask `k`)
/// contiguously into dst; the remaining elements of dst are passed through from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compress_epi64&expand=1200)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressq))]
pub unsafe fn _mm_mask_compress_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let values = a.as_i64x2();
    let passthrough = src.as_i64x2();
    let packed = vpcompressq128(values, passthrough, k);
    transmute(packed)
}

/// Packs the active 64-bit integers of `a` (the ones whose bit is set in zeromask `k`)
/// contiguously into dst; the remaining elements of dst are set to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_compress_epi64&expand=1201)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressq))]
pub unsafe fn _mm_maskz_compress_epi64(k: __mmask8, a: __m128i) -> __m128i {
    let values = a.as_i64x2();
    // An all-zero vector is the pass-through operand, so non-packed elements come out as zero.
    let zeros = _mm_setzero_si128().as_i64x2();
    let packed = vpcompressq128(values, zeros, k);
    transmute(packed)
}

/// Packs the active single-precision (32-bit) floating-point elements of `a` (the ones whose
/// bit is set in writemask `k`) contiguously into dst; the remaining elements of dst are
/// passed through from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compress_ps&expand=1222)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcompressps))]
pub unsafe fn _mm512_mask_compress_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
    let values = a.as_f32x16();
    let passthrough = src.as_f32x16();
    let packed = vcompressps(values, passthrough, k);
    transmute(packed)
}

/// Packs the active single-precision (32-bit) floating-point elements of `a` (the ones whose
/// bit is set in zeromask `k`) contiguously into dst; the remaining elements of dst are set
/// to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_compress_ps&expand=1223)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcompressps))]
pub unsafe fn _mm512_maskz_compress_ps(k: __mmask16, a: __m512) -> __m512 {
    let values = a.as_f32x16();
    // An all-zero vector is the pass-through operand, so non-packed elements come out as zero.
    let zeros = _mm512_setzero_ps().as_f32x16();
    let packed = vcompressps(values, zeros, k);
    transmute(packed)
}

/// Packs the active single-precision (32-bit) floating-point elements of `a` (the ones whose
/// bit is set in writemask `k`) contiguously into dst; the remaining elements of dst are
/// passed through from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compress_ps&expand=1220)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompressps))]
pub unsafe fn _mm256_mask_compress_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 {
    let values = a.as_f32x8();
    let passthrough = src.as_f32x8();
    let packed = vcompressps256(values, passthrough, k);
    transmute(packed)
}

/// Packs the active single-precision (32-bit) floating-point elements of `a` (the ones whose
/// bit is set in zeromask `k`) contiguously into dst; the remaining elements of dst are set
/// to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_compress_ps&expand=1221)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompressps))]
pub unsafe fn _mm256_maskz_compress_ps(k: __mmask8, a: __m256) -> __m256 {
    let values = a.as_f32x8();
    // An all-zero vector is the pass-through operand, so non-packed elements come out as zero.
    let zeros = _mm256_setzero_ps().as_f32x8();
    let packed = vcompressps256(values, zeros, k);
    transmute(packed)
}

/// Packs the active single-precision (32-bit) floating-point elements of `a` (the ones whose
/// bit is set in writemask `k`) contiguously into dst; the remaining elements of dst are
/// passed through from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compress_ps&expand=1218)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompressps))]
pub unsafe fn _mm_mask_compress_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 {
    let values = a.as_f32x4();
    let passthrough = src.as_f32x4();
    let packed = vcompressps128(values, passthrough, k);
    transmute(packed)
}

/// Packs the active single-precision (32-bit) floating-point elements of `a` (the ones whose
/// bit is set in zeromask `k`) contiguously into dst; the remaining elements of dst are set
/// to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_compress_ps&expand=1219)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompressps))]
pub unsafe fn _mm_maskz_compress_ps(k: __mmask8, a: __m128) -> __m128 {
    let values = a.as_f32x4();
    // An all-zero vector is the pass-through operand, so non-packed elements come out as zero.
    let zeros = _mm_setzero_ps().as_f32x4();
    let packed = vcompressps128(values, zeros, k);
    transmute(packed)
}

/// Packs the active double-precision (64-bit) floating-point elements of `a` (the ones whose
/// bit is set in writemask `k`) contiguously into dst; the remaining elements of dst are
/// passed through from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compress_pd&expand=1216)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcompresspd))]
pub unsafe fn _mm512_mask_compress_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
    let values = a.as_f64x8();
    let passthrough = src.as_f64x8();
    let packed = vcompresspd(values, passthrough, k);
    transmute(packed)
}

/// Packs the active double-precision (64-bit) floating-point elements of `a` (the ones whose
/// bit is set in zeromask `k`) contiguously into dst; the remaining elements of dst are set
/// to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_compress_pd&expand=1217)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcompresspd))]
pub unsafe fn _mm512_maskz_compress_pd(k: __mmask8, a: __m512d) -> __m512d {
    let values = a.as_f64x8();
    // An all-zero vector is the pass-through operand, so non-packed elements come out as zero.
    let zeros = _mm512_setzero_pd().as_f64x8();
    let packed = vcompresspd(values, zeros, k);
    transmute(packed)
}

/// Packs the active double-precision (64-bit) floating-point elements of `a` (the ones whose
/// bit is set in writemask `k`) contiguously into dst; the remaining elements of dst are
/// passed through from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compress_pd&expand=1214)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompresspd))]
pub unsafe fn _mm256_mask_compress_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d {
    let values = a.as_f64x4();
    let passthrough = src.as_f64x4();
    let packed = vcompresspd256(values, passthrough, k);
    transmute(packed)
}

/// Packs the active double-precision (64-bit) floating-point elements of `a` (the ones whose
/// bit is set in zeromask `k`) contiguously into dst; the remaining elements of dst are set
/// to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_compress_pd&expand=1215)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompresspd))]
pub unsafe fn _mm256_maskz_compress_pd(k: __mmask8, a: __m256d) -> __m256d {
    let values = a.as_f64x4();
    // An all-zero vector is the pass-through operand, so non-packed elements come out as zero.
    let zeros = _mm256_setzero_pd().as_f64x4();
    let packed = vcompresspd256(values, zeros, k);
    transmute(packed)
}

/// Packs the active double-precision (64-bit) floating-point elements of `a` (the ones whose
/// bit is set in writemask `k`) contiguously into dst; the remaining elements of dst are
/// passed through from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compress_pd&expand=1212)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompresspd))]
pub unsafe fn _mm_mask_compress_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d {
    let values = a.as_f64x2();
    let passthrough = src.as_f64x2();
    let packed = vcompresspd128(values, passthrough, k);
    transmute(packed)
}

/// Packs the active double-precision (64-bit) floating-point elements of `a` (the ones whose
/// bit is set in zeromask `k`) contiguously into dst; the remaining elements of dst are set
/// to zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_compress_pd&expand=1213)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompresspd))]
pub unsafe fn _mm_maskz_compress_pd(k: __mmask8, a: __m128d) -> __m128d {
    let values = a.as_f64x2();
    // An all-zero vector is the pass-through operand, so non-packed elements come out as zero.
    let zeros = _mm_setzero_pd().as_f64x2();
    let packed = vcompresspd128(values, zeros, k);
    transmute(packed)
}

/// Writes the active 32-bit integers of `a` (the ones whose bit is set in writemask `k`)
/// contiguously to unaligned memory at `base_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_epi32)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcompressd))]
pub unsafe fn _mm512_mask_compressstoreu_epi32(base_addr: *mut u8, k: __mmask16, a: __m512i) {
    let values = a.as_i32x16();
    // The pointer is re-typed to whatever pointee type the store intrinsic declares.
    vcompressstored(base_addr as *mut _, values, k);
}

/// Writes the active 32-bit integers of `a` (the ones whose bit is set in writemask `k`)
/// contiguously to unaligned memory at `base_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressd))]
pub unsafe fn _mm256_mask_compressstoreu_epi32(base_addr: *mut u8, k: __mmask8, a: __m256i) {
    let values = a.as_i32x8();
    // The pointer is re-typed to whatever pointee type the store intrinsic declares.
    vcompressstored256(base_addr as *mut _, values, k);
}

/// Writes the active 32-bit integers of `a` (the ones whose bit is set in writemask `k`)
/// contiguously to unaligned memory at `base_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressd))]
pub unsafe fn _mm_mask_compressstoreu_epi32(base_addr: *mut u8, k: __mmask8, a: __m128i) {
    let values = a.as_i32x4();
    // The pointer is re-typed to whatever pointee type the store intrinsic declares.
    vcompressstored128(base_addr as *mut _, values, k);
}

/// Writes the active 64-bit integers of `a` (the ones whose bit is set in writemask `k`)
/// contiguously to unaligned memory at `base_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_epi64)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcompressq))]
pub unsafe fn _mm512_mask_compressstoreu_epi64(base_addr: *mut u8, k: __mmask8, a: __m512i) {
    let values = a.as_i64x8();
    // The pointer is re-typed to whatever pointee type the store intrinsic declares.
    vcompressstoreq(base_addr as *mut _, values, k);
}

/// Writes the active 64-bit integers of `a` (the ones whose bit is set in writemask `k`)
/// contiguously to unaligned memory at `base_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressq))]
pub unsafe fn _mm256_mask_compressstoreu_epi64(base_addr: *mut u8, k: __mmask8, a: __m256i) {
    let values = a.as_i64x4();
    // The pointer is re-typed to whatever pointee type the store intrinsic declares.
    vcompressstoreq256(base_addr as *mut _, values, k);
}

/// Writes the active 64-bit integers of `a` (the ones whose bit is set in writemask `k`)
/// contiguously to unaligned memory at `base_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcompressq))]
pub unsafe fn _mm_mask_compressstoreu_epi64(base_addr: *mut u8, k: __mmask8, a: __m128i) {
    let values = a.as_i64x2();
    // The pointer is re-typed to whatever pointee type the store intrinsic declares.
    vcompressstoreq128(base_addr as *mut _, values, k);
}

/// Writes the active single-precision (32-bit) floating-point elements of `a` (the ones whose
/// bit is set in writemask `k`) contiguously to unaligned memory at `base_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_ps)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcompressps))]
pub unsafe fn _mm512_mask_compressstoreu_ps(base_addr: *mut u8, k: __mmask16, a: __m512) {
    let values = a.as_f32x16();
    // The pointer is re-typed to whatever pointee type the store intrinsic declares.
    vcompressstoreps(base_addr as *mut _, values, k);
}

/// Writes the active single-precision (32-bit) floating-point elements of `a` (the ones whose
/// bit is set in writemask `k`) contiguously to unaligned memory at `base_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompressps))]
pub unsafe fn _mm256_mask_compressstoreu_ps(base_addr: *mut u8, k: __mmask8, a: __m256) {
    let values = a.as_f32x8();
    // The pointer is re-typed to whatever pointee type the store intrinsic declares.
    vcompressstoreps256(base_addr as *mut _, values, k);
}

/// Writes the active single-precision (32-bit) floating-point elements of `a` (the ones whose
/// bit is set in writemask `k`) contiguously to unaligned memory at `base_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompressps))]
pub unsafe fn _mm_mask_compressstoreu_ps(base_addr: *mut u8, k: __mmask8, a: __m128) {
    let values = a.as_f32x4();
    // The pointer is re-typed to whatever pointee type the store intrinsic declares.
    vcompressstoreps128(base_addr as *mut _, values, k);
}

/// Writes the active double-precision (64-bit) floating-point elements of `a` (the ones whose
/// bit is set in writemask `k`) contiguously to unaligned memory at `base_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_compressstoreu_pd)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcompresspd))]
pub unsafe fn _mm512_mask_compressstoreu_pd(base_addr: *mut u8, k: __mmask8, a: __m512d) {
    let values = a.as_f64x8();
    // The pointer is re-typed to whatever pointee type the store intrinsic declares.
    vcompressstorepd(base_addr as *mut _, values, k);
}

/// Writes the active double-precision (64-bit) floating-point elements of `a` (the ones whose
/// bit is set in writemask `k`) contiguously to unaligned memory at `base_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_compressstoreu_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompresspd))]
pub unsafe fn _mm256_mask_compressstoreu_pd(base_addr: *mut u8, k: __mmask8, a: __m256d) {
    let values = a.as_f64x4();
    // The pointer is re-typed to whatever pointee type the store intrinsic declares.
    vcompressstorepd256(base_addr as *mut _, values, k);
}

/// Writes the active double-precision (64-bit) floating-point elements of `a` (the ones whose
/// bit is set in writemask `k`) contiguously to unaligned memory at `base_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_compressstoreu_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vcompresspd))]
pub unsafe fn _mm_mask_compressstoreu_pd(base_addr: *mut u8, k: __mmask8, a: __m128d) {
    let values = a.as_f64x2();
    // The pointer is re-typed to whatever pointee type the store intrinsic declares.
    vcompressstorepd128(base_addr as *mut _, values, k);
}

/// Loads contiguous active 32-bit integers from `a` (those with their respective bit set in
/// mask `k`) and stores the results in dst using writemask `k`: elements are copied from
/// `src` when the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expand_epi32&expand=2316)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpexpandd))]
pub unsafe fn _mm512_mask_expand_epi32(src: __m512i, k: __mmask16, a: __m512i) -> __m512i {
    let values = a.as_i32x16();
    let fallback = src.as_i32x16();
    let expanded = vpexpandd(values, fallback, k);
    transmute(expanded)
}

/// Loads contiguous active 32-bit integers from `a` (those with their respective bit set in
/// mask `k`) and stores the results in dst using zeromask `k`: elements are zeroed out when
/// the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expand_epi32&expand=2317)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpexpandd))]
pub unsafe fn _mm512_maskz_expand_epi32(k: __mmask16, a: __m512i) -> __m512i {
    let values = a.as_i32x16();
    // An all-zero vector is the fallback operand, so masked-off elements come out as zero.
    let zeros = _mm512_setzero_si512().as_i32x16();
    let expanded = vpexpandd(values, zeros, k);
    transmute(expanded)
}

/// Loads contiguous active 32-bit integers from `a` (those with their respective bit set in
/// mask `k`) and stores the results in dst using writemask `k`: elements are copied from
/// `src` when the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expand_epi32&expand=2314)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpexpandd))]
pub unsafe fn _mm256_mask_expand_epi32(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
    let values = a.as_i32x8();
    let fallback = src.as_i32x8();
    let expanded = vpexpandd256(values, fallback, k);
    transmute(expanded)
}

/// Loads contiguous active 32-bit integers from `a` (those with their respective bit set in
/// mask `k`) and stores the results in dst using zeromask `k`: elements are zeroed out when
/// the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expand_epi32&expand=2315)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpexpandd))]
pub unsafe fn _mm256_maskz_expand_epi32(k: __mmask8, a: __m256i) -> __m256i {
    let values = a.as_i32x8();
    // An all-zero vector is the fallback operand, so masked-off elements come out as zero.
    let zeros = _mm256_setzero_si256().as_i32x8();
    let expanded = vpexpandd256(values, zeros, k);
    transmute(expanded)
}

/// Loads contiguous active 32-bit integers from `a` (those with their respective bit set in
/// mask `k`) and stores the results in dst using writemask `k`: elements are copied from
/// `src` when the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expand_epi32&expand=2312)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpexpandd))]
pub unsafe fn _mm_mask_expand_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let values = a.as_i32x4();
    let fallback = src.as_i32x4();
    let expanded = vpexpandd128(values, fallback, k);
    transmute(expanded)
}

/// Loads contiguous active 32-bit integers from `a` (those with their respective bit set in
/// mask `k`) and stores the results in dst using zeromask `k`: elements are zeroed out when
/// the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expand_epi32&expand=2313)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpexpandd))]
pub unsafe fn _mm_maskz_expand_epi32(k: __mmask8, a: __m128i) -> __m128i {
    let values = a.as_i32x4();
    // An all-zero vector is the fallback operand, so masked-off elements come out as zero.
    let zeros = _mm_setzero_si128().as_i32x4();
    let expanded = vpexpandd128(values, zeros, k);
    transmute(expanded)
}

/// Loads contiguous active 64-bit integers from `a` (those with their respective bit set in
/// mask `k`) and stores the results in dst using writemask `k`: elements are copied from
/// `src` when the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expand_epi64&expand=2322)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpexpandq))]
pub unsafe fn _mm512_mask_expand_epi64(src: __m512i, k: __mmask8, a: __m512i) -> __m512i {
    let values = a.as_i64x8();
    let fallback = src.as_i64x8();
    let expanded = vpexpandq(values, fallback, k);
    transmute(expanded)
}

/// Loads contiguous active 64-bit integers from `a` (those with their respective bit set in
/// mask `k`) and stores the results in dst using zeromask `k`: elements are zeroed out when
/// the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expand_epi64&expand=2323)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpexpandq))]
pub unsafe fn _mm512_maskz_expand_epi64(k: __mmask8, a: __m512i) -> __m512i {
    let values = a.as_i64x8();
    // An all-zero vector is the fallback operand, so masked-off elements come out as zero.
    let zeros = _mm512_setzero_si512().as_i64x8();
    let expanded = vpexpandq(values, zeros, k);
    transmute(expanded)
}

/// Loads contiguous active 64-bit integers from `a` (those with their respective bit set in
/// mask `k`) and stores the results in dst using writemask `k`: elements are copied from
/// `src` when the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expand_epi64&expand=2320)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpexpandq))]
pub unsafe fn _mm256_mask_expand_epi64(src: __m256i, k: __mmask8, a: __m256i) -> __m256i {
    let values = a.as_i64x4();
    let fallback = src.as_i64x4();
    let expanded = vpexpandq256(values, fallback, k);
    transmute(expanded)
}

/// Loads contiguous active 64-bit integers from `a` (those with their respective bit set in
/// mask `k`) and stores the results in dst using zeromask `k`: elements are zeroed out when
/// the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expand_epi64&expand=2321)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpexpandq))]
pub unsafe fn _mm256_maskz_expand_epi64(k: __mmask8, a: __m256i) -> __m256i {
    let values = a.as_i64x4();
    // An all-zero vector is the fallback operand, so masked-off elements come out as zero.
    let zeros = _mm256_setzero_si256().as_i64x4();
    let expanded = vpexpandq256(values, zeros, k);
    transmute(expanded)
}

/// Loads contiguous active 64-bit integers from `a` (those with their respective bit set in
/// mask `k`) and stores the results in dst using writemask `k`: elements are copied from
/// `src` when the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expand_epi64&expand=2318)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpexpandq))]
pub unsafe fn _mm_mask_expand_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let values = a.as_i64x2();
    let fallback = src.as_i64x2();
    let expanded = vpexpandq128(values, fallback, k);
    transmute(expanded)
}

/// Loads contiguous active 64-bit integers from `a` (those with their respective bit set in
/// mask `k`) and stores the results in dst using zeromask `k`: elements are zeroed out when
/// the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expand_epi64&expand=2319)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpexpandq))]
pub unsafe fn _mm_maskz_expand_epi64(k: __mmask8, a: __m128i) -> __m128i {
    let values = a.as_i64x2();
    // An all-zero vector is the fallback operand, so masked-off elements come out as zero.
    let zeros = _mm_setzero_si128().as_i64x2();
    let expanded = vpexpandq128(values, zeros, k);
    transmute(expanded)
}

/// Loads contiguous active single-precision (32-bit) floating-point elements from `a` (those
/// with their respective bit set in mask `k`) and stores the results in dst using writemask
/// `k`: elements are copied from `src` when the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expand_ps&expand=2340)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vexpandps))]
pub unsafe fn _mm512_mask_expand_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
    let values = a.as_f32x16();
    let fallback = src.as_f32x16();
    let expanded = vexpandps(values, fallback, k);
    transmute(expanded)
}

/// Loads contiguous active single-precision (32-bit) floating-point elements from `a` (those
/// with their respective bit set in mask `k`) and stores the results in dst using zeromask
/// `k`: elements are zeroed out when the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expand_ps&expand=2341)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vexpandps))]
pub unsafe fn _mm512_maskz_expand_ps(k: __mmask16, a: __m512) -> __m512 {
    let values = a.as_f32x16();
    // An all-zero vector is the fallback operand, so masked-off elements come out as zero.
    let zeros = _mm512_setzero_ps().as_f32x16();
    let expanded = vexpandps(values, zeros, k);
    transmute(expanded)
}

/// Loads contiguous active single-precision (32-bit) floating-point elements from `a` (those
/// with their respective bit set in mask `k`) and stores the results in dst using writemask
/// `k`: elements are copied from `src` when the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expand_ps&expand=2338)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vexpandps))]
pub unsafe fn _mm256_mask_expand_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 {
    let values = a.as_f32x8();
    let fallback = src.as_f32x8();
    let expanded = vexpandps256(values, fallback, k);
    transmute(expanded)
}

/// Loads contiguous active single-precision (32-bit) floating-point elements from `a` (those
/// with their respective bit set in mask `k`) and stores the results in dst using zeromask
/// `k`: elements are zeroed out when the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expand_ps&expand=2339)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vexpandps))]
pub unsafe fn _mm256_maskz_expand_ps(k: __mmask8, a: __m256) -> __m256 {
    let values = a.as_f32x8();
    // An all-zero vector is the fallback operand, so masked-off elements come out as zero.
    let zeros = _mm256_setzero_ps().as_f32x8();
    let expanded = vexpandps256(values, zeros, k);
    transmute(expanded)
}

/// Loads contiguous active single-precision (32-bit) floating-point elements from `a` (those
/// with their respective bit set in mask `k`) and stores the results in dst using writemask
/// `k`: elements are copied from `src` when the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expand_ps&expand=2336)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vexpandps))]
pub unsafe fn _mm_mask_expand_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 {
    let values = a.as_f32x4();
    let fallback = src.as_f32x4();
    let expanded = vexpandps128(values, fallback, k);
    transmute(expanded)
}

/// Loads contiguous active single-precision (32-bit) floating-point elements from `a` (those
/// with their respective bit set in mask `k`) and stores the results in dst using zeromask
/// `k`: elements are zeroed out when the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expand_ps&expand=2337)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vexpandps))]
pub unsafe fn _mm_maskz_expand_ps(k: __mmask8, a: __m128) -> __m128 {
    let values = a.as_f32x4();
    // An all-zero vector is the fallback operand, so masked-off elements come out as zero.
    let zeros = _mm_setzero_ps().as_f32x4();
    let expanded = vexpandps128(values, zeros, k);
    transmute(expanded)
}

/// Loads contiguous active double-precision (64-bit) floating-point elements from `a` (those
/// with their respective bit set in mask `k`) and stores the results in dst using writemask
/// `k`: elements are copied from `src` when the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expand_pd&expand=2334)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vexpandpd))]
pub unsafe fn _mm512_mask_expand_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
    let values = a.as_f64x8();
    let fallback = src.as_f64x8();
    let expanded = vexpandpd(values, fallback, k);
    transmute(expanded)
}

/// Loads contiguous active double-precision (64-bit) floating-point elements from `a` (those
/// with their respective bit set in mask `k`) and stores the results in dst using zeromask
/// `k`: elements are zeroed out when the corresponding mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expand_pd&expand=2335)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vexpandpd))]
pub unsafe fn _mm512_maskz_expand_pd(k: __mmask8, a: __m512d) -> __m512d {
    let values = a.as_f64x8();
    // An all-zero vector is the fallback operand, so masked-off elements come out as zero.
    let zeros = _mm512_setzero_pd().as_f64x8();
    let expanded = vexpandpd(values, zeros, k);
    transmute(expanded)
}

/// Loads contiguous active double-precision (64-bit) floating-point elements from `a` (those
/// whose corresponding bit in mask `k` is set) and returns them using writemask `k`: elements
/// whose mask bit is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expand_pd&expand=2332)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vexpandpd))]
pub unsafe fn _mm256_mask_expand_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d {
    let values = a.as_f64x4();
    let fallback = src.as_f64x4();
    let expanded = vexpandpd256(values, fallback, k);
    transmute(expanded)
}

/// Loads contiguous active double-precision (64-bit) floating-point elements from `a` (those
/// whose corresponding bit in mask `k` is set) and returns them using zeromask `k`: elements
/// whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expand_pd&expand=2333)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vexpandpd))]
pub unsafe fn _mm256_maskz_expand_pd(k: __mmask8, a: __m256d) -> __m256d {
    let values = a.as_f64x4();
    // An all-zero vector stands in for `src`, so unselected lanes come out as zero.
    let zeroed = _mm256_setzero_pd().as_f64x4();
    transmute(vexpandpd256(values, zeroed, k))
}

/// Loads contiguous active double-precision (64-bit) floating-point elements from `a` (those
/// whose corresponding bit in mask `k` is set) and returns them using writemask `k`: elements
/// whose mask bit is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expand_pd&expand=2330)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vexpandpd))]
pub unsafe fn _mm_mask_expand_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d {
    let values = a.as_f64x2();
    let fallback = src.as_f64x2();
    let expanded = vexpandpd128(values, fallback, k);
    transmute(expanded)
}

/// Loads contiguous active double-precision (64-bit) floating-point elements from `a` (those
/// whose corresponding bit in mask `k` is set) and returns them using zeromask `k`: elements
/// whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expand_pd&expand=2331)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vexpandpd))]
pub unsafe fn _mm_maskz_expand_pd(k: __mmask8, a: __m128d) -> __m128d {
    let values = a.as_f64x2();
    // An all-zero vector stands in for `src`, so unselected lanes come out as zero.
    let zeroed = _mm_setzero_pd().as_f64x2();
    transmute(vexpandpd128(values, zeroed, k))
}

/// Rotates each packed 32-bit integer of `a` to the left by the number of bits given in `IMM8`
/// and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rol_epi32&expand=4685)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_rol_epi32<const IMM8: i32>(a: __m512i) -> __m512i {
    static_assert_imm8!(IMM8);
    transmute(vprold(a.as_i32x16(), IMM8))
}

/// Rotates each packed 32-bit integer of `a` to the left by the number of bits given in `IMM8`
/// and returns the result using writemask `k`: elements whose mask bit is not set are copied
/// from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rol_epi32&expand=4683)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_rol_epi32<const IMM8: i32>(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
) -> __m512i {
    static_assert_imm8!(IMM8);
    let rotated = vprold(a.as_i32x16(), IMM8);
    let fallback = src.as_i32x16();
    transmute(simd_select_bitmask(k, rotated, fallback))
}

/// Rotates each packed 32-bit integer of `a` to the left by the number of bits given in `IMM8`
/// and returns the result using zeromask `k`: elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rol_epi32&expand=4684)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_rol_epi32<const IMM8: i32>(k: __mmask16, a: __m512i) -> __m512i {
    static_assert_imm8!(IMM8);
    let rotated = vprold(a.as_i32x16(), IMM8);
    let zeroed = _mm512_setzero_si512().as_i32x16();
    let selected = simd_select_bitmask(k, rotated, zeroed);
    transmute(selected)
}

/// Rotates each packed 32-bit integer of `a` to the left by the number of bits given in `IMM8`
/// and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rol_epi32&expand=4682)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm256_rol_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_imm8!(IMM8);
    transmute(vprold256(a.as_i32x8(), IMM8))
}

/// Rotates each packed 32-bit integer of `a` to the left by the number of bits given in `IMM8`
/// and returns the result using writemask `k`: elements whose mask bit is not set are copied
/// from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rol_epi32&expand=4680)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_mask_rol_epi32<const IMM8: i32>(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
) -> __m256i {
    static_assert_imm8!(IMM8);
    let rotated = vprold256(a.as_i32x8(), IMM8);
    let fallback = src.as_i32x8();
    transmute(simd_select_bitmask(k, rotated, fallback))
}

/// Rotates each packed 32-bit integer of `a` to the left by the number of bits given in `IMM8`
/// and returns the result using zeromask `k`: elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rol_epi32&expand=4681)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_maskz_rol_epi32<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
    static_assert_imm8!(IMM8);
    let rotated = vprold256(a.as_i32x8(), IMM8);
    let zeroed = _mm256_setzero_si256().as_i32x8();
    let selected = simd_select_bitmask(k, rotated, zeroed);
    transmute(selected)
}

/// Rotates each packed 32-bit integer of `a` to the left by the number of bits given in `IMM8`
/// and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rol_epi32&expand=4679)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm_rol_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_imm8!(IMM8);
    transmute(vprold128(a.as_i32x4(), IMM8))
}

/// Rotates each packed 32-bit integer of `a` to the left by the number of bits given in `IMM8`
/// and returns the result using writemask `k`: elements whose mask bit is not set are copied
/// from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rol_epi32&expand=4677)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_mask_rol_epi32<const IMM8: i32>(
    src: __m128i,
    k: __mmask8,
    a: __m128i,
) -> __m128i {
    static_assert_imm8!(IMM8);
    let rotated = vprold128(a.as_i32x4(), IMM8);
    let fallback = src.as_i32x4();
    transmute(simd_select_bitmask(k, rotated, fallback))
}

/// Rotates each packed 32-bit integer of `a` to the left by the number of bits given in `IMM8`
/// and returns the result using zeromask `k`: elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rol_epi32&expand=4678)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_maskz_rol_epi32<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
    static_assert_imm8!(IMM8);
    let rotated = vprold128(a.as_i32x4(), IMM8);
    let zeroed = _mm_setzero_si128().as_i32x4();
    let selected = simd_select_bitmask(k, rotated, zeroed);
    transmute(selected)
}

/// Rotates each packed 32-bit integer of `a` to the right by the number of bits given in `IMM8`
/// and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ror_epi32&expand=4721)
#[inline]
#[target_feature(enable = "avx512f")]
// NOTE(review): the codegen test expects `vprold` rather than `vprord`; presumably the backend
// emits a constant right-rotate as the equivalent left-rotate. Confirm before changing.
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_ror_epi32<const IMM8: i32>(a: __m512i) -> __m512i {
    static_assert_imm8!(IMM8);
    transmute(vprord(a.as_i32x16(), IMM8))
}

/// Rotates each packed 32-bit integer of `a` to the right by the number of bits given in `IMM8`
/// and returns the result using writemask `k`: elements whose mask bit is not set are copied
/// from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ror_epi32&expand=4719)
#[inline]
#[target_feature(enable = "avx512f")]
// NOTE(review): the codegen test expects `vprold` rather than `vprord`; presumably the backend
// emits a constant right-rotate as the equivalent left-rotate. Confirm before changing.
#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_ror_epi32<const IMM8: i32>(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
) -> __m512i {
    static_assert_imm8!(IMM8);
    let rotated = vprord(a.as_i32x16(), IMM8);
    let fallback = src.as_i32x16();
    transmute(simd_select_bitmask(k, rotated, fallback))
}

/// Rotates each packed 32-bit integer of `a` to the right by the number of bits given in `IMM8`
/// and returns the result using zeromask `k`: elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ror_epi32&expand=4720)
#[inline]
#[target_feature(enable = "avx512f")]
// NOTE(review): the codegen test expects `vprold` rather than `vprord`; presumably the backend
// emits a constant right-rotate as the equivalent left-rotate. Confirm before changing.
#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_ror_epi32<const IMM8: i32>(k: __mmask16, a: __m512i) -> __m512i {
    static_assert_imm8!(IMM8);
    let rotated = vprord(a.as_i32x16(), IMM8);
    let zeroed = _mm512_setzero_si512().as_i32x16();
    let selected = simd_select_bitmask(k, rotated, zeroed);
    transmute(selected)
}

/// Rotates each packed 32-bit integer of `a` to the right by the number of bits given in `IMM8`
/// and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_ror_epi32&expand=4718)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
// NOTE(review): the codegen test expects `vprold` rather than `vprord`; presumably the backend
// emits a constant right-rotate as the equivalent left-rotate. Confirm before changing.
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm256_ror_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_imm8!(IMM8);
    transmute(vprord256(a.as_i32x8(), IMM8))
}

/// Rotates each packed 32-bit integer of `a` to the right by the number of bits given in `IMM8`
/// and returns the result using writemask `k`: elements whose mask bit is not set are copied
/// from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_ror_epi32&expand=4716)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
// NOTE(review): the codegen test expects `vprold` rather than `vprord`; presumably the backend
// emits a constant right-rotate as the equivalent left-rotate. Confirm before changing.
#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_mask_ror_epi32<const IMM8: i32>(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
) -> __m256i {
    static_assert_imm8!(IMM8);
    let rotated = vprord256(a.as_i32x8(), IMM8);
    let fallback = src.as_i32x8();
    transmute(simd_select_bitmask(k, rotated, fallback))
}

/// Rotates each packed 32-bit integer of `a` to the right by the number of bits given in `IMM8`
/// and returns the result using zeromask `k`: elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_ror_epi32&expand=4717)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
// NOTE(review): the codegen test expects `vprold` rather than `vprord`; presumably the backend
// emits a constant right-rotate as the equivalent left-rotate. Confirm before changing.
#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_maskz_ror_epi32<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
    static_assert_imm8!(IMM8);
    let rotated = vprord256(a.as_i32x8(), IMM8);
    let zeroed = _mm256_setzero_si256().as_i32x8();
    let selected = simd_select_bitmask(k, rotated, zeroed);
    transmute(selected)
}

/// Rotates each packed 32-bit integer of `a` to the right by the number of bits given in `IMM8`
/// and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ror_epi32&expand=4715)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
// NOTE(review): the codegen test expects `vprold` rather than `vprord`; presumably the backend
// emits a constant right-rotate as the equivalent left-rotate. Confirm before changing.
#[cfg_attr(test, assert_instr(vprold, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm_ror_epi32<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_imm8!(IMM8);
    transmute(vprord128(a.as_i32x4(), IMM8))
}

/// Rotates each packed 32-bit integer of `a` to the right by the number of bits given in `IMM8`
/// and returns the result using writemask `k`: elements whose mask bit is not set are copied
/// from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_ror_epi32&expand=4713)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
// NOTE(review): the codegen test expects `vprold` rather than `vprord`; presumably the backend
// emits a constant right-rotate as the equivalent left-rotate. Confirm before changing.
#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_mask_ror_epi32<const IMM8: i32>(
    src: __m128i,
    k: __mmask8,
    a: __m128i,
) -> __m128i {
    static_assert_imm8!(IMM8);
    let rotated = vprord128(a.as_i32x4(), IMM8);
    let fallback = src.as_i32x4();
    transmute(simd_select_bitmask(k, rotated, fallback))
}

/// Rotates each packed 32-bit integer of `a` to the right by the number of bits given in `IMM8`
/// and returns the result using zeromask `k`: elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_ror_epi32&expand=4714)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
// NOTE(review): the codegen test expects `vprold` rather than `vprord`; presumably the backend
// emits a constant right-rotate as the equivalent left-rotate. Confirm before changing.
#[cfg_attr(test, assert_instr(vprold, IMM8 = 123))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_maskz_ror_epi32<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
    static_assert_imm8!(IMM8);
    let rotated = vprord128(a.as_i32x4(), IMM8);
    let zeroed = _mm_setzero_si128().as_i32x4();
    let selected = simd_select_bitmask(k, rotated, zeroed);
    transmute(selected)
}

/// Rotates each packed 64-bit integer of `a` to the left by the number of bits given in `IMM8`
/// and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rol_epi64&expand=4694)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_rol_epi64<const IMM8: i32>(a: __m512i) -> __m512i {
    static_assert_imm8!(IMM8);
    transmute(vprolq(a.as_i64x8(), IMM8))
}

/// Rotates each packed 64-bit integer of `a` to the left by the number of bits given in `IMM8`
/// and returns the result using writemask `k`: elements whose mask bit is not set are copied
/// from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rol_epi64&expand=4692)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_rol_epi64<const IMM8: i32>(
    src: __m512i,
    k: __mmask8,
    a: __m512i,
) -> __m512i {
    static_assert_imm8!(IMM8);
    let rotated = vprolq(a.as_i64x8(), IMM8);
    let fallback = src.as_i64x8();
    transmute(simd_select_bitmask(k, rotated, fallback))
}

/// Rotates each packed 64-bit integer of `a` to the left by the number of bits given in `IMM8`
/// and returns the result using zeromask `k`: elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rol_epi64&expand=4693)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m512i {
    static_assert_imm8!(IMM8);
    let rotated = vprolq(a.as_i64x8(), IMM8);
    let zeroed = _mm512_setzero_si512().as_i64x8();
    let selected = simd_select_bitmask(k, rotated, zeroed);
    transmute(selected)
}

/// Rotates each packed 64-bit integer of `a` to the left by the number of bits given in `IMM8`
/// and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rol_epi64&expand=4691)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm256_rol_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_imm8!(IMM8);
    transmute(vprolq256(a.as_i64x4(), IMM8))
}

/// Rotates each packed 64-bit integer of `a` to the left by the number of bits given in `IMM8`
/// and returns the result using writemask `k`: elements whose mask bit is not set are copied
/// from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rol_epi64&expand=4689)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_mask_rol_epi64<const IMM8: i32>(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
) -> __m256i {
    static_assert_imm8!(IMM8);
    let rotated = vprolq256(a.as_i64x4(), IMM8);
    let fallback = src.as_i64x4();
    transmute(simd_select_bitmask(k, rotated, fallback))
}

/// Rotates each packed 64-bit integer of `a` to the left by the number of bits given in `IMM8`
/// and returns the result using zeromask `k`: elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rol_epi64&expand=4690)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
    static_assert_imm8!(IMM8);
    let rotated = vprolq256(a.as_i64x4(), IMM8);
    let zeroed = _mm256_setzero_si256().as_i64x4();
    let selected = simd_select_bitmask(k, rotated, zeroed);
    transmute(selected)
}

/// Rotates each packed 64-bit integer of `a` to the left by the number of bits given in `IMM8`
/// and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rol_epi64&expand=4688)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm_rol_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_imm8!(IMM8);
    transmute(vprolq128(a.as_i64x2(), IMM8))
}

/// Rotates each packed 64-bit integer of `a` to the left by the number of bits given in `IMM8`
/// and returns the result using writemask `k`: elements whose mask bit is not set are copied
/// from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rol_epi64&expand=4686)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_mask_rol_epi64<const IMM8: i32>(
    src: __m128i,
    k: __mmask8,
    a: __m128i,
) -> __m128i {
    static_assert_imm8!(IMM8);
    let rotated = vprolq128(a.as_i64x2(), IMM8);
    let fallback = src.as_i64x2();
    transmute(simd_select_bitmask(k, rotated, fallback))
}

/// Rotates each packed 64-bit integer of `a` to the left by the number of bits given in `IMM8`
/// and returns the result using zeromask `k`: elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rol_epi64&expand=4687)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_maskz_rol_epi64<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
    static_assert_imm8!(IMM8);
    let rotated = vprolq128(a.as_i64x2(), IMM8);
    let zeroed = _mm_setzero_si128().as_i64x2();
    let selected = simd_select_bitmask(k, rotated, zeroed);
    transmute(selected)
}

/// Rotates each packed 64-bit integer of `a` to the right by the number of bits given in `IMM8`
/// and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_ror_epi64&expand=4730)
#[inline]
#[target_feature(enable = "avx512f")]
// NOTE(review): the codegen test expects `vprolq` rather than `vprorq`; presumably the backend
// emits a constant right-rotate as the equivalent left-rotate. Confirm before changing.
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_ror_epi64<const IMM8: i32>(a: __m512i) -> __m512i {
    static_assert_imm8!(IMM8);
    transmute(vprorq(a.as_i64x8(), IMM8))
}

/// Rotates each packed 64-bit integer of `a` to the right by the number of bits given in `IMM8`
/// and returns the result using writemask `k`: elements whose mask bit is not set are copied
/// from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_ror_epi64&expand=4728)
#[inline]
#[target_feature(enable = "avx512f")]
// NOTE(review): the codegen test expects `vprolq` rather than `vprorq`; presumably the backend
// emits a constant right-rotate as the equivalent left-rotate. Confirm before changing.
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_ror_epi64<const IMM8: i32>(
    src: __m512i,
    k: __mmask8,
    a: __m512i,
) -> __m512i {
    static_assert_imm8!(IMM8);
    let rotated = vprorq(a.as_i64x8(), IMM8);
    let fallback = src.as_i64x8();
    transmute(simd_select_bitmask(k, rotated, fallback))
}

/// Rotates each packed 64-bit integer of `a` to the right by the number of bits given in `IMM8`
/// and returns the result using zeromask `k`: elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_ror_epi64&expand=4729)
#[inline]
#[target_feature(enable = "avx512f")]
// NOTE(review): the codegen test expects `vprolq` rather than `vprorq`; presumably the backend
// emits a constant right-rotate as the equivalent left-rotate. Confirm before changing.
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m512i) -> __m512i {
    static_assert_imm8!(IMM8);
    let rotated = vprorq(a.as_i64x8(), IMM8);
    let zeroed = _mm512_setzero_si512().as_i64x8();
    let selected = simd_select_bitmask(k, rotated, zeroed);
    transmute(selected)
}

/// Rotates each packed 64-bit integer of `a` to the right by the number of bits given in `IMM8`
/// and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_ror_epi64&expand=4727)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
// NOTE(review): the codegen test expects `vprolq` rather than `vprorq`; presumably the backend
// emits a constant right-rotate as the equivalent left-rotate. Confirm before changing.
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm256_ror_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_imm8!(IMM8);
    transmute(vprorq256(a.as_i64x4(), IMM8))
}

/// Rotates each packed 64-bit integer of `a` to the right by the number of bits given in `IMM8`
/// and returns the result using writemask `k`: elements whose mask bit is not set are copied
/// from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_ror_epi64&expand=4725)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
// NOTE(review): the codegen test expects `vprolq` rather than `vprorq`; presumably the backend
// emits a constant right-rotate as the equivalent left-rotate. Confirm before changing.
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_mask_ror_epi64<const IMM8: i32>(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
) -> __m256i {
    static_assert_imm8!(IMM8);
    let rotated = vprorq256(a.as_i64x4(), IMM8);
    let fallback = src.as_i64x4();
    transmute(simd_select_bitmask(k, rotated, fallback))
}

/// Rotates each packed 64-bit integer of `a` to the right by the number of bits given in `IMM8`
/// and returns the result using zeromask `k`: elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_ror_epi64&expand=4726)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
// NOTE(review): the codegen test expects `vprolq` rather than `vprorq`; presumably the backend
// emits a constant right-rotate as the equivalent left-rotate. Confirm before changing.
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m256i) -> __m256i {
    static_assert_imm8!(IMM8);
    let rotated = vprorq256(a.as_i64x4(), IMM8);
    let zeroed = _mm256_setzero_si256().as_i64x4();
    let selected = simd_select_bitmask(k, rotated, zeroed);
    transmute(selected)
}

/// Rotates each packed 64-bit integer of `a` to the right by the number of bits given in `IMM8`
/// and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ror_epi64&expand=4724)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
// NOTE(review): the codegen test expects `vprolq` rather than `vprorq`; presumably the backend
// emits a constant right-rotate as the equivalent left-rotate. Confirm before changing.
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm_ror_epi64<const IMM8: i32>(a: __m128i) -> __m128i {
    static_assert_imm8!(IMM8);
    transmute(vprorq128(a.as_i64x2(), IMM8))
}

/// Rotates each packed 64-bit integer of `a` to the right by the number of bits given in `IMM8`
/// and returns the result using writemask `k`: elements whose mask bit is not set are copied
/// from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_ror_epi64&expand=4722)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
// NOTE(review): the codegen test expects `vprolq` rather than `vprorq`; presumably the backend
// emits a constant right-rotate as the equivalent left-rotate. Confirm before changing.
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_mask_ror_epi64<const IMM8: i32>(
    src: __m128i,
    k: __mmask8,
    a: __m128i,
) -> __m128i {
    static_assert_imm8!(IMM8);
    let rotated = vprorq128(a.as_i64x2(), IMM8);
    let fallback = src.as_i64x2();
    transmute(simd_select_bitmask(k, rotated, fallback))
}

/// Rotates each packed 64-bit integer of `a` to the right by the number of bits given in `IMM8`
/// and returns the result using zeromask `k`: elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_ror_epi64&expand=4723)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
// NOTE(review): the codegen test expects `vprolq` rather than `vprorq`; presumably the backend
// emits a constant right-rotate as the equivalent left-rotate. Confirm before changing.
#[cfg_attr(test, assert_instr(vprolq, IMM8 = 15))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_maskz_ror_epi64<const IMM8: i32>(k: __mmask8, a: __m128i) -> __m128i {
    static_assert_imm8!(IMM8);
    let rotated = vprorq128(a.as_i64x2(), IMM8);
    let zeroed = _mm_setzero_si128().as_i64x2();
    let selected = simd_select_bitmask(k, rotated, zeroed);
    transmute(selected)
}

/// Shifts each packed 32-bit integer of `a` left by `IMM8` bits, shifting in zeros, and returns
/// the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_slli_epi32&expand=5310)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_slli_epi32<const IMM8: u32>(a: __m512i) -> __m512i {
    static_assert_imm_u8!(IMM8);
    transmute(vpsllid(a.as_i32x16(), IMM8))
}

/// Shifts each packed 32-bit integer of `a` left by `IMM8` bits, shifting in zeros, and returns
/// the result using writemask `k`: elements whose mask bit is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_slli_epi32&expand=5308)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_slli_epi32<const IMM8: u32>(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
) -> __m512i {
    static_assert_imm_u8!(IMM8);
    let shifted = vpsllid(a.as_i32x16(), IMM8);
    let fallback = src.as_i32x16();
    transmute(simd_select_bitmask(k, shifted, fallback))
}

/// Shifts each packed 32-bit integer of `a` left by `IMM8` bits, shifting in zeros, and returns
/// the result using zeromask `k`: elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_slli_epi32&expand=5309)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_slli_epi32<const IMM8: u32>(k: __mmask16, a: __m512i) -> __m512i {
    static_assert_imm_u8!(IMM8);
    let shifted = vpsllid(a.as_i32x16(), IMM8);
    let zeroed = _mm512_setzero_si512().as_i32x16();
    let selected = simd_select_bitmask(k, shifted, zeroed);
    transmute(selected)
}

/// Shifts each packed 32-bit integer of `a` left by `IMM8` bits, shifting in zeros, and returns
/// the result using writemask `k`: elements whose mask bit is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_slli_epi32&expand=5305)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_mask_slli_epi32<const IMM8: u32>(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
) -> __m256i {
    static_assert_imm_u8!(IMM8);
    // `psllid256` takes the shift count as `i32`.
    let shifted = psllid256(a.as_i32x8(), IMM8 as i32);
    let fallback = src.as_i32x8();
    transmute(simd_select_bitmask(k, shifted, fallback))
}

/// Shifts each packed 32-bit integer of `a` left by `IMM8` bits, shifting in zeros, and returns
/// the result using zeromask `k`: elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_slli_epi32&expand=5306)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_maskz_slli_epi32<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i {
    static_assert_imm_u8!(IMM8);
    // `psllid256` takes the shift count as `i32`.
    let shifted = psllid256(a.as_i32x8(), IMM8 as i32);
    let zeroed = _mm256_setzero_si256().as_i32x8();
    let selected = simd_select_bitmask(k, shifted, zeroed);
    transmute(selected)
}

/// Shifts each packed 32-bit integer in `a` left by `IMM8` bits, filling the vacated bits with
/// zeros, and stores the results in `dst` under writemask `k`: an element whose mask bit is not
/// set is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_slli_epi32&expand=5302)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_mask_slli_epi32<const IMM8: u32>(
    src: __m128i,
    k: __mmask8,
    a: __m128i,
) -> __m128i {
    static_assert_imm_u8!(IMM8);
    // The shift helper takes its amount as an `i32`.
    let shifted = psllid128(a.as_i32x4(), IMM8 as i32);
    // Elements enabled in `k` receive the shifted value; all others keep `src`.
    let blended = simd_select_bitmask(k, shifted, src.as_i32x4());
    transmute(blended)
}

/// Shifts each packed 32-bit integer in `a` left by `IMM8` bits, filling the vacated bits with
/// zeros, and stores the results in `dst` under zeromask `k`: an element whose mask bit is not
/// set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_slli_epi32&expand=5303)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpslld, IMM8 = 5))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_maskz_slli_epi32<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
    static_assert_imm_u8!(IMM8);
    // The shift helper takes its amount as an `i32`.
    let shifted = psllid128(a.as_i32x4(), IMM8 as i32);
    let zeros = _mm_setzero_si128().as_i32x4();
    // Elements enabled in `k` receive the shifted value; all others come from `zeros`.
    let blended = simd_select_bitmask(k, shifted, zeros);
    transmute(blended)
}

/// Shifts each packed 32-bit integer in `a` right by `IMM8` bits, filling the vacated bits with
/// zeros, and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srli_epi32&expand=5522)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_srli_epi32<const IMM8: u32>(a: __m512i) -> __m512i {
    static_assert_imm_u8!(IMM8);
    // Reinterpret as sixteen 32-bit elements, shift, and convert back to the opaque vector type.
    transmute(vpsrlid(a.as_i32x16(), IMM8))
}

/// Shifts each packed 32-bit integer in `a` right by `IMM8` bits, filling the vacated bits with
/// zeros, and stores the results in `dst` under writemask `k`: an element whose mask bit is not
/// set is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srli_epi32&expand=5520)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_srli_epi32<const IMM8: u32>(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
) -> __m512i {
    static_assert_imm_u8!(IMM8);
    let shifted = vpsrlid(a.as_i32x16(), IMM8);
    // Elements enabled in `k` receive the shifted value; all others keep `src`.
    let blended = simd_select_bitmask(k, shifted, src.as_i32x16());
    transmute(blended)
}

/// Shifts each packed 32-bit integer in `a` right by `IMM8` bits, filling the vacated bits with
/// zeros, and stores the results in `dst` under zeromask `k`: an element whose mask bit is not
/// set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srli_epi32&expand=5521)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_srli_epi32<const IMM8: u32>(k: __mmask16, a: __m512i) -> __m512i {
    static_assert_imm_u8!(IMM8);
    let shifted = vpsrlid(a.as_i32x16(), IMM8);
    let zeros = _mm512_setzero_si512().as_i32x16();
    // Elements enabled in `k` receive the shifted value; all others come from `zeros`.
    let blended = simd_select_bitmask(k, shifted, zeros);
    transmute(blended)
}

/// Shifts each packed 32-bit integer in `a` right by `IMM8` bits, filling the vacated bits with
/// zeros, and stores the results in `dst` under writemask `k`: an element whose mask bit is not
/// set is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srli_epi32&expand=5517)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_mask_srli_epi32<const IMM8: u32>(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
) -> __m256i {
    static_assert_imm_u8!(IMM8);
    // The shift helper takes its amount as an `i32`.
    let shifted = psrlid256(a.as_i32x8(), IMM8 as i32);
    // Elements enabled in `k` receive the shifted value; all others keep `src`.
    let blended = simd_select_bitmask(k, shifted, src.as_i32x8());
    transmute(blended)
}

/// Shifts each packed 32-bit integer in `a` right by `IMM8` bits, filling the vacated bits with
/// zeros, and stores the results in `dst` under zeromask `k`: an element whose mask bit is not
/// set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srli_epi32&expand=5518)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_maskz_srli_epi32<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i {
    static_assert_imm_u8!(IMM8);
    // The shift helper takes its amount as an `i32`.
    let shifted = psrlid256(a.as_i32x8(), IMM8 as i32);
    let zeros = _mm256_setzero_si256().as_i32x8();
    // Elements enabled in `k` receive the shifted value; all others come from `zeros`.
    let blended = simd_select_bitmask(k, shifted, zeros);
    transmute(blended)
}

/// Shifts each packed 32-bit integer in `a` right by `IMM8` bits, filling the vacated bits with
/// zeros, and stores the results in `dst` under writemask `k`: an element whose mask bit is not
/// set is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srli_epi32&expand=5514)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_mask_srli_epi32<const IMM8: u32>(
    src: __m128i,
    k: __mmask8,
    a: __m128i,
) -> __m128i {
    static_assert_imm_u8!(IMM8);
    // The shift helper takes its amount as an `i32`.
    let shifted = psrlid128(a.as_i32x4(), IMM8 as i32);
    // Elements enabled in `k` receive the shifted value; all others keep `src`.
    let blended = simd_select_bitmask(k, shifted, src.as_i32x4());
    transmute(blended)
}

/// Shifts each packed 32-bit integer in `a` right by `IMM8` bits, filling the vacated bits with
/// zeros, and stores the results in `dst` under zeromask `k`: an element whose mask bit is not
/// set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srli_epi32&expand=5515)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_maskz_srli_epi32<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
    static_assert_imm_u8!(IMM8);
    // The shift helper takes its amount as an `i32`.
    let shifted = psrlid128(a.as_i32x4(), IMM8 as i32);
    let zeros = _mm_setzero_si128().as_i32x4();
    // Elements enabled in `k` receive the shifted value; all others come from `zeros`.
    let blended = simd_select_bitmask(k, shifted, zeros);
    transmute(blended)
}

/// Shifts each packed 64-bit integer in `a` left by `IMM8` bits, filling the vacated bits with
/// zeros, and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_slli_epi64&expand=5319)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_slli_epi64<const IMM8: u32>(a: __m512i) -> __m512i {
    static_assert_imm_u8!(IMM8);
    // Reinterpret as eight 64-bit elements, shift, and convert back to the opaque vector type.
    transmute(vpslliq(a.as_i64x8(), IMM8))
}

/// Shifts each packed 64-bit integer in `a` left by `IMM8` bits, filling the vacated bits with
/// zeros, and stores the results in `dst` under writemask `k`: an element whose mask bit is not
/// set is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_slli_epi64&expand=5317)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_slli_epi64<const IMM8: u32>(
    src: __m512i,
    k: __mmask8,
    a: __m512i,
) -> __m512i {
    static_assert_imm_u8!(IMM8);
    let shifted = vpslliq(a.as_i64x8(), IMM8);
    // Elements enabled in `k` receive the shifted value; all others keep `src`.
    let blended = simd_select_bitmask(k, shifted, src.as_i64x8());
    transmute(blended)
}

/// Shifts each packed 64-bit integer in `a` left by `IMM8` bits, filling the vacated bits with
/// zeros, and stores the results in `dst` under zeromask `k`: an element whose mask bit is not
/// set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_slli_epi64&expand=5318)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_slli_epi64<const IMM8: u32>(k: __mmask8, a: __m512i) -> __m512i {
    static_assert_imm_u8!(IMM8);
    let shifted = vpslliq(a.as_i64x8(), IMM8);
    let zeros = _mm512_setzero_si512().as_i64x8();
    // Elements enabled in `k` receive the shifted value; all others come from `zeros`.
    let blended = simd_select_bitmask(k, shifted, zeros);
    transmute(blended)
}

/// Shifts each packed 64-bit integer in `a` left by `IMM8` bits, filling the vacated bits with
/// zeros, and stores the results in `dst` under writemask `k`: an element whose mask bit is not
/// set is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_slli_epi64&expand=5314)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_mask_slli_epi64<const IMM8: u32>(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
) -> __m256i {
    static_assert_imm_u8!(IMM8);
    // The shift helper takes its amount as an `i32`.
    let shifted = pslliq256(a.as_i64x4(), IMM8 as i32);
    // Elements enabled in `k` receive the shifted value; all others keep `src`.
    let blended = simd_select_bitmask(k, shifted, src.as_i64x4());
    transmute(blended)
}

/// Shifts each packed 64-bit integer in `a` left by `IMM8` bits, filling the vacated bits with
/// zeros, and stores the results in `dst` under zeromask `k`: an element whose mask bit is not
/// set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_slli_epi64&expand=5315)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_maskz_slli_epi64<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i {
    static_assert_imm_u8!(IMM8);
    // The shift helper takes its amount as an `i32`.
    let shifted = pslliq256(a.as_i64x4(), IMM8 as i32);
    let zeros = _mm256_setzero_si256().as_i64x4();
    // Elements enabled in `k` receive the shifted value; all others come from `zeros`.
    let blended = simd_select_bitmask(k, shifted, zeros);
    transmute(blended)
}

/// Shifts each packed 64-bit integer in `a` left by `IMM8` bits, filling the vacated bits with
/// zeros, and stores the results in `dst` under writemask `k`: an element whose mask bit is not
/// set is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_slli_epi64&expand=5311)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_mask_slli_epi64<const IMM8: u32>(
    src: __m128i,
    k: __mmask8,
    a: __m128i,
) -> __m128i {
    static_assert_imm_u8!(IMM8);
    // The shift helper takes its amount as an `i32`.
    let shifted = pslliq128(a.as_i64x2(), IMM8 as i32);
    // Elements enabled in `k` receive the shifted value; all others keep `src`.
    let blended = simd_select_bitmask(k, shifted, src.as_i64x2());
    transmute(blended)
}

/// Shifts each packed 64-bit integer in `a` left by `IMM8` bits, filling the vacated bits with
/// zeros, and stores the results in `dst` under zeromask `k`: an element whose mask bit is not
/// set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_slli_epi64&expand=5312)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 5))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_maskz_slli_epi64<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
    static_assert_imm_u8!(IMM8);
    // The shift helper takes its amount as an `i32`.
    let shifted = pslliq128(a.as_i64x2(), IMM8 as i32);
    let zeros = _mm_setzero_si128().as_i64x2();
    // Elements enabled in `k` receive the shifted value; all others come from `zeros`.
    let blended = simd_select_bitmask(k, shifted, zeros);
    transmute(blended)
}

/// Shifts each packed 64-bit integer in `a` right by `IMM8` bits, filling the vacated bits with
/// zeros, and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srli_epi64&expand=5531)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_srli_epi64<const IMM8: u32>(a: __m512i) -> __m512i {
    static_assert_imm_u8!(IMM8);
    // Reinterpret as eight 64-bit elements, shift, and convert back to the opaque vector type.
    transmute(vpsrliq(a.as_i64x8(), IMM8))
}

/// Shifts each packed 64-bit integer in `a` right by `IMM8` bits, filling the vacated bits with
/// zeros, and stores the results in `dst` under writemask `k`: an element whose mask bit is not
/// set is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srli_epi64&expand=5529)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_srli_epi64<const IMM8: u32>(
    src: __m512i,
    k: __mmask8,
    a: __m512i,
) -> __m512i {
    static_assert_imm_u8!(IMM8);
    let shifted = vpsrliq(a.as_i64x8(), IMM8);
    // Elements enabled in `k` receive the shifted value; all others keep `src`.
    let blended = simd_select_bitmask(k, shifted, src.as_i64x8());
    transmute(blended)
}

/// Shifts each packed 64-bit integer in `a` right by `IMM8` bits, filling the vacated bits with
/// zeros, and stores the results in `dst` under zeromask `k`: an element whose mask bit is not
/// set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srli_epi64&expand=5530)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_srli_epi64<const IMM8: u32>(k: __mmask8, a: __m512i) -> __m512i {
    static_assert_imm_u8!(IMM8);
    let shifted = vpsrliq(a.as_i64x8(), IMM8);
    let zeros = _mm512_setzero_si512().as_i64x8();
    // Elements enabled in `k` receive the shifted value; all others come from `zeros`.
    let blended = simd_select_bitmask(k, shifted, zeros);
    transmute(blended)
}

/// Shifts each packed 64-bit integer in `a` right by `IMM8` bits, filling the vacated bits with
/// zeros, and stores the results in `dst` under writemask `k`: an element whose mask bit is not
/// set is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srli_epi64&expand=5526)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_mask_srli_epi64<const IMM8: u32>(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
) -> __m256i {
    static_assert_imm_u8!(IMM8);
    // The shift helper takes its amount as an `i32`.
    let shifted = psrliq256(a.as_i64x4(), IMM8 as i32);
    // Elements enabled in `k` receive the shifted value; all others keep `src`.
    let blended = simd_select_bitmask(k, shifted, src.as_i64x4());
    transmute(blended)
}

/// Shifts each packed 64-bit integer in `a` right by `IMM8` bits, filling the vacated bits with
/// zeros, and stores the results in `dst` under zeromask `k`: an element whose mask bit is not
/// set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srli_epi64&expand=5527)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_maskz_srli_epi64<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i {
    static_assert_imm_u8!(IMM8);
    // The shift helper takes its amount as an `i32`.
    let shifted = psrliq256(a.as_i64x4(), IMM8 as i32);
    let zeros = _mm256_setzero_si256().as_i64x4();
    // Elements enabled in `k` receive the shifted value; all others come from `zeros`.
    let blended = simd_select_bitmask(k, shifted, zeros);
    transmute(blended)
}

/// Shifts each packed 64-bit integer in `a` right by `IMM8` bits, filling the vacated bits with
/// zeros, and stores the results in `dst` under writemask `k`: an element whose mask bit is not
/// set is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srli_epi64&expand=5523)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_mask_srli_epi64<const IMM8: u32>(
    src: __m128i,
    k: __mmask8,
    a: __m128i,
) -> __m128i {
    static_assert_imm_u8!(IMM8);
    // The shift helper takes its amount as an `i32`.
    let shifted = psrliq128(a.as_i64x2(), IMM8 as i32);
    // Elements enabled in `k` receive the shifted value; all others keep `src`.
    let blended = simd_select_bitmask(k, shifted, src.as_i64x2());
    transmute(blended)
}

/// Shifts each packed 64-bit integer in `a` right by `IMM8` bits, filling the vacated bits with
/// zeros, and stores the results in `dst` under zeromask `k`: an element whose mask bit is not
/// set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srli_epi64&expand=5524)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_maskz_srli_epi64<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
    static_assert_imm_u8!(IMM8);
    // The shift helper takes its amount as an `i32`.
    let shifted = psrliq128(a.as_i64x2(), IMM8 as i32);
    let zeros = _mm_setzero_si128().as_i64x2();
    // Elements enabled in `k` receive the shifted value; all others come from `zeros`.
    let blended = simd_select_bitmask(k, shifted, zeros);
    transmute(blended)
}

/// Shifts each packed 32-bit integer in `a` left by the amount in `count`, filling the vacated
/// bits with zeros, and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sll_epi32&expand=5280)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpslld))]
pub unsafe fn _mm512_sll_epi32(a: __m512i, count: __m128i) -> __m512i {
    let elements = a.as_i32x16();
    let amount = count.as_i32x4();
    transmute(vpslld(elements, amount))
}

/// Shifts each packed 32-bit integer in `a` left by the amount in `count`, filling the vacated
/// bits with zeros, and stores the results in `dst` under writemask `k`: an element whose mask
/// bit is not set is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sll_epi32&expand=5278)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpslld))]
pub unsafe fn _mm512_mask_sll_epi32(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
    count: __m128i,
) -> __m512i {
    // Reuse the unmasked intrinsic, then blend its result with `src` according to `k`.
    let shifted = _mm512_sll_epi32(a, count);
    let blended = simd_select_bitmask(k, shifted.as_i32x16(), src.as_i32x16());
    transmute(blended)
}

/// Shifts each packed 32-bit integer in `a` left by the amount in `count`, filling the vacated
/// bits with zeros, and stores the results in `dst` under zeromask `k`: an element whose mask
/// bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sll_epi32&expand=5279)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpslld))]
pub unsafe fn _mm512_maskz_sll_epi32(k: __mmask16, a: __m512i, count: __m128i) -> __m512i {
    // Reuse the unmasked intrinsic, then blend its result with zeros according to `k`.
    let shifted = _mm512_sll_epi32(a, count);
    let zeros = _mm512_setzero_si512();
    let blended = simd_select_bitmask(k, shifted.as_i32x16(), zeros.as_i32x16());
    transmute(blended)
}

/// Shifts each packed 32-bit integer in `a` left by the amount in `count`, filling the vacated
/// bits with zeros, and stores the results in `dst` under writemask `k`: an element whose mask
/// bit is not set is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sll_epi32&expand=5275)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpslld))]
pub unsafe fn _mm256_mask_sll_epi32(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    count: __m128i,
) -> __m256i {
    // Reuse the unmasked intrinsic, then blend its result with `src` according to `k`.
    let shifted = _mm256_sll_epi32(a, count);
    let blended = simd_select_bitmask(k, shifted.as_i32x8(), src.as_i32x8());
    transmute(blended)
}

/// Shifts each packed 32-bit integer in `a` left by the amount in `count`, filling the vacated
/// bits with zeros, and stores the results in `dst` under zeromask `k`: an element whose mask
/// bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sll_epi32&expand=5276)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpslld))]
pub unsafe fn _mm256_maskz_sll_epi32(k: __mmask8, a: __m256i, count: __m128i) -> __m256i {
    // Reuse the unmasked intrinsic, then blend its result with zeros according to `k`.
    let shifted = _mm256_sll_epi32(a, count);
    let zeros = _mm256_setzero_si256();
    let blended = simd_select_bitmask(k, shifted.as_i32x8(), zeros.as_i32x8());
    transmute(blended)
}

/// Shifts each packed 32-bit integer in `a` left by the amount in `count`, filling the vacated
/// bits with zeros, and stores the results in `dst` under writemask `k`: an element whose mask
/// bit is not set is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sll_epi32&expand=5272)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpslld))]
pub unsafe fn _mm_mask_sll_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
    // Reuse the unmasked intrinsic, then blend its result with `src` according to `k`.
    let shifted = _mm_sll_epi32(a, count);
    let blended = simd_select_bitmask(k, shifted.as_i32x4(), src.as_i32x4());
    transmute(blended)
}

/// Shifts each packed 32-bit integer in `a` left by the amount in `count`, filling the vacated
/// bits with zeros, and stores the results in `dst` under zeromask `k`: an element whose mask
/// bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sll_epi32&expand=5273)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpslld))]
pub unsafe fn _mm_maskz_sll_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
    // Reuse the unmasked intrinsic, then blend its result with zeros according to `k`.
    let shifted = _mm_sll_epi32(a, count);
    let zeros = _mm_setzero_si128();
    let blended = simd_select_bitmask(k, shifted.as_i32x4(), zeros.as_i32x4());
    transmute(blended)
}

/// Shifts each packed 32-bit integer in `a` right by the amount in `count`, filling the vacated
/// bits with zeros, and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srl_epi32&expand=5492)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsrld))]
pub unsafe fn _mm512_srl_epi32(a: __m512i, count: __m128i) -> __m512i {
    let elements = a.as_i32x16();
    let amount = count.as_i32x4();
    transmute(vpsrld(elements, amount))
}

/// Shifts each packed 32-bit integer in `a` right by the amount in `count`, filling the vacated
/// bits with zeros, and stores the results in `dst` under writemask `k`: an element whose mask
/// bit is not set is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srl_epi32&expand=5490)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsrld))]
pub unsafe fn _mm512_mask_srl_epi32(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
    count: __m128i,
) -> __m512i {
    // Reuse the unmasked intrinsic, then blend its result with `src` according to `k`.
    let shifted = _mm512_srl_epi32(a, count);
    let blended = simd_select_bitmask(k, shifted.as_i32x16(), src.as_i32x16());
    transmute(blended)
}

/// Shifts each packed 32-bit integer in `a` right by the amount in `count`, filling the vacated
/// bits with zeros, and stores the results in `dst` under zeromask `k`: an element whose mask
/// bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srl_epi32&expand=5491)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsrld))]
pub unsafe fn _mm512_maskz_srl_epi32(k: __mmask16, a: __m512i, count: __m128i) -> __m512i {
    // Reuse the unmasked intrinsic, then blend its result with zeros according to `k`.
    let shifted = _mm512_srl_epi32(a, count);
    let zeros = _mm512_setzero_si512();
    let blended = simd_select_bitmask(k, shifted.as_i32x16(), zeros.as_i32x16());
    transmute(blended)
}

/// Shifts each packed 32-bit integer in `a` right by the amount in `count`, filling the vacated
/// bits with zeros, and stores the results in `dst` under writemask `k`: an element whose mask
/// bit is not set is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srl_epi32&expand=5487)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrld))]
pub unsafe fn _mm256_mask_srl_epi32(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    count: __m128i,
) -> __m256i {
    // Reuse the unmasked intrinsic, then blend its result with `src` according to `k`.
    let shifted = _mm256_srl_epi32(a, count);
    let blended = simd_select_bitmask(k, shifted.as_i32x8(), src.as_i32x8());
    transmute(blended)
}

/// Shifts each packed 32-bit integer in `a` right by the amount in `count`, filling the vacated
/// bits with zeros, and stores the results in `dst` under zeromask `k`: an element whose mask
/// bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srl_epi32&expand=5488)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrld))]
pub unsafe fn _mm256_maskz_srl_epi32(k: __mmask8, a: __m256i, count: __m128i) -> __m256i {
    // Reuse the unmasked intrinsic, then blend its result with zeros according to `k`.
    let shifted = _mm256_srl_epi32(a, count);
    let zeros = _mm256_setzero_si256();
    let blended = simd_select_bitmask(k, shifted.as_i32x8(), zeros.as_i32x8());
    transmute(blended)
}

/// Shifts each packed 32-bit integer in `a` right by the amount in `count`, filling the vacated
/// bits with zeros, and stores the results in `dst` under writemask `k`: an element whose mask
/// bit is not set is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srl_epi32&expand=5484)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrld))]
pub unsafe fn _mm_mask_srl_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
    // Reuse the unmasked intrinsic, then blend its result with `src` according to `k`.
    let shifted = _mm_srl_epi32(a, count);
    let blended = simd_select_bitmask(k, shifted.as_i32x4(), src.as_i32x4());
    transmute(blended)
}

/// Shifts each packed 32-bit integer in `a` right by the amount in `count`, filling the vacated
/// bits with zeros, and stores the results in `dst` under zeromask `k`: an element whose mask
/// bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srl_epi32&expand=5485)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrld))]
pub unsafe fn _mm_maskz_srl_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
    // Reuse the unmasked intrinsic, then blend its result with zeros according to `k`.
    let shifted = _mm_srl_epi32(a, count);
    let zeros = _mm_setzero_si128();
    let blended = simd_select_bitmask(k, shifted.as_i32x4(), zeros.as_i32x4());
    transmute(blended)
}

/// Shifts each packed 64-bit integer in `a` left by the amount in `count`, filling the vacated
/// bits with zeros, and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sll_epi64&expand=5289)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsllq))]
pub unsafe fn _mm512_sll_epi64(a: __m512i, count: __m128i) -> __m512i {
    let elements = a.as_i64x8();
    let amount = count.as_i64x2();
    transmute(vpsllq(elements, amount))
}

/// Shifts each packed 64-bit integer in `a` left by the amount in `count`, filling the vacated
/// bits with zeros, and stores the results in `dst` under writemask `k`: an element whose mask
/// bit is not set is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sll_epi64&expand=5287)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsllq))]
pub unsafe fn _mm512_mask_sll_epi64(
    src: __m512i,
    k: __mmask8,
    a: __m512i,
    count: __m128i,
) -> __m512i {
    // Reuse the unmasked intrinsic, then blend its result with `src` according to `k`.
    let shifted = _mm512_sll_epi64(a, count);
    let blended = simd_select_bitmask(k, shifted.as_i64x8(), src.as_i64x8());
    transmute(blended)
}

/// Shifts each packed 64-bit integer in `a` left by the amount in `count`, filling the vacated
/// bits with zeros, and stores the results in `dst` under zeromask `k`: an element whose mask
/// bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sll_epi64&expand=5288)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsllq))]
pub unsafe fn _mm512_maskz_sll_epi64(k: __mmask8, a: __m512i, count: __m128i) -> __m512i {
    // Reuse the unmasked intrinsic, then blend its result with zeros according to `k`.
    let shifted = _mm512_sll_epi64(a, count);
    let zeros = _mm512_setzero_si512();
    let blended = simd_select_bitmask(k, shifted.as_i64x8(), zeros.as_i64x8());
    transmute(blended)
}

/// Shifts each packed 64-bit integer in `a` left by the amount in `count`, filling the vacated
/// bits with zeros, and stores the results in `dst` under writemask `k`: an element whose mask
/// bit is not set is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sll_epi64&expand=5284)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsllq))]
pub unsafe fn _mm256_mask_sll_epi64(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    count: __m128i,
) -> __m256i {
    // Reuse the unmasked intrinsic, then blend its result with `src` according to `k`.
    let shifted = _mm256_sll_epi64(a, count);
    let blended = simd_select_bitmask(k, shifted.as_i64x4(), src.as_i64x4());
    transmute(blended)
}

/// Shifts each packed 64-bit integer in `a` left by the amount in `count`, filling the vacated
/// bits with zeros, and stores the results in `dst` under zeromask `k`: an element whose mask
/// bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sll_epi64&expand=5285)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsllq))]
pub unsafe fn _mm256_maskz_sll_epi64(k: __mmask8, a: __m256i, count: __m128i) -> __m256i {
    // Reuse the unmasked intrinsic, then blend its result with zeros according to `k`.
    let shifted = _mm256_sll_epi64(a, count);
    let zeros = _mm256_setzero_si256();
    let blended = simd_select_bitmask(k, shifted.as_i64x4(), zeros.as_i64x4());
    transmute(blended)
}

/// Shifts each packed 64-bit integer in `a` left by the amount in `count`, filling the vacated
/// bits with zeros, and stores the results in `dst` under writemask `k`: an element whose mask
/// bit is not set is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sll_epi64&expand=5281)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsllq))]
pub unsafe fn _mm_mask_sll_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
    // Reuse the unmasked intrinsic, then blend its result with `src` according to `k`.
    let shifted = _mm_sll_epi64(a, count);
    let blended = simd_select_bitmask(k, shifted.as_i64x2(), src.as_i64x2());
    transmute(blended)
}

/// Shifts each packed 64-bit integer in `a` left by the amount in `count`, filling the vacated
/// bits with zeros, and stores the results in `dst` under zeromask `k`: an element whose mask
/// bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sll_epi64&expand=5282)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsllq))]
pub unsafe fn _mm_maskz_sll_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
    // Reuse the unmasked intrinsic, then blend its result with zeros according to `k`.
    let shifted = _mm_sll_epi64(a, count);
    let zeros = _mm_setzero_si128();
    let blended = simd_select_bitmask(k, shifted.as_i64x2(), zeros.as_i64x2());
    transmute(blended)
}

/// Shifts each packed 64-bit integer in `a` right by the amount in `count`, filling the vacated
/// bits with zeros, and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srl_epi64&expand=5501)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsrlq))]
pub unsafe fn _mm512_srl_epi64(a: __m512i, count: __m128i) -> __m512i {
    let elements = a.as_i64x8();
    let amount = count.as_i64x2();
    transmute(vpsrlq(elements, amount))
}

/// Shifts each packed 64-bit integer in `a` right by the amount in `count`, filling the vacated
/// bits with zeros, and stores the results in `dst` under writemask `k`: an element whose mask
/// bit is not set is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srl_epi64&expand=5499)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsrlq))]
pub unsafe fn _mm512_mask_srl_epi64(
    src: __m512i,
    k: __mmask8,
    a: __m512i,
    count: __m128i,
) -> __m512i {
    // Reuse the unmasked intrinsic, then blend its result with `src` according to `k`.
    let shifted = _mm512_srl_epi64(a, count);
    let blended = simd_select_bitmask(k, shifted.as_i64x8(), src.as_i64x8());
    transmute(blended)
}

/// Shifts each packed 64-bit integer in `a` right by the amount in `count`, filling the vacated
/// bits with zeros, and stores the results in `dst` under zeromask `k`: an element whose mask
/// bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srl_epi64&expand=5500)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsrlq))]
pub unsafe fn _mm512_maskz_srl_epi64(k: __mmask8, a: __m512i, count: __m128i) -> __m512i {
    // Reuse the unmasked intrinsic, then blend its result with zeros according to `k`.
    let shifted = _mm512_srl_epi64(a, count);
    let zeros = _mm512_setzero_si512();
    let blended = simd_select_bitmask(k, shifted.as_i64x8(), zeros.as_i64x8());
    transmute(blended)
}

/// Shifts each packed 64-bit integer in `a` right by the amount in `count`, filling the vacated
/// bits with zeros, and stores the results in `dst` under writemask `k`: an element whose mask
/// bit is not set is copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srl_epi64&expand=5496)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrlq))]
pub unsafe fn _mm256_mask_srl_epi64(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    count: __m128i,
) -> __m256i {
    // Reuse the unmasked intrinsic, then blend its result with `src` according to `k`.
    let shifted = _mm256_srl_epi64(a, count);
    let blended = simd_select_bitmask(k, shifted.as_i64x4(), src.as_i64x4());
    transmute(blended)
}

/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srl_epi64&expand=5497)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrlq))]
pub unsafe fn _mm256_maskz_srl_epi64(k: __mmask8, a: __m256i, count: __m128i) -> __m256i {
    let shf = _mm256_srl_epi64(a, count).as_i64x4();
    let zero = _mm256_setzero_si256().as_i64x4();
    transmute(simd_select_bitmask(k, shf, zero))
}

/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srl_epi64&expand=5493)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrlq))]
pub unsafe fn _mm_mask_srl_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
    let shf = _mm_srl_epi64(a, count).as_i64x2();
    transmute(simd_select_bitmask(k, shf, src.as_i64x2()))
}

/// Shift packed 64-bit integers in a right by count while shifting in zeros, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srl_epi64&expand=5494)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrlq))]
pub unsafe fn _mm_maskz_srl_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
    let shf = _mm_srl_epi64(a, count).as_i64x2();
    let zero = _mm_setzero_si128().as_i64x2();
    transmute(simd_select_bitmask(k, shf, zero))
}

/// Right-shifts every packed 32-bit integer in `a` by `count`, shifting in sign bits, and
/// returns the shifted vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sra_epi32&expand=5407)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsrad))]
pub unsafe fn _mm512_sra_epi32(a: __m512i, count: __m128i) -> __m512i {
    let lanes = a.as_i32x16();
    let amount = count.as_i32x4();
    transmute(vpsrad(lanes, amount))
}

/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sra_epi32&expand=5405)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsrad))]
pub unsafe fn _mm512_mask_sra_epi32(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
    count: __m128i,
) -> __m512i {
    let shf = _mm512_sra_epi32(a, count).as_i32x16();
    transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
}

/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sra_epi32&expand=5406)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsrad))]
pub unsafe fn _mm512_maskz_sra_epi32(k: __mmask16, a: __m512i, count: __m128i) -> __m512i {
    let shf = _mm512_sra_epi32(a, count).as_i32x16();
    let zero = _mm512_setzero_si512().as_i32x16();
    transmute(simd_select_bitmask(k, shf, zero))
}

/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sra_epi32&expand=5402)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrad))]
pub unsafe fn _mm256_mask_sra_epi32(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    count: __m128i,
) -> __m256i {
    let shf = _mm256_sra_epi32(a, count).as_i32x8();
    transmute(simd_select_bitmask(k, shf, src.as_i32x8()))
}

/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sra_epi32&expand=5403)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrad))]
pub unsafe fn _mm256_maskz_sra_epi32(k: __mmask8, a: __m256i, count: __m128i) -> __m256i {
    let shf = _mm256_sra_epi32(a, count).as_i32x8();
    let zero = _mm256_setzero_si256().as_i32x8();
    transmute(simd_select_bitmask(k, shf, zero))
}

/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sra_epi32&expand=5399)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrad))]
pub unsafe fn _mm_mask_sra_epi32(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
    let shf = _mm_sra_epi32(a, count).as_i32x4();
    transmute(simd_select_bitmask(k, shf, src.as_i32x4()))
}

/// Shift packed 32-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sra_epi32&expand=5400)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrad))]
pub unsafe fn _mm_maskz_sra_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
    let shf = _mm_sra_epi32(a, count).as_i32x4();
    let zero = _mm_setzero_si128().as_i32x4();
    transmute(simd_select_bitmask(k, shf, zero))
}

/// Right-shifts every packed 64-bit integer in `a` by `count`, shifting in sign bits, and
/// returns the shifted vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sra_epi64&expand=5416)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsraq))]
pub unsafe fn _mm512_sra_epi64(a: __m512i, count: __m128i) -> __m512i {
    let lanes = a.as_i64x8();
    let amount = count.as_i64x2();
    transmute(vpsraq(lanes, amount))
}

/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sra_epi64&expand=5414)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsraq))]
pub unsafe fn _mm512_mask_sra_epi64(
    src: __m512i,
    k: __mmask8,
    a: __m512i,
    count: __m128i,
) -> __m512i {
    let shf = _mm512_sra_epi64(a, count).as_i64x8();
    transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
}

/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sra_epi64&expand=5415)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsraq))]
pub unsafe fn _mm512_maskz_sra_epi64(k: __mmask8, a: __m512i, count: __m128i) -> __m512i {
    let shf = _mm512_sra_epi64(a, count).as_i64x8();
    let zero = _mm512_setzero_si512().as_i64x8();
    transmute(simd_select_bitmask(k, shf, zero))
}

/// Right-shifts every packed 64-bit integer in `a` by `count`, shifting in sign bits, and
/// returns the shifted vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_sra_epi64&expand=5413)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsraq))]
pub unsafe fn _mm256_sra_epi64(a: __m256i, count: __m128i) -> __m256i {
    let lanes = a.as_i64x4();
    let amount = count.as_i64x2();
    transmute(vpsraq256(lanes, amount))
}

/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sra_epi64&expand=5411)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsraq))]
pub unsafe fn _mm256_mask_sra_epi64(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    count: __m128i,
) -> __m256i {
    let shf = _mm256_sra_epi64(a, count).as_i64x4();
    transmute(simd_select_bitmask(k, shf, src.as_i64x4()))
}

/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sra_epi64&expand=5412)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsraq))]
pub unsafe fn _mm256_maskz_sra_epi64(k: __mmask8, a: __m256i, count: __m128i) -> __m256i {
    let shf = _mm256_sra_epi64(a, count).as_i64x4();
    let zero = _mm256_setzero_si256().as_i64x4();
    transmute(simd_select_bitmask(k, shf, zero))
}

/// Right-shifts every packed 64-bit integer in `a` by `count`, shifting in sign bits, and
/// returns the shifted vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sra_epi64&expand=5410)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsraq))]
pub unsafe fn _mm_sra_epi64(a: __m128i, count: __m128i) -> __m128i {
    let lanes = a.as_i64x2();
    let amount = count.as_i64x2();
    transmute(vpsraq128(lanes, amount))
}

/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sra_epi64&expand=5408)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsraq))]
pub unsafe fn _mm_mask_sra_epi64(src: __m128i, k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
    let shf = _mm_sra_epi64(a, count).as_i64x2();
    transmute(simd_select_bitmask(k, shf, src.as_i64x2()))
}

/// Shift packed 64-bit integers in a right by count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sra_epi64&expand=5409)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsraq))]
pub unsafe fn _mm_maskz_sra_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
    let shf = _mm_sra_epi64(a, count).as_i64x2();
    let zero = _mm_setzero_si128().as_i64x2();
    transmute(simd_select_bitmask(k, shf, zero))
}

/// Right-shifts every packed 32-bit integer in `a` by the constant `IMM8`, shifting in sign
/// bits, and returns the shifted vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srai_epi32&expand=5436)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_srai_epi32<const IMM8: u32>(a: __m512i) -> __m512i {
    // Compile-time validation of the shift constant.
    static_assert_imm_u8!(IMM8);
    transmute(vpsraid512(a.as_i32x16(), IMM8))
}

/// Right-shifts every packed 32-bit integer in `a` by the constant `IMM8`, shifting in sign
/// bits, and merges the result under writemask `k`: an element whose mask bit is not set is
/// copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srai_epi32&expand=5434)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_srai_epi32<const IMM8: u32>(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
) -> __m512i {
    // Compile-time validation of the shift constant.
    static_assert_imm_u8!(IMM8);
    let fallback = src.as_i32x16();
    let shifted = vpsraid512(a.as_i32x16(), IMM8);
    // Elements with a clear bit in `k` come from `src` rather than the shift result.
    transmute(simd_select_bitmask(k, shifted, fallback))
}

/// Right-shifts every packed 32-bit integer in `a` by the constant `IMM8`, shifting in sign
/// bits, and applies zeromask `k`: an element whose mask bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srai_epi32&expand=5435)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_srai_epi32<const IMM8: u32>(k: __mmask16, a: __m512i) -> __m512i {
    // Compile-time validation of the shift constant.
    static_assert_imm_u8!(IMM8);
    let shifted = vpsraid512(a.as_i32x16(), IMM8);
    // An all-zero vector supplies the elements whose bit in `k` is clear.
    let zeroes = _mm512_setzero_si512().as_i32x16();
    transmute(simd_select_bitmask(k, shifted, zeroes))
}

/// Right-shifts every packed 32-bit integer in `a` by the constant `IMM8`, shifting in sign
/// bits, and merges the result under writemask `k`: an element whose mask bit is not set is
/// copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srai_epi32&expand=5431)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_mask_srai_epi32<const IMM8: u32>(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
) -> __m256i {
    let fallback = src.as_i32x8();
    // The shift constant is handed to `psraid256` as an `i32`.
    let shifted = psraid256(a.as_i32x8(), IMM8 as i32);
    // Elements with a clear bit in `k` come from `src` rather than the shift result.
    transmute(simd_select_bitmask(k, shifted, fallback))
}

/// Right-shifts every packed 32-bit integer in `a` by the constant `IMM8`, shifting in sign
/// bits, and applies zeromask `k`: an element whose mask bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srai_epi32&expand=5432)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_maskz_srai_epi32<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i {
    // The shift constant is handed to `psraid256` as an `i32`.
    let shifted = psraid256(a.as_i32x8(), IMM8 as i32);
    // An all-zero vector supplies the elements whose bit in `k` is clear.
    let zeroes = _mm256_setzero_si256().as_i32x8();
    transmute(simd_select_bitmask(k, shifted, zeroes))
}

/// Right-shifts every packed 32-bit integer in `a` by the constant `IMM8`, shifting in sign
/// bits, and merges the result under writemask `k`: an element whose mask bit is not set is
/// copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srai_epi32&expand=5428)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_mask_srai_epi32<const IMM8: u32>(
    src: __m128i,
    k: __mmask8,
    a: __m128i,
) -> __m128i {
    let fallback = src.as_i32x4();
    // The shift constant is handed to `psraid128` as an `i32`.
    let shifted = psraid128(a.as_i32x4(), IMM8 as i32);
    // Elements with a clear bit in `k` come from `src` rather than the shift result.
    transmute(simd_select_bitmask(k, shifted, fallback))
}

/// Right-shifts every packed 32-bit integer in `a` by the constant `IMM8`, shifting in sign
/// bits, and applies zeromask `k`: an element whose mask bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srai_epi32&expand=5429)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_maskz_srai_epi32<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
    // The shift constant is handed to `psraid128` as an `i32`.
    let shifted = psraid128(a.as_i32x4(), IMM8 as i32);
    // An all-zero vector supplies the elements whose bit in `k` is clear.
    let zeroes = _mm_setzero_si128().as_i32x4();
    transmute(simd_select_bitmask(k, shifted, zeroes))
}

/// Right-shifts every packed 64-bit integer in `a` by the constant `IMM8`, shifting in sign
/// bits, and returns the shifted vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srai_epi64&expand=5445)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_srai_epi64<const IMM8: u32>(a: __m512i) -> __m512i {
    // Compile-time validation of the shift constant.
    static_assert_imm_u8!(IMM8);
    transmute(vpsraiq(a.as_i64x8(), IMM8))
}

/// Right-shifts every packed 64-bit integer in `a` by the constant `IMM8`, shifting in sign
/// bits, and merges the result under writemask `k`: an element whose mask bit is not set is
/// copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srai_epi64&expand=5443)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_srai_epi64<const IMM8: u32>(
    src: __m512i,
    k: __mmask8,
    a: __m512i,
) -> __m512i {
    // Compile-time validation of the shift constant.
    static_assert_imm_u8!(IMM8);
    let fallback = src.as_i64x8();
    let shifted = vpsraiq(a.as_i64x8(), IMM8);
    // Elements with a clear bit in `k` come from `src` rather than the shift result.
    transmute(simd_select_bitmask(k, shifted, fallback))
}

/// Right-shifts every packed 64-bit integer in `a` by the constant `IMM8`, shifting in sign
/// bits, and applies zeromask `k`: an element whose mask bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srai_epi64&expand=5444)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_srai_epi64<const IMM8: u32>(k: __mmask8, a: __m512i) -> __m512i {
    // Compile-time validation of the shift constant.
    static_assert_imm_u8!(IMM8);
    let shifted = vpsraiq(a.as_i64x8(), IMM8);
    // An all-zero vector supplies the elements whose bit in `k` is clear.
    let zeroes = _mm512_setzero_si512().as_i64x8();
    transmute(simd_select_bitmask(k, shifted, zeroes))
}

/// Right-shifts every packed 64-bit integer in `a` by the constant `IMM8`, shifting in sign
/// bits, and returns the shifted vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srai_epi64&expand=5442)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm256_srai_epi64<const IMM8: u32>(a: __m256i) -> __m256i {
    // Compile-time validation of the shift constant.
    static_assert_imm_u8!(IMM8);
    transmute(vpsraiq256(a.as_i64x4(), IMM8))
}

/// Right-shifts every packed 64-bit integer in `a` by the constant `IMM8`, shifting in sign
/// bits, and merges the result under writemask `k`: an element whose mask bit is not set is
/// copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srai_epi64&expand=5440)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_mask_srai_epi64<const IMM8: u32>(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
) -> __m256i {
    // Compile-time validation of the shift constant.
    static_assert_imm_u8!(IMM8);
    let fallback = src.as_i64x4();
    let shifted = vpsraiq256(a.as_i64x4(), IMM8);
    // Elements with a clear bit in `k` come from `src` rather than the shift result.
    transmute(simd_select_bitmask(k, shifted, fallback))
}

/// Right-shifts every packed 64-bit integer in `a` by the constant `IMM8`, shifting in sign
/// bits, and applies zeromask `k`: an element whose mask bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srai_epi64&expand=5441)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_maskz_srai_epi64<const IMM8: u32>(k: __mmask8, a: __m256i) -> __m256i {
    // Compile-time validation of the shift constant.
    static_assert_imm_u8!(IMM8);
    let shifted = vpsraiq256(a.as_i64x4(), IMM8);
    // An all-zero vector supplies the elements whose bit in `k` is clear.
    let zeroes = _mm256_setzero_si256().as_i64x4();
    transmute(simd_select_bitmask(k, shifted, zeroes))
}

/// Right-shifts every packed 64-bit integer in `a` by the constant `IMM8`, shifting in sign
/// bits, and returns the shifted vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi64&expand=5439)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm_srai_epi64<const IMM8: u32>(a: __m128i) -> __m128i {
    // Compile-time validation of the shift constant.
    static_assert_imm_u8!(IMM8);
    transmute(vpsraiq128(a.as_i64x2(), IMM8))
}

/// Right-shifts every packed 64-bit integer in `a` by the constant `IMM8`, shifting in sign
/// bits, and merges the result under writemask `k`: an element whose mask bit is not set is
/// copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srai_epi64&expand=5437)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_mask_srai_epi64<const IMM8: u32>(
    src: __m128i,
    k: __mmask8,
    a: __m128i,
) -> __m128i {
    // Compile-time validation of the shift constant.
    static_assert_imm_u8!(IMM8);
    let fallback = src.as_i64x2();
    let shifted = vpsraiq128(a.as_i64x2(), IMM8);
    // Elements with a clear bit in `k` come from `src` rather than the shift result.
    transmute(simd_select_bitmask(k, shifted, fallback))
}

/// Right-shifts every packed 64-bit integer in `a` by the constant `IMM8`, shifting in sign
/// bits, and applies zeromask `k`: an element whose mask bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srai_epi64&expand=5438)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsraq, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_maskz_srai_epi64<const IMM8: u32>(k: __mmask8, a: __m128i) -> __m128i {
    // Compile-time validation of the shift constant.
    static_assert_imm_u8!(IMM8);
    let shifted = vpsraiq128(a.as_i64x2(), IMM8);
    // An all-zero vector supplies the elements whose bit in `k` is clear.
    let zeroes = _mm_setzero_si128().as_i64x2();
    transmute(simd_select_bitmask(k, shifted, zeroes))
}

/// Right-shifts every packed 32-bit integer in `a` by the amount held in the corresponding
/// element of `count`, shifting in sign bits, and returns the shifted vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srav_epi32&expand=5465)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsravd))]
pub unsafe fn _mm512_srav_epi32(a: __m512i, count: __m512i) -> __m512i {
    let lanes = a.as_i32x16();
    let amounts = count.as_i32x16();
    transmute(vpsravd(lanes, amounts))
}

/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srav_epi32&expand=5463)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsravd))]
pub unsafe fn _mm512_mask_srav_epi32(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
    count: __m512i,
) -> __m512i {
    let shf = _mm512_srav_epi32(a, count).as_i32x16();
    transmute(simd_select_bitmask(k, shf, src.as_i32x16()))
}

/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srav_epi32&expand=5464)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsravd))]
pub unsafe fn _mm512_maskz_srav_epi32(k: __mmask16, a: __m512i, count: __m512i) -> __m512i {
    let shf = _mm512_srav_epi32(a, count).as_i32x16();
    let zero = _mm512_setzero_si512().as_i32x16();
    transmute(simd_select_bitmask(k, shf, zero))
}

/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srav_epi32&expand=5460)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsravd))]
pub unsafe fn _mm256_mask_srav_epi32(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    count: __m256i,
) -> __m256i {
    let shf = _mm256_srav_epi32(a, count).as_i32x8();
    transmute(simd_select_bitmask(k, shf, src.as_i32x8()))
}

/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srav_epi32&expand=5461)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsravd))]
pub unsafe fn _mm256_maskz_srav_epi32(k: __mmask8, a: __m256i, count: __m256i) -> __m256i {
    let shf = _mm256_srav_epi32(a, count).as_i32x8();
    let zero = _mm256_setzero_si256().as_i32x8();
    transmute(simd_select_bitmask(k, shf, zero))
}

/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srav_epi32&expand=5457)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsravd))]
pub unsafe fn _mm_mask_srav_epi32(
    src: __m128i,
    k: __mmask8,
    a: __m128i,
    count: __m128i,
) -> __m128i {
    let shf = _mm_srav_epi32(a, count).as_i32x4();
    transmute(simd_select_bitmask(k, shf, src.as_i32x4()))
}

/// Shift packed 32-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srav_epi32&expand=5458)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsravd))]
pub unsafe fn _mm_maskz_srav_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
    let shf = _mm_srav_epi32(a, count).as_i32x4();
    let zero = _mm_setzero_si128().as_i32x4();
    transmute(simd_select_bitmask(k, shf, zero))
}

/// Right-shifts every packed 64-bit integer in `a` by the amount held in the corresponding
/// element of `count`, shifting in sign bits, and returns the shifted vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srav_epi64&expand=5474)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsravq))]
pub unsafe fn _mm512_srav_epi64(a: __m512i, count: __m512i) -> __m512i {
    let lanes = a.as_i64x8();
    let amounts = count.as_i64x8();
    transmute(vpsravq(lanes, amounts))
}

/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srav_epi64&expand=5472)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsravq))]
pub unsafe fn _mm512_mask_srav_epi64(
    src: __m512i,
    k: __mmask8,
    a: __m512i,
    count: __m512i,
) -> __m512i {
    let shf = _mm512_srav_epi64(a, count).as_i64x8();
    transmute(simd_select_bitmask(k, shf, src.as_i64x8()))
}

/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srav_epi64&expand=5473)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsravq))]
pub unsafe fn _mm512_maskz_srav_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m512i {
    let shf = _mm512_srav_epi64(a, count).as_i64x8();
    let zero = _mm512_setzero_si512().as_i64x8();
    transmute(simd_select_bitmask(k, shf, zero))
}

/// Right-shifts every packed 64-bit integer in `a` by the amount held in the corresponding
/// element of `count`, shifting in sign bits, and returns the shifted vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_srav_epi64&expand=5471)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsravq))]
pub unsafe fn _mm256_srav_epi64(a: __m256i, count: __m256i) -> __m256i {
    let lanes = a.as_i64x4();
    let amounts = count.as_i64x4();
    transmute(vpsravq256(lanes, amounts))
}

/// Shift packed 64-bit integers in a right by the amount specified by the corresponding element in count while shifting in sign bits, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srav_epi64&expand=5469)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsravq))]
pub unsafe fn _mm256_mask_srav_epi64(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    count: __m256i,
) -> __m256i {
    let shf = _mm256_srav_epi64(a, count).as_i64x4();
    transmute(simd_select_bitmask(k, shf, src.as_i64x4()))
}

/// Shifts every packed 64-bit integer in `a` right by the amount given in the matching element of `count`, shifting in sign bits, and stores the results in `dst` under zeromask `k` (an element is zeroed when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srav_epi64&expand=5470)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsravq))]
pub unsafe fn _mm256_maskz_srav_epi64(k: __mmask8, a: __m256i, count: __m256i) -> __m256i {
    // Lanes whose mask bit is clear fall back to this all-zero vector.
    let zeroed = _mm256_setzero_si256().as_i64x4();
    let shifted = _mm256_srav_epi64(a, count).as_i64x4();
    transmute(simd_select_bitmask(k, shifted, zeroed))
}

/// Shifts every packed 64-bit integer in `a` right by the amount given in the matching element of `count`, shifting in sign bits, and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srav_epi64&expand=5468)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsravq))]
pub unsafe fn _mm_srav_epi64(a: __m128i, count: __m128i) -> __m128i {
    let lanes = a.as_i64x2();
    let amounts = count.as_i64x2();
    transmute(vpsravq128(lanes, amounts))
}

/// Shifts every packed 64-bit integer in `a` right by the amount given in the matching element of `count`, shifting in sign bits, and stores the results in `dst` under writemask `k` (an element is taken from `src` when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srav_epi64&expand=5466)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsravq))]
pub unsafe fn _mm_mask_srav_epi64(
    src: __m128i,
    k: __mmask8,
    a: __m128i,
    count: __m128i,
) -> __m128i {
    let shifted = _mm_srav_epi64(a, count).as_i64x2();
    // Lanes whose mask bit is clear keep the value from `src`.
    let fallback = src.as_i64x2();
    transmute(simd_select_bitmask(k, shifted, fallback))
}

/// Shifts every packed 64-bit integer in `a` right by the amount given in the matching element of `count`, shifting in sign bits, and stores the results in `dst` under zeromask `k` (an element is zeroed when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srav_epi64&expand=5467)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsravq))]
pub unsafe fn _mm_maskz_srav_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
    // Lanes whose mask bit is clear fall back to this all-zero vector.
    let zeroed = _mm_setzero_si128().as_i64x2();
    let shifted = _mm_srav_epi64(a, count).as_i64x2();
    transmute(simd_select_bitmask(k, shifted, zeroed))
}

/// Rotates every packed 32-bit integer in `a` left by the number of bits given in the matching element of `b`, and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rolv_epi32&expand=4703)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vprolvd))]
pub unsafe fn _mm512_rolv_epi32(a: __m512i, b: __m512i) -> __m512i {
    let lanes = a.as_i32x16();
    let amounts = b.as_i32x16();
    transmute(vprolvd(lanes, amounts))
}

/// Rotates every packed 32-bit integer in `a` left by the number of bits given in the matching element of `b`, and stores the results in `dst` under writemask `k` (an element is taken from `src` when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rolv_epi32&expand=4701)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vprolvd))]
pub unsafe fn _mm512_mask_rolv_epi32(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    let rotated = _mm512_rolv_epi32(a, b).as_i32x16();
    // Lanes whose mask bit is clear keep the value from `src`.
    let fallback = src.as_i32x16();
    transmute(simd_select_bitmask(k, rotated, fallback))
}

/// Rotates every packed 32-bit integer in `a` left by the number of bits given in the matching element of `b`, and stores the results in `dst` under zeromask `k` (an element is zeroed when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rolv_epi32&expand=4702)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vprolvd))]
pub unsafe fn _mm512_maskz_rolv_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    // Lanes whose mask bit is clear fall back to this all-zero vector.
    let zeroed = _mm512_setzero_si512().as_i32x16();
    let rotated = _mm512_rolv_epi32(a, b).as_i32x16();
    transmute(simd_select_bitmask(k, rotated, zeroed))
}

/// Rotates every packed 32-bit integer in `a` left by the number of bits given in the matching element of `b`, and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rolv_epi32&expand=4700)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprolvd))]
pub unsafe fn _mm256_rolv_epi32(a: __m256i, b: __m256i) -> __m256i {
    let lanes = a.as_i32x8();
    let amounts = b.as_i32x8();
    transmute(vprolvd256(lanes, amounts))
}

/// Rotates every packed 32-bit integer in `a` left by the number of bits given in the matching element of `b`, and stores the results in `dst` under writemask `k` (an element is taken from `src` when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rolv_epi32&expand=4698)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprolvd))]
pub unsafe fn _mm256_mask_rolv_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    let rotated = _mm256_rolv_epi32(a, b).as_i32x8();
    // Lanes whose mask bit is clear keep the value from `src`.
    let fallback = src.as_i32x8();
    transmute(simd_select_bitmask(k, rotated, fallback))
}

/// Rotates every packed 32-bit integer in `a` left by the number of bits given in the matching element of `b`, and stores the results in `dst` under zeromask `k` (an element is zeroed when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rolv_epi32&expand=4699)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprolvd))]
pub unsafe fn _mm256_maskz_rolv_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Lanes whose mask bit is clear fall back to this all-zero vector.
    let zeroed = _mm256_setzero_si256().as_i32x8();
    let rotated = _mm256_rolv_epi32(a, b).as_i32x8();
    transmute(simd_select_bitmask(k, rotated, zeroed))
}

/// Rotates every packed 32-bit integer in `a` left by the number of bits given in the matching element of `b`, and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rolv_epi32&expand=4697)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprolvd))]
pub unsafe fn _mm_rolv_epi32(a: __m128i, b: __m128i) -> __m128i {
    let lanes = a.as_i32x4();
    let amounts = b.as_i32x4();
    transmute(vprolvd128(lanes, amounts))
}

/// Rotates every packed 32-bit integer in `a` left by the number of bits given in the matching element of `b`, and stores the results in `dst` under writemask `k` (an element is taken from `src` when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rolv_epi32&expand=4695)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprolvd))]
pub unsafe fn _mm_mask_rolv_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let rotated = _mm_rolv_epi32(a, b).as_i32x4();
    // Lanes whose mask bit is clear keep the value from `src`.
    let fallback = src.as_i32x4();
    transmute(simd_select_bitmask(k, rotated, fallback))
}

/// Rotates every packed 32-bit integer in `a` left by the number of bits given in the matching element of `b`, and stores the results in `dst` under zeromask `k` (an element is zeroed when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rolv_epi32&expand=4696)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprolvd))]
pub unsafe fn _mm_maskz_rolv_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Lanes whose mask bit is clear fall back to this all-zero vector.
    let zeroed = _mm_setzero_si128().as_i32x4();
    let rotated = _mm_rolv_epi32(a, b).as_i32x4();
    transmute(simd_select_bitmask(k, rotated, zeroed))
}

/// Rotates every packed 32-bit integer in `a` right by the number of bits given in the matching element of `b`, and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rorv_epi32&expand=4739)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vprorvd))]
pub unsafe fn _mm512_rorv_epi32(a: __m512i, b: __m512i) -> __m512i {
    let lanes = a.as_i32x16();
    let amounts = b.as_i32x16();
    transmute(vprorvd(lanes, amounts))
}

/// Rotates every packed 32-bit integer in `a` right by the number of bits given in the matching element of `b`, and stores the results in `dst` under writemask `k` (an element is taken from `src` when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rorv_epi32&expand=4737)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vprorvd))]
pub unsafe fn _mm512_mask_rorv_epi32(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    let rotated = _mm512_rorv_epi32(a, b).as_i32x16();
    // Lanes whose mask bit is clear keep the value from `src`.
    let fallback = src.as_i32x16();
    transmute(simd_select_bitmask(k, rotated, fallback))
}

/// Rotates every packed 32-bit integer in `a` right by the number of bits given in the matching element of `b`, and stores the results in `dst` under zeromask `k` (an element is zeroed when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rorv_epi32&expand=4738)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vprorvd))]
pub unsafe fn _mm512_maskz_rorv_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    // Lanes whose mask bit is clear fall back to this all-zero vector.
    let zeroed = _mm512_setzero_si512().as_i32x16();
    let rotated = _mm512_rorv_epi32(a, b).as_i32x16();
    transmute(simd_select_bitmask(k, rotated, zeroed))
}

/// Rotates every packed 32-bit integer in `a` right by the number of bits given in the matching element of `b`, and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rorv_epi32&expand=4736)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprorvd))]
pub unsafe fn _mm256_rorv_epi32(a: __m256i, b: __m256i) -> __m256i {
    let lanes = a.as_i32x8();
    let amounts = b.as_i32x8();
    transmute(vprorvd256(lanes, amounts))
}

/// Rotates every packed 32-bit integer in `a` right by the number of bits given in the matching element of `b`, and stores the results in `dst` under writemask `k` (an element is taken from `src` when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rorv_epi32&expand=4734)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprorvd))]
pub unsafe fn _mm256_mask_rorv_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    let rotated = _mm256_rorv_epi32(a, b).as_i32x8();
    // Lanes whose mask bit is clear keep the value from `src`.
    let fallback = src.as_i32x8();
    transmute(simd_select_bitmask(k, rotated, fallback))
}

/// Rotates every packed 32-bit integer in `a` right by the number of bits given in the matching element of `b`, and stores the results in `dst` under zeromask `k` (an element is zeroed when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rorv_epi32&expand=4735)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprorvd))]
pub unsafe fn _mm256_maskz_rorv_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Lanes whose mask bit is clear fall back to this all-zero vector.
    let zeroed = _mm256_setzero_si256().as_i32x8();
    let rotated = _mm256_rorv_epi32(a, b).as_i32x8();
    transmute(simd_select_bitmask(k, rotated, zeroed))
}

/// Rotates every packed 32-bit integer in `a` right by the number of bits given in the matching element of `b`, and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rorv_epi32&expand=4733)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprorvd))]
pub unsafe fn _mm_rorv_epi32(a: __m128i, b: __m128i) -> __m128i {
    let lanes = a.as_i32x4();
    let amounts = b.as_i32x4();
    transmute(vprorvd128(lanes, amounts))
}

/// Rotates every packed 32-bit integer in `a` right by the number of bits given in the matching element of `b`, and stores the results in `dst` under writemask `k` (an element is taken from `src` when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rorv_epi32&expand=4731)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprorvd))]
pub unsafe fn _mm_mask_rorv_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let rotated = _mm_rorv_epi32(a, b).as_i32x4();
    // Lanes whose mask bit is clear keep the value from `src`.
    let fallback = src.as_i32x4();
    transmute(simd_select_bitmask(k, rotated, fallback))
}

/// Rotates every packed 32-bit integer in `a` right by the number of bits given in the matching element of `b`, and stores the results in `dst` under zeromask `k` (an element is zeroed when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rorv_epi32&expand=4732)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprorvd))]
pub unsafe fn _mm_maskz_rorv_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Lanes whose mask bit is clear fall back to this all-zero vector.
    let zeroed = _mm_setzero_si128().as_i32x4();
    let rotated = _mm_rorv_epi32(a, b).as_i32x4();
    transmute(simd_select_bitmask(k, rotated, zeroed))
}

/// Rotates every packed 64-bit integer in `a` left by the number of bits given in the matching element of `b`, and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rolv_epi64&expand=4712)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vprolvq))]
pub unsafe fn _mm512_rolv_epi64(a: __m512i, b: __m512i) -> __m512i {
    let lanes = a.as_i64x8();
    let amounts = b.as_i64x8();
    transmute(vprolvq(lanes, amounts))
}

/// Rotates every packed 64-bit integer in `a` left by the number of bits given in the matching element of `b`, and stores the results in `dst` under writemask `k` (an element is taken from `src` when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rolv_epi64&expand=4710)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vprolvq))]
pub unsafe fn _mm512_mask_rolv_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    let rotated = _mm512_rolv_epi64(a, b).as_i64x8();
    // Lanes whose mask bit is clear keep the value from `src`.
    let fallback = src.as_i64x8();
    transmute(simd_select_bitmask(k, rotated, fallback))
}

/// Rotates every packed 64-bit integer in `a` left by the number of bits given in the matching element of `b`, and stores the results in `dst` under zeromask `k` (an element is zeroed when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rolv_epi64&expand=4711)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vprolvq))]
pub unsafe fn _mm512_maskz_rolv_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    // Lanes whose mask bit is clear fall back to this all-zero vector.
    let zeroed = _mm512_setzero_si512().as_i64x8();
    let rotated = _mm512_rolv_epi64(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, rotated, zeroed))
}

/// Rotates every packed 64-bit integer in `a` left by the number of bits given in the matching element of `b`, and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rolv_epi64&expand=4709)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprolvq))]
pub unsafe fn _mm256_rolv_epi64(a: __m256i, b: __m256i) -> __m256i {
    let lanes = a.as_i64x4();
    let amounts = b.as_i64x4();
    transmute(vprolvq256(lanes, amounts))
}

/// Rotates every packed 64-bit integer in `a` left by the number of bits given in the matching element of `b`, and stores the results in `dst` under writemask `k` (an element is taken from `src` when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rolv_epi64&expand=4707)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprolvq))]
pub unsafe fn _mm256_mask_rolv_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    let rotated = _mm256_rolv_epi64(a, b).as_i64x4();
    // Lanes whose mask bit is clear keep the value from `src`.
    let fallback = src.as_i64x4();
    transmute(simd_select_bitmask(k, rotated, fallback))
}

/// Rotates every packed 64-bit integer in `a` left by the number of bits given in the matching element of `b`, and stores the results in `dst` under zeromask `k` (an element is zeroed when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rolv_epi64&expand=4708)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprolvq))]
pub unsafe fn _mm256_maskz_rolv_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Lanes whose mask bit is clear fall back to this all-zero vector.
    let zeroed = _mm256_setzero_si256().as_i64x4();
    let rotated = _mm256_rolv_epi64(a, b).as_i64x4();
    transmute(simd_select_bitmask(k, rotated, zeroed))
}

/// Rotates every packed 64-bit integer in `a` left by the number of bits given in the matching element of `b`, and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rolv_epi64&expand=4706)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprolvq))]
pub unsafe fn _mm_rolv_epi64(a: __m128i, b: __m128i) -> __m128i {
    let lanes = a.as_i64x2();
    let amounts = b.as_i64x2();
    transmute(vprolvq128(lanes, amounts))
}

/// Rotates every packed 64-bit integer in `a` left by the number of bits given in the matching element of `b`, and stores the results in `dst` under writemask `k` (an element is taken from `src` when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rolv_epi64&expand=4704)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprolvq))]
pub unsafe fn _mm_mask_rolv_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let rotated = _mm_rolv_epi64(a, b).as_i64x2();
    // Lanes whose mask bit is clear keep the value from `src`.
    let fallback = src.as_i64x2();
    transmute(simd_select_bitmask(k, rotated, fallback))
}

/// Rotates every packed 64-bit integer in `a` left by the number of bits given in the matching element of `b`, and stores the results in `dst` under zeromask `k` (an element is zeroed when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rolv_epi64&expand=4705)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprolvq))]
pub unsafe fn _mm_maskz_rolv_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Lanes whose mask bit is clear fall back to this all-zero vector.
    let zeroed = _mm_setzero_si128().as_i64x2();
    let rotated = _mm_rolv_epi64(a, b).as_i64x2();
    transmute(simd_select_bitmask(k, rotated, zeroed))
}

/// Rotates every packed 64-bit integer in `a` right by the number of bits given in the matching element of `b`, and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_rorv_epi64&expand=4748)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vprorvq))]
pub unsafe fn _mm512_rorv_epi64(a: __m512i, b: __m512i) -> __m512i {
    let lanes = a.as_i64x8();
    let amounts = b.as_i64x8();
    transmute(vprorvq(lanes, amounts))
}

/// Rotates every packed 64-bit integer in `a` right by the number of bits given in the matching element of `b`, and stores the results in `dst` under writemask `k` (an element is taken from `src` when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_rorv_epi64&expand=4746)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vprorvq))]
pub unsafe fn _mm512_mask_rorv_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    let rotated = _mm512_rorv_epi64(a, b).as_i64x8();
    // Lanes whose mask bit is clear keep the value from `src`.
    let fallback = src.as_i64x8();
    transmute(simd_select_bitmask(k, rotated, fallback))
}

/// Rotates every packed 64-bit integer in `a` right by the number of bits given in the matching element of `b`, and stores the results in `dst` under zeromask `k` (an element is zeroed when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_rorv_epi64&expand=4747)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vprorvq))]
pub unsafe fn _mm512_maskz_rorv_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    // Lanes whose mask bit is clear fall back to this all-zero vector.
    let zeroed = _mm512_setzero_si512().as_i64x8();
    let rotated = _mm512_rorv_epi64(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, rotated, zeroed))
}

/// Rotates every packed 64-bit integer in `a` right by the number of bits given in the matching element of `b`, and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_rorv_epi64&expand=4745)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprorvq))]
pub unsafe fn _mm256_rorv_epi64(a: __m256i, b: __m256i) -> __m256i {
    let lanes = a.as_i64x4();
    let amounts = b.as_i64x4();
    transmute(vprorvq256(lanes, amounts))
}

/// Rotates every packed 64-bit integer in `a` right by the number of bits given in the matching element of `b`, and stores the results in `dst` under writemask `k` (an element is taken from `src` when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_rorv_epi64&expand=4743)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprorvq))]
pub unsafe fn _mm256_mask_rorv_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    let rotated = _mm256_rorv_epi64(a, b).as_i64x4();
    // Lanes whose mask bit is clear keep the value from `src`.
    let fallback = src.as_i64x4();
    transmute(simd_select_bitmask(k, rotated, fallback))
}

/// Rotates every packed 64-bit integer in `a` right by the number of bits given in the matching element of `b`, and stores the results in `dst` under zeromask `k` (an element is zeroed when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_rorv_epi64&expand=4744)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprorvq))]
pub unsafe fn _mm256_maskz_rorv_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Lanes whose mask bit is clear fall back to this all-zero vector.
    let zeroed = _mm256_setzero_si256().as_i64x4();
    let rotated = _mm256_rorv_epi64(a, b).as_i64x4();
    transmute(simd_select_bitmask(k, rotated, zeroed))
}

/// Rotates every packed 64-bit integer in `a` right by the number of bits given in the matching element of `b`, and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rorv_epi64&expand=4742)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprorvq))]
pub unsafe fn _mm_rorv_epi64(a: __m128i, b: __m128i) -> __m128i {
    let lanes = a.as_i64x2();
    let amounts = b.as_i64x2();
    transmute(vprorvq128(lanes, amounts))
}

/// Rotates every packed 64-bit integer in `a` right by the number of bits given in the matching element of `b`, and stores the results in `dst` under writemask `k` (an element is taken from `src` when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_rorv_epi64&expand=4740)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprorvq))]
pub unsafe fn _mm_mask_rorv_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let rotated = _mm_rorv_epi64(a, b).as_i64x2();
    // Lanes whose mask bit is clear keep the value from `src`.
    let fallback = src.as_i64x2();
    transmute(simd_select_bitmask(k, rotated, fallback))
}

/// Rotates every packed 64-bit integer in `a` right by the number of bits given in the matching element of `b`, and stores the results in `dst` under zeromask `k` (an element is zeroed when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_rorv_epi64&expand=4741)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vprorvq))]
pub unsafe fn _mm_maskz_rorv_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Lanes whose mask bit is clear fall back to this all-zero vector.
    let zeroed = _mm_setzero_si128().as_i64x2();
    let rotated = _mm_rorv_epi64(a, b).as_i64x2();
    transmute(simd_select_bitmask(k, rotated, zeroed))
}

/// Shifts every packed 32-bit integer in `a` left by the amount given in the matching element of `count`, shifting in zeros, and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sllv_epi32&expand=5342)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsllvd))]
pub unsafe fn _mm512_sllv_epi32(a: __m512i, count: __m512i) -> __m512i {
    let lanes = a.as_i32x16();
    let amounts = count.as_i32x16();
    transmute(vpsllvd(lanes, amounts))
}

/// Shifts every packed 32-bit integer in `a` left by the amount given in the matching element of `count`, shifting in zeros, and stores the results in `dst` under writemask `k` (an element is taken from `src` when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sllv_epi32&expand=5340)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsllvd))]
pub unsafe fn _mm512_mask_sllv_epi32(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
    count: __m512i,
) -> __m512i {
    let shifted = _mm512_sllv_epi32(a, count).as_i32x16();
    // Lanes whose mask bit is clear keep the value from `src`.
    let fallback = src.as_i32x16();
    transmute(simd_select_bitmask(k, shifted, fallback))
}

/// Shifts every packed 32-bit integer in `a` left by the amount given in the matching element of `count`, shifting in zeros, and stores the results in `dst` under zeromask `k` (an element is zeroed when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sllv_epi32&expand=5341)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsllvd))]
pub unsafe fn _mm512_maskz_sllv_epi32(k: __mmask16, a: __m512i, count: __m512i) -> __m512i {
    // Lanes whose mask bit is clear fall back to this all-zero vector.
    let zeroed = _mm512_setzero_si512().as_i32x16();
    let shifted = _mm512_sllv_epi32(a, count).as_i32x16();
    transmute(simd_select_bitmask(k, shifted, zeroed))
}

/// Shifts every packed 32-bit integer in `a` left by the amount given in the matching element of `count`, shifting in zeros, and stores the results in `dst` under writemask `k` (an element is taken from `src` when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sllv_epi32&expand=5337)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsllvd))]
pub unsafe fn _mm256_mask_sllv_epi32(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    count: __m256i,
) -> __m256i {
    let shifted = _mm256_sllv_epi32(a, count).as_i32x8();
    // Lanes whose mask bit is clear keep the value from `src`.
    let fallback = src.as_i32x8();
    transmute(simd_select_bitmask(k, shifted, fallback))
}

/// Shifts every packed 32-bit integer in `a` left by the amount given in the matching element of `count`, shifting in zeros, and stores the results in `dst` under zeromask `k` (an element is zeroed when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sllv_epi32&expand=5338)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsllvd))]
pub unsafe fn _mm256_maskz_sllv_epi32(k: __mmask8, a: __m256i, count: __m256i) -> __m256i {
    // Lanes whose mask bit is clear fall back to this all-zero vector.
    let zeroed = _mm256_setzero_si256().as_i32x8();
    let shifted = _mm256_sllv_epi32(a, count).as_i32x8();
    transmute(simd_select_bitmask(k, shifted, zeroed))
}

/// Shifts every packed 32-bit integer in `a` left by the amount given in the matching element of `count`, shifting in zeros, and stores the results in `dst` under writemask `k` (an element is taken from `src` when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sllv_epi32&expand=5334)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsllvd))]
pub unsafe fn _mm_mask_sllv_epi32(
    src: __m128i,
    k: __mmask8,
    a: __m128i,
    count: __m128i,
) -> __m128i {
    let shifted = _mm_sllv_epi32(a, count).as_i32x4();
    // Lanes whose mask bit is clear keep the value from `src`.
    let fallback = src.as_i32x4();
    transmute(simd_select_bitmask(k, shifted, fallback))
}

/// Shifts every packed 32-bit integer in `a` left by the amount given in the matching element of `count`, shifting in zeros, and stores the results in `dst` under zeromask `k` (an element is zeroed when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sllv_epi32&expand=5335)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsllvd))]
pub unsafe fn _mm_maskz_sllv_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
    // Lanes whose mask bit is clear fall back to this all-zero vector.
    let zeroed = _mm_setzero_si128().as_i32x4();
    let shifted = _mm_sllv_epi32(a, count).as_i32x4();
    transmute(simd_select_bitmask(k, shifted, zeroed))
}

/// Shifts every packed 32-bit integer in `a` right by the amount given in the matching element of `count`, shifting in zeros, and stores the results in `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srlv_epi32&expand=5554)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsrlvd))]
pub unsafe fn _mm512_srlv_epi32(a: __m512i, count: __m512i) -> __m512i {
    let lanes = a.as_i32x16();
    let amounts = count.as_i32x16();
    transmute(vpsrlvd(lanes, amounts))
}

/// Shifts every packed 32-bit integer in `a` right by the amount given in the matching element of `count`, shifting in zeros, and stores the results in `dst` under writemask `k` (an element is taken from `src` when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srlv_epi32&expand=5552)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsrlvd))]
pub unsafe fn _mm512_mask_srlv_epi32(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
    count: __m512i,
) -> __m512i {
    let shifted = _mm512_srlv_epi32(a, count).as_i32x16();
    // Lanes whose mask bit is clear keep the value from `src`.
    let fallback = src.as_i32x16();
    transmute(simd_select_bitmask(k, shifted, fallback))
}

/// Shifts every packed 32-bit integer in `a` right by the amount given in the matching element of `count`, shifting in zeros, and stores the results in `dst` under zeromask `k` (an element is zeroed when its mask bit is clear).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srlv_epi32&expand=5553)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsrlvd))]
pub unsafe fn _mm512_maskz_srlv_epi32(k: __mmask16, a: __m512i, count: __m512i) -> __m512i {
    // Lanes whose mask bit is clear fall back to this all-zero vector.
    let zeroed = _mm512_setzero_si512().as_i32x16();
    let shifted = _mm512_srlv_epi32(a, count).as_i32x16();
    transmute(simd_select_bitmask(k, shifted, zeroed))
}

/// Right-shifts every packed 32-bit integer in `a` by the amount held in the matching element of `count`, filling vacated bits with zeros. Elements whose bit in writemask `k` is clear are copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srlv_epi32&expand=5549)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrlvd))]
pub unsafe fn _mm256_mask_srlv_epi32(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    count: __m256i,
) -> __m256i {
    let fallback = src.as_i32x8();
    let shifted = _mm256_srlv_epi32(a, count).as_i32x8();
    // Take the shifted element where `k` is set, the `src` element otherwise.
    transmute(simd_select_bitmask(k, shifted, fallback))
}

/// Right-shifts every packed 32-bit integer in `a` by the amount held in the matching element of `count`, filling vacated bits with zeros. Elements whose bit in zeromask `k` is clear are set to zero in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srlv_epi32&expand=5550)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrlvd))]
pub unsafe fn _mm256_maskz_srlv_epi32(k: __mmask8, a: __m256i, count: __m256i) -> __m256i {
    let shifted = _mm256_srlv_epi32(a, count).as_i32x8();
    // Elements not selected by `k` fall back to an all-zero vector.
    let blended = simd_select_bitmask(k, shifted, _mm256_setzero_si256().as_i32x8());
    transmute(blended)
}

/// Right-shifts every packed 32-bit integer in `a` by the amount held in the matching element of `count`, filling vacated bits with zeros. Elements whose bit in writemask `k` is clear are copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srlv_epi32&expand=5546)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrlvd))]
pub unsafe fn _mm_mask_srlv_epi32(
    src: __m128i,
    k: __mmask8,
    a: __m128i,
    count: __m128i,
) -> __m128i {
    let fallback = src.as_i32x4();
    let shifted = _mm_srlv_epi32(a, count).as_i32x4();
    // Take the shifted element where `k` is set, the `src` element otherwise.
    transmute(simd_select_bitmask(k, shifted, fallback))
}

/// Right-shifts every packed 32-bit integer in `a` by the amount held in the matching element of `count`, filling vacated bits with zeros. Elements whose bit in zeromask `k` is clear are set to zero in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srlv_epi32&expand=5547)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrlvd))]
pub unsafe fn _mm_maskz_srlv_epi32(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
    let shifted = _mm_srlv_epi32(a, count).as_i32x4();
    // Elements not selected by `k` fall back to an all-zero vector.
    let blended = simd_select_bitmask(k, shifted, _mm_setzero_si128().as_i32x4());
    transmute(blended)
}

/// Left-shifts every packed 64-bit integer in `a` by the amount held in the matching element of `count`, filling vacated bits with zeros, and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_sllv_epi64&expand=5351)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsllvq))]
pub unsafe fn _mm512_sllv_epi64(a: __m512i, count: __m512i) -> __m512i {
    let values = a.as_i64x8();
    let amounts = count.as_i64x8();
    transmute(vpsllvq(values, amounts))
}

/// Left-shifts every packed 64-bit integer in `a` by the amount held in the matching element of `count`, filling vacated bits with zeros. Elements whose bit in writemask `k` is clear are copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_sllv_epi64&expand=5349)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsllvq))]
pub unsafe fn _mm512_mask_sllv_epi64(
    src: __m512i,
    k: __mmask8,
    a: __m512i,
    count: __m512i,
) -> __m512i {
    let fallback = src.as_i64x8();
    let shifted = _mm512_sllv_epi64(a, count).as_i64x8();
    // Take the shifted element where `k` is set, the `src` element otherwise.
    transmute(simd_select_bitmask(k, shifted, fallback))
}

/// Left-shifts every packed 64-bit integer in `a` by the amount held in the matching element of `count`, filling vacated bits with zeros. Elements whose bit in zeromask `k` is clear are set to zero in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_sllv_epi64&expand=5350)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsllvq))]
pub unsafe fn _mm512_maskz_sllv_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m512i {
    let shifted = _mm512_sllv_epi64(a, count).as_i64x8();
    // Elements not selected by `k` fall back to an all-zero vector.
    let blended = simd_select_bitmask(k, shifted, _mm512_setzero_si512().as_i64x8());
    transmute(blended)
}

/// Left-shifts every packed 64-bit integer in `a` by the amount held in the matching element of `count`, filling vacated bits with zeros. Elements whose bit in writemask `k` is clear are copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_sllv_epi64&expand=5346)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsllvq))]
pub unsafe fn _mm256_mask_sllv_epi64(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    count: __m256i,
) -> __m256i {
    let fallback = src.as_i64x4();
    let shifted = _mm256_sllv_epi64(a, count).as_i64x4();
    // Take the shifted element where `k` is set, the `src` element otherwise.
    transmute(simd_select_bitmask(k, shifted, fallback))
}

/// Left-shifts every packed 64-bit integer in `a` by the amount held in the matching element of `count`, filling vacated bits with zeros. Elements whose bit in zeromask `k` is clear are set to zero in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_sllv_epi64&expand=5347)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsllvq))]
pub unsafe fn _mm256_maskz_sllv_epi64(k: __mmask8, a: __m256i, count: __m256i) -> __m256i {
    let shifted = _mm256_sllv_epi64(a, count).as_i64x4();
    // Elements not selected by `k` fall back to an all-zero vector.
    let blended = simd_select_bitmask(k, shifted, _mm256_setzero_si256().as_i64x4());
    transmute(blended)
}

/// Left-shifts every packed 64-bit integer in `a` by the amount held in the matching element of `count`, filling vacated bits with zeros. Elements whose bit in writemask `k` is clear are copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_sllv_epi64&expand=5343)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsllvq))]
pub unsafe fn _mm_mask_sllv_epi64(
    src: __m128i,
    k: __mmask8,
    a: __m128i,
    count: __m128i,
) -> __m128i {
    let fallback = src.as_i64x2();
    let shifted = _mm_sllv_epi64(a, count).as_i64x2();
    // Take the shifted element where `k` is set, the `src` element otherwise.
    transmute(simd_select_bitmask(k, shifted, fallback))
}

/// Left-shifts every packed 64-bit integer in `a` by the amount held in the matching element of `count`, filling vacated bits with zeros. Elements whose bit in zeromask `k` is clear are set to zero in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_sllv_epi64&expand=5344)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsllvq))]
pub unsafe fn _mm_maskz_sllv_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
    let shifted = _mm_sllv_epi64(a, count).as_i64x2();
    // Elements not selected by `k` fall back to an all-zero vector.
    let blended = simd_select_bitmask(k, shifted, _mm_setzero_si128().as_i64x2());
    transmute(blended)
}

/// Right-shifts every packed 64-bit integer in `a` by the amount held in the matching element of `count`, filling vacated bits with zeros, and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_srlv_epi64&expand=5563)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsrlvq))]
pub unsafe fn _mm512_srlv_epi64(a: __m512i, count: __m512i) -> __m512i {
    let values = a.as_i64x8();
    let amounts = count.as_i64x8();
    transmute(vpsrlvq(values, amounts))
}

/// Right-shifts every packed 64-bit integer in `a` by the amount held in the matching element of `count`, filling vacated bits with zeros. Elements whose bit in writemask `k` is clear are copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_srlv_epi64&expand=5561)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsrlvq))]
pub unsafe fn _mm512_mask_srlv_epi64(
    src: __m512i,
    k: __mmask8,
    a: __m512i,
    count: __m512i,
) -> __m512i {
    let fallback = src.as_i64x8();
    let shifted = _mm512_srlv_epi64(a, count).as_i64x8();
    // Take the shifted element where `k` is set, the `src` element otherwise.
    transmute(simd_select_bitmask(k, shifted, fallback))
}

/// Right-shifts every packed 64-bit integer in `a` by the amount held in the matching element of `count`, filling vacated bits with zeros. Elements whose bit in zeromask `k` is clear are set to zero in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_srlv_epi64&expand=5562)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpsrlvq))]
pub unsafe fn _mm512_maskz_srlv_epi64(k: __mmask8, a: __m512i, count: __m512i) -> __m512i {
    let shifted = _mm512_srlv_epi64(a, count).as_i64x8();
    // Elements not selected by `k` fall back to an all-zero vector.
    let blended = simd_select_bitmask(k, shifted, _mm512_setzero_si512().as_i64x8());
    transmute(blended)
}

/// Right-shifts every packed 64-bit integer in `a` by the amount held in the matching element of `count`, filling vacated bits with zeros. Elements whose bit in writemask `k` is clear are copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_srlv_epi64&expand=5558)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrlvq))]
pub unsafe fn _mm256_mask_srlv_epi64(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    count: __m256i,
) -> __m256i {
    let fallback = src.as_i64x4();
    let shifted = _mm256_srlv_epi64(a, count).as_i64x4();
    // Take the shifted element where `k` is set, the `src` element otherwise.
    transmute(simd_select_bitmask(k, shifted, fallback))
}

/// Right-shifts every packed 64-bit integer in `a` by the amount held in the matching element of `count`, filling vacated bits with zeros. Elements whose bit in zeromask `k` is clear are set to zero in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_srlv_epi64&expand=5559)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrlvq))]
pub unsafe fn _mm256_maskz_srlv_epi64(k: __mmask8, a: __m256i, count: __m256i) -> __m256i {
    let shifted = _mm256_srlv_epi64(a, count).as_i64x4();
    // Elements not selected by `k` fall back to an all-zero vector.
    let blended = simd_select_bitmask(k, shifted, _mm256_setzero_si256().as_i64x4());
    transmute(blended)
}

/// Right-shifts every packed 64-bit integer in `a` by the amount held in the matching element of `count`, filling vacated bits with zeros. Elements whose bit in writemask `k` is clear are copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_srlv_epi64&expand=5555)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrlvq))]
pub unsafe fn _mm_mask_srlv_epi64(
    src: __m128i,
    k: __mmask8,
    a: __m128i,
    count: __m128i,
) -> __m128i {
    let fallback = src.as_i64x2();
    let shifted = _mm_srlv_epi64(a, count).as_i64x2();
    // Take the shifted element where `k` is set, the `src` element otherwise.
    transmute(simd_select_bitmask(k, shifted, fallback))
}

/// Right-shifts every packed 64-bit integer in `a` by the amount held in the matching element of `count`, filling vacated bits with zeros. Elements whose bit in zeromask `k` is clear are set to zero in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_srlv_epi64&expand=5556)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpsrlvq))]
pub unsafe fn _mm_maskz_srlv_epi64(k: __mmask8, a: __m128i, count: __m128i) -> __m128i {
    let shifted = _mm_srlv_epi64(a, count).as_i64x2();
    // Elements not selected by `k` fall back to an all-zero vector.
    let blended = simd_select_bitmask(k, shifted, _mm_setzero_si128().as_i64x2());
    transmute(blended)
}

/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
///
/// `MASK` is read as four 2-bit fields (bits 1:0, 3:2, 5:4 and 7:6). Within every group of four
/// elements, output position `i` takes the source element chosen by field `i`; the same four
/// fields are applied to each of the four groups.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permute_ps&expand=4170)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermilps, MASK = 0b11_00_01_11))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_permute_ps<const MASK: i32>(a: __m512) -> __m512 {
    // Compile-time check of the immediate.
    static_assert_imm8!(MASK);
    // Both shuffle inputs are `a`; every index below stays in 0..16.
    simd_shuffle16!(
        a,
        a,
        <const MASK: i32> [
            // Elements 0..=3: source indices 0..=3.
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            ((MASK as u32 >> 4) & 0b11),
            ((MASK as u32 >> 6) & 0b11),
            // Elements 4..=7: same fields, offset into source indices 4..=7.
            (MASK as u32 & 0b11) + 4,
            ((MASK as u32 >> 2) & 0b11) + 4,
            ((MASK as u32 >> 4) & 0b11) + 4,
            ((MASK as u32 >> 6) & 0b11) + 4,
            // Elements 8..=11: same fields, offset into source indices 8..=11.
            (MASK as u32 & 0b11) + 8,
            ((MASK as u32 >> 2) & 0b11) + 8,
            ((MASK as u32 >> 4) & 0b11) + 8,
            ((MASK as u32 >> 6) & 0b11) + 8,
            // Elements 12..=15: same fields, offset into source indices 12..=15.
            (MASK as u32 & 0b11) + 12,
            ((MASK as u32 >> 2) & 0b11) + 12,
            ((MASK as u32 >> 4) & 0b11) + 12,
            ((MASK as u32 >> 6) & 0b11) + 12,
        ],
    )
}

/// Permutes the single-precision (32-bit) floating-point elements of `a` within each 128-bit lane according to the immediate `MASK`. Elements whose bit in writemask `k` is clear are copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permute_ps&expand=4168)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermilps, MASK = 0b11_00_01_11))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_permute_ps<const MASK: i32>(
    src: __m512,
    k: __mmask16,
    a: __m512,
) -> __m512 {
    static_assert_imm8!(MASK);
    let fallback = src.as_f32x16();
    let permuted = _mm512_permute_ps::<MASK>(a).as_f32x16();
    // Take the permuted element where `k` is set, the `src` element otherwise.
    transmute(simd_select_bitmask(k, permuted, fallback))
}

/// Permutes the single-precision (32-bit) floating-point elements of `a` within each 128-bit lane according to the immediate `MASK`. Elements whose bit in zeromask `k` is clear are set to zero in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permute_ps&expand=4169)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermilps, MASK = 0b11_00_01_11))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_permute_ps<const MASK: i32>(k: __mmask16, a: __m512) -> __m512 {
    static_assert_imm8!(MASK);
    let permuted = _mm512_permute_ps::<MASK>(a).as_f32x16();
    // Elements not selected by `k` fall back to an all-zero vector.
    let blended = simd_select_bitmask(k, permuted, _mm512_setzero_ps().as_f32x16());
    transmute(blended)
}

/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permute_ps&expand=4165)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermilps, MASK = 0b11_00_01_11))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_mask_permute_ps<const MASK: i32>(
    src: __m256,
    k: __mmask8,
    a: __m256,
) -> __m256 {
    let r = _mm256_permute_ps::<MASK>(a);
    transmute(simd_select_bitmask(k, r.as_f32x8(), src.as_f32x8()))
}

/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permute_ps&expand=4166)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermilps, MASK = 0b11_00_01_11))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_maskz_permute_ps<const MASK: i32>(k: __mmask8, a: __m256) -> __m256 {
    let r = _mm256_permute_ps::<MASK>(a);
    let zero = _mm256_setzero_ps().as_f32x8();
    transmute(simd_select_bitmask(k, r.as_f32x8(), zero))
}

/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_permute_ps&expand=4162)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermilps, MASK = 0b11_00_01_11))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_mask_permute_ps<const MASK: i32>(src: __m128, k: __mmask8, a: __m128) -> __m128 {
    let r = _mm_permute_ps::<MASK>(a);
    transmute(simd_select_bitmask(k, r.as_f32x4(), src.as_f32x4()))
}

/// Shuffle single-precision (32-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_permute_ps&expand=4163)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermilps, MASK = 0b11_00_01_11))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_maskz_permute_ps<const MASK: i32>(k: __mmask8, a: __m128) -> __m128 {
    let r = _mm_permute_ps::<MASK>(a);
    let zero = _mm_setzero_ps().as_f32x4();
    transmute(simd_select_bitmask(k, r.as_f32x4(), zero))
}

/// Shuffle double-precision (64-bit) floating-point elements in a within 128-bit lanes using the control in imm8, and store the results in dst.
///
/// Each of the eight bits of `MASK` controls one output element: bit `i` picks the lower (0) or
/// upper (1) element of the pair that output element `i` belongs to.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permute_pd&expand=4161)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermilpd, MASK = 0b11_01_10_01))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_permute_pd<const MASK: i32>(a: __m512d) -> __m512d {
    // Compile-time check of the immediate.
    static_assert_imm8!(MASK);
    // Both shuffle inputs are `a`; every index below stays in 0..8.
    simd_shuffle8!(
        a,
        a,
        <const MASK: i32> [
            // Elements 0 and 1: chosen from source indices 0..=1.
            MASK as u32 & 0b1,
            ((MASK as u32 >> 1) & 0b1),
            // Elements 2 and 3: chosen from source indices 2..=3.
            ((MASK as u32 >> 2) & 0b1) + 2,
            ((MASK as u32 >> 3) & 0b1) + 2,
            // Elements 4 and 5: chosen from source indices 4..=5.
            ((MASK as u32 >> 4) & 0b1) + 4,
            ((MASK as u32 >> 5) & 0b1) + 4,
            // Elements 6 and 7: chosen from source indices 6..=7.
            ((MASK as u32 >> 6) & 0b1) + 6,
            ((MASK as u32 >> 7) & 0b1) + 6,
        ],
    )
}

/// Permutes the double-precision (64-bit) floating-point elements of `a` within each 128-bit lane according to the immediate `MASK`. Elements whose bit in writemask `k` is clear are copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permute_pd&expand=4159)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermilpd, MASK = 0b11_01_10_01))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_permute_pd<const MASK: i32>(
    src: __m512d,
    k: __mmask8,
    a: __m512d,
) -> __m512d {
    static_assert_imm8!(MASK);
    let fallback = src.as_f64x8();
    let permuted = _mm512_permute_pd::<MASK>(a).as_f64x8();
    // Take the permuted element where `k` is set, the `src` element otherwise.
    transmute(simd_select_bitmask(k, permuted, fallback))
}

/// Permutes the double-precision (64-bit) floating-point elements of `a` within each 128-bit lane according to the immediate `MASK`. Elements whose bit in zeromask `k` is clear are set to zero in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permute_pd&expand=4160)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermilpd, MASK = 0b11_01_10_01))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_permute_pd<const MASK: i32>(k: __mmask8, a: __m512d) -> __m512d {
    static_assert_imm8!(MASK);
    let permuted = _mm512_permute_pd::<MASK>(a).as_f64x8();
    // Elements not selected by `k` fall back to an all-zero vector.
    let blended = simd_select_bitmask(k, permuted, _mm512_setzero_pd().as_f64x8());
    transmute(blended)
}

/// Permutes the double-precision (64-bit) floating-point elements of `a` within each 128-bit lane according to the immediate `MASK` (checked as a 4-bit immediate). Elements whose bit in writemask `k` is clear are copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permute_pd&expand=4156)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermilpd, MASK = 0b11_01))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_mask_permute_pd<const MASK: i32>(
    src: __m256d,
    k: __mmask8,
    a: __m256d,
) -> __m256d {
    static_assert_imm4!(MASK);
    let fallback = src.as_f64x4();
    let permuted = _mm256_permute_pd::<MASK>(a).as_f64x4();
    // Take the permuted element where `k` is set, the `src` element otherwise.
    transmute(simd_select_bitmask(k, permuted, fallback))
}

/// Permutes the double-precision (64-bit) floating-point elements of `a` within each 128-bit lane according to the immediate `MASK` (checked as a 4-bit immediate). Elements whose bit in zeromask `k` is clear are set to zero in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permute_pd&expand=4157)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermilpd, MASK = 0b11_01))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_maskz_permute_pd<const MASK: i32>(k: __mmask8, a: __m256d) -> __m256d {
    static_assert_imm4!(MASK);
    let permuted = _mm256_permute_pd::<MASK>(a).as_f64x4();
    // Elements not selected by `k` fall back to an all-zero vector.
    let blended = simd_select_bitmask(k, permuted, _mm256_setzero_pd().as_f64x4());
    transmute(blended)
}

/// Permutes the two double-precision (64-bit) floating-point elements of `a` according to the immediate `IMM2` (checked as a 2-bit immediate). Elements whose bit in writemask `k` is clear are copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_permute_pd&expand=4153)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermilpd, IMM2 = 0b01))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_mask_permute_pd<const IMM2: i32>(
    src: __m128d,
    k: __mmask8,
    a: __m128d,
) -> __m128d {
    static_assert_imm2!(IMM2);
    let fallback = src.as_f64x2();
    let permuted = _mm_permute_pd::<IMM2>(a).as_f64x2();
    // Take the permuted element where `k` is set, the `src` element otherwise.
    transmute(simd_select_bitmask(k, permuted, fallback))
}

/// Permutes the two double-precision (64-bit) floating-point elements of `a` according to the immediate `IMM2` (checked as a 2-bit immediate). Elements whose bit in zeromask `k` is clear are set to zero in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_permute_pd&expand=4154)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermilpd, IMM2 = 0b01))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_maskz_permute_pd<const IMM2: i32>(k: __mmask8, a: __m128d) -> __m128d {
    static_assert_imm2!(IMM2);
    let permuted = _mm_permute_pd::<IMM2>(a).as_f64x2();
    // Elements not selected by `k` fall back to an all-zero vector.
    let blended = simd_select_bitmask(k, permuted, _mm_setzero_pd().as_f64x2());
    transmute(blended)
}

/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst.
///
/// `MASK` is read as four 2-bit fields (bits 1:0, 3:2, 5:4 and 7:6). Within each group of four
/// elements, output position `i` takes the source element chosen by field `i`; the same four
/// fields are applied to both groups.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex_epi64&expand=4208)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_permutex_epi64<const MASK: i32>(a: __m512i) -> __m512i {
    // Compile-time check of the immediate.
    static_assert_imm8!(MASK);
    // Both shuffle inputs are `a`; every index below stays in 0..8.
    simd_shuffle8!(
        a,
        a,
        <const MASK: i32> [
            // Elements 0..=3: source indices 0..=3.
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            ((MASK as u32 >> 4) & 0b11),
            ((MASK as u32 >> 6) & 0b11),
            // Elements 4..=7: same fields, offset into source indices 4..=7.
            (MASK as u32 & 0b11) + 4,
            ((MASK as u32 >> 2) & 0b11) + 4,
            ((MASK as u32 >> 4) & 0b11) + 4,
            ((MASK as u32 >> 6) & 0b11) + 4,
        ],
    )
}

/// Permutes the 64-bit integers of `a` within each 256-bit lane according to the immediate `MASK`. Elements whose bit in writemask `k` is clear are copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex_epi64&expand=4206)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_permutex_epi64<const MASK: i32>(
    src: __m512i,
    k: __mmask8,
    a: __m512i,
) -> __m512i {
    static_assert_imm8!(MASK);
    let fallback = src.as_i64x8();
    let permuted = _mm512_permutex_epi64::<MASK>(a).as_i64x8();
    // Take the permuted element where `k` is set, the `src` element otherwise.
    transmute(simd_select_bitmask(k, permuted, fallback))
}

/// Permutes the 64-bit integers of `a` within each 256-bit lane according to the immediate `MASK`. Elements whose bit in zeromask `k` is clear are set to zero in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex_epi64&expand=4207)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_permutex_epi64<const MASK: i32>(k: __mmask8, a: __m512i) -> __m512i {
    static_assert_imm8!(MASK);
    let permuted = _mm512_permutex_epi64::<MASK>(a).as_i64x8();
    // Elements not selected by `k` fall back to an all-zero vector.
    let blended = simd_select_bitmask(k, permuted, _mm512_setzero_si512().as_i64x8());
    transmute(blended)
}

/// Shuffle 64-bit integers in a within 256-bit lanes using the control in imm8, and store the results in dst.
///
/// `MASK` is read as four 2-bit fields (bits 1:0, 3:2, 5:4 and 7:6); output element `i` takes
/// the source element chosen by field `i`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex_epi64&expand=4205)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm256_permutex_epi64<const MASK: i32>(a: __m256i) -> __m256i {
    // Compile-time check of the immediate.
    static_assert_imm8!(MASK);
    // Both shuffle inputs are `a`; every index below stays in 0..4.
    simd_shuffle4!(
        a,
        a,
        <const MASK: i32> [
            // One 2-bit source index per output element.
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            ((MASK as u32 >> 4) & 0b11),
            ((MASK as u32 >> 6) & 0b11),
        ],
    )
}

/// Permutes the 64-bit integers of `a` within the 256-bit vector according to the immediate `MASK`. Elements whose bit in writemask `k` is clear are copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutex_epi64&expand=4203)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_mask_permutex_epi64<const MASK: i32>(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
) -> __m256i {
    static_assert_imm8!(MASK);
    let fallback = src.as_i64x4();
    let permuted = _mm256_permutex_epi64::<MASK>(a).as_i64x4();
    // Take the permuted element where `k` is set, the `src` element otherwise.
    transmute(simd_select_bitmask(k, permuted, fallback))
}

/// Permutes the 64-bit integers of `a` within the 256-bit vector according to the immediate `MASK`. Elements whose bit in zeromask `k` is clear are set to zero in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutex_epi64&expand=4204)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermq
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_maskz_permutex_epi64<const MASK: i32>(k: __mmask8, a: __m256i) -> __m256i {
    static_assert_imm8!(MASK);
    let permuted = _mm256_permutex_epi64::<MASK>(a).as_i64x4();
    // Elements not selected by `k` fall back to an all-zero vector.
    let blended = simd_select_bitmask(k, permuted, _mm256_setzero_si256().as_i64x4());
    transmute(blended)
}

/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst.
///
/// `MASK` is read as four 2-bit fields (bits 1:0, 3:2, 5:4 and 7:6). Within each group of four
/// elements, output position `i` takes the source element chosen by field `i`; the same four
/// fields are applied to both groups.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex_pd&expand=4214)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_permutex_pd<const MASK: i32>(a: __m512d) -> __m512d {
    // Compile-time check of the immediate.
    static_assert_imm8!(MASK);
    // Both shuffle inputs are `a`; every index below stays in 0..8.
    simd_shuffle8!(
        a,
        a,
        <const MASK: i32> [
            // Elements 0..=3: source indices 0..=3.
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            ((MASK as u32 >> 4) & 0b11),
            ((MASK as u32 >> 6) & 0b11),
            // Elements 4..=7: same fields, offset into source indices 4..=7.
            (MASK as u32 & 0b11) + 4,
            ((MASK as u32 >> 2) & 0b11) + 4,
            ((MASK as u32 >> 4) & 0b11) + 4,
            ((MASK as u32 >> 6) & 0b11) + 4,
        ],
    )
}

/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex_pd&expand=4212)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_permutex_pd<const MASK: i32>(
    src: __m512d,
    k: __mmask8,
    a: __m512d,
) -> __m512d {
    let r = _mm512_permutex_pd::<MASK>(a);
    transmute(simd_select_bitmask(k, r.as_f64x8(), src.as_f64x8()))
}

/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst using zeromask k (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex_pd&expand=4213)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_permutex_pd<const MASK: i32>(k: __mmask8, a: __m512d) -> __m512d {
    let r = _mm512_permutex_pd::<MASK>(a);
    let zero = _mm512_setzero_pd().as_f64x8();
    transmute(simd_select_bitmask(k, r.as_f64x8(), zero))
}

/// Shuffle double-precision (64-bit) floating-point elements in a within 256-bit lanes using the control in imm8, and store the results in dst.
///
/// `MASK` is read as four 2-bit fields (bits 1:0, 3:2, 5:4 and 7:6); output element `i` takes
/// the source element chosen by field `i`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex_pd&expand=4211)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] //should be vpermpd
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm256_permutex_pd<const MASK: i32>(a: __m256d) -> __m256d {
    // Compile-time check of the immediate.
    static_assert_imm8!(MASK);
    // Both shuffle inputs are `a`; every index below stays in 0..4.
    simd_shuffle4!(
        a,
        a,
        <const MASK: i32> [
            // One 2-bit source index per output element.
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            ((MASK as u32 >> 4) & 0b11),
            ((MASK as u32 >> 6) & 0b11),
        ],
    )
}

/// Permutes the double-precision (64-bit) floating-point elements of `a` within 256-bit lanes
/// using the immediate control `MASK` (imm8), then merges with `src` under writemask `k`:
/// an element is copied from `src` when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutex_pd&expand=4209)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] // should be vpermpd
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_mask_permutex_pd<const MASK: i32>(
    src: __m256d,
    k: __mmask8,
    a: __m256d,
) -> __m256d {
    static_assert_imm8!(MASK);
    let fallback = src.as_f64x4();
    let shuffled = _mm256_permutex_pd::<MASK>(a).as_f64x4();
    // Set mask bits keep the shuffled element, clear bits keep the one from `src`.
    transmute(simd_select_bitmask(k, shuffled, fallback))
}

/// Permutes the double-precision (64-bit) floating-point elements of `a` within 256-bit lanes
/// using the immediate control `MASK` (imm8), then applies zeromask `k`: an element is zeroed
/// out when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutex_pd&expand=4210)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm, MASK = 0b10_01_10_11))] // should be vpermpd
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_maskz_permutex_pd<const MASK: i32>(k: __mmask8, a: __m256d) -> __m256d {
    static_assert_imm8!(MASK);
    let zeroed = _mm256_setzero_pd().as_f64x4();
    let shuffled = _mm256_permutex_pd::<MASK>(a).as_f64x4();
    // Set mask bits keep the shuffled element, clear bits yield zero.
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Permutes the 32-bit integers of `a` across lanes, choosing each result element with the
/// corresponding index in `idx`, and returns the result.
///
/// Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the
/// `permutevar` name. It is identical to `_mm512_permutexvar_epi32`, and that name is the
/// recommended one to use.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutevar_epi32&expand=4182)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vperm))] // should be vpermd
pub unsafe fn _mm512_permutevar_epi32(idx: __m512i, a: __m512i) -> __m512i {
    // The `vpermd` binding is called with the data vector first and the index vector second.
    let data = a.as_i32x16();
    let indices = idx.as_i32x16();
    transmute(vpermd(data, indices))
}

/// Permutes the 32-bit integers of `a` across lanes using the corresponding index in `idx`,
/// then merges with `src` under writemask `k`: an element is copied from `src` when its mask
/// bit is not set.
///
/// Note that this intrinsic shuffles across 128-bit lanes, unlike past intrinsics that use the
/// `permutevar` name. It is identical to `_mm512_mask_permutexvar_epi32`, and that name is the
/// recommended one to use.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutevar_epi32&expand=4181)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermd))]
pub unsafe fn _mm512_mask_permutevar_epi32(
    src: __m512i,
    k: __mmask16,
    idx: __m512i,
    a: __m512i,
) -> __m512i {
    let fallback = src.as_i32x16();
    let shuffled = _mm512_permutevar_epi32(idx, a).as_i32x16();
    // Set mask bits keep the shuffled element, clear bits keep the one from `src`.
    transmute(simd_select_bitmask(k, shuffled, fallback))
}

/// Permutes the single-precision (32-bit) floating-point elements of `a` within 128-bit lanes
/// using the control in `b`, and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutevar_ps&expand=4200)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermilps))]
pub unsafe fn _mm512_permutevar_ps(a: __m512, b: __m512i) -> __m512 {
    let data = a.as_f32x16();
    let control = b.as_i32x16();
    transmute(vpermilps(data, control))
}

/// Permutes the single-precision (32-bit) floating-point elements of `a` within 128-bit lanes
/// using the control in `b`, then merges with `src` under writemask `k`: an element is copied
/// from `src` when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutevar_ps&expand=4198)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermilps))]
pub unsafe fn _mm512_mask_permutevar_ps(
    src: __m512,
    k: __mmask16,
    a: __m512,
    b: __m512i,
) -> __m512 {
    let fallback = src.as_f32x16();
    let shuffled = _mm512_permutevar_ps(a, b).as_f32x16();
    // Set mask bits keep the shuffled element, clear bits keep the one from `src`.
    transmute(simd_select_bitmask(k, shuffled, fallback))
}

/// Permutes the single-precision (32-bit) floating-point elements of `a` within 128-bit lanes
/// using the control in `b`, then applies zeromask `k`: an element is zeroed out when its mask
/// bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutevar_ps&expand=4199)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermilps))]
pub unsafe fn _mm512_maskz_permutevar_ps(k: __mmask16, a: __m512, b: __m512i) -> __m512 {
    let zeroed = _mm512_setzero_ps().as_f32x16();
    let shuffled = _mm512_permutevar_ps(a, b).as_f32x16();
    // Set mask bits keep the shuffled element, clear bits yield zero.
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Permutes the single-precision (32-bit) floating-point elements of `a` within 128-bit lanes
/// using the control in `b`, then merges with `src` under writemask `k`: an element is copied
/// from `src` when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutevar_ps&expand=4195)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermilps))]
pub unsafe fn _mm256_mask_permutevar_ps(src: __m256, k: __mmask8, a: __m256, b: __m256i) -> __m256 {
    let fallback = src.as_f32x8();
    let shuffled = _mm256_permutevar_ps(a, b).as_f32x8();
    // Set mask bits keep the shuffled element, clear bits keep the one from `src`.
    transmute(simd_select_bitmask(k, shuffled, fallback))
}

/// Permutes the single-precision (32-bit) floating-point elements of `a` within 128-bit lanes
/// using the control in `b`, then applies zeromask `k`: an element is zeroed out when its mask
/// bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutevar_ps&expand=4196)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermilps))]
pub unsafe fn _mm256_maskz_permutevar_ps(k: __mmask8, a: __m256, b: __m256i) -> __m256 {
    let zeroed = _mm256_setzero_ps().as_f32x8();
    let shuffled = _mm256_permutevar_ps(a, b).as_f32x8();
    // Set mask bits keep the shuffled element, clear bits yield zero.
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Permutes the single-precision (32-bit) floating-point elements of `a` within 128-bit lanes
/// using the control in `b`, then merges with `src` under writemask `k`: an element is copied
/// from `src` when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_permutevar_ps&expand=4192)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermilps))]
pub unsafe fn _mm_mask_permutevar_ps(src: __m128, k: __mmask8, a: __m128, b: __m128i) -> __m128 {
    let fallback = src.as_f32x4();
    let shuffled = _mm_permutevar_ps(a, b).as_f32x4();
    // Set mask bits keep the shuffled element, clear bits keep the one from `src`.
    transmute(simd_select_bitmask(k, shuffled, fallback))
}

/// Permutes the single-precision (32-bit) floating-point elements of `a` within 128-bit lanes
/// using the control in `b`, then applies zeromask `k`: an element is zeroed out when its mask
/// bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_permutevar_ps&expand=4193)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermilps))]
pub unsafe fn _mm_maskz_permutevar_ps(k: __mmask8, a: __m128, b: __m128i) -> __m128 {
    let zeroed = _mm_setzero_ps().as_f32x4();
    let shuffled = _mm_permutevar_ps(a, b).as_f32x4();
    // Set mask bits keep the shuffled element, clear bits yield zero.
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Permutes the double-precision (64-bit) floating-point elements of `a` within 128-bit lanes
/// using the control in `b`, and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutevar_pd&expand=4191)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermilpd))]
pub unsafe fn _mm512_permutevar_pd(a: __m512d, b: __m512i) -> __m512d {
    let data = a.as_f64x8();
    let control = b.as_i64x8();
    transmute(vpermilpd(data, control))
}

/// Permutes the double-precision (64-bit) floating-point elements of `a` within 128-bit lanes
/// using the control in `b`, then merges with `src` under writemask `k`: an element is copied
/// from `src` when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutevar_pd&expand=4189)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermilpd))]
pub unsafe fn _mm512_mask_permutevar_pd(
    src: __m512d,
    k: __mmask8,
    a: __m512d,
    b: __m512i,
) -> __m512d {
    let fallback = src.as_f64x8();
    let shuffled = _mm512_permutevar_pd(a, b).as_f64x8();
    // Set mask bits keep the shuffled element, clear bits keep the one from `src`.
    transmute(simd_select_bitmask(k, shuffled, fallback))
}

/// Permutes the double-precision (64-bit) floating-point elements of `a` within 128-bit lanes
/// using the control in `b`, then applies zeromask `k`: an element is zeroed out when its mask
/// bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutevar_pd&expand=4190)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermilpd))]
pub unsafe fn _mm512_maskz_permutevar_pd(k: __mmask8, a: __m512d, b: __m512i) -> __m512d {
    let zeroed = _mm512_setzero_pd().as_f64x8();
    let shuffled = _mm512_permutevar_pd(a, b).as_f64x8();
    // Set mask bits keep the shuffled element, clear bits yield zero.
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Permutes the double-precision (64-bit) floating-point elements of `a` within 128-bit lanes
/// using the control in `b`, then merges with `src` under writemask `k`: an element is copied
/// from `src` when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutevar_pd&expand=4186)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermilpd))]
pub unsafe fn _mm256_mask_permutevar_pd(
    src: __m256d,
    k: __mmask8,
    a: __m256d,
    b: __m256i,
) -> __m256d {
    let fallback = src.as_f64x4();
    let shuffled = _mm256_permutevar_pd(a, b).as_f64x4();
    // Set mask bits keep the shuffled element, clear bits keep the one from `src`.
    transmute(simd_select_bitmask(k, shuffled, fallback))
}

/// Permutes the double-precision (64-bit) floating-point elements of `a` within 128-bit lanes
/// using the control in `b`, then applies zeromask `k`: an element is zeroed out when its mask
/// bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutevar_pd&expand=4187)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermilpd))]
pub unsafe fn _mm256_maskz_permutevar_pd(k: __mmask8, a: __m256d, b: __m256i) -> __m256d {
    let zeroed = _mm256_setzero_pd().as_f64x4();
    let shuffled = _mm256_permutevar_pd(a, b).as_f64x4();
    // Set mask bits keep the shuffled element, clear bits yield zero.
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Permutes the double-precision (64-bit) floating-point elements of `a` within 128-bit lanes
/// using the control in `b`, then merges with `src` under writemask `k`: an element is copied
/// from `src` when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_permutevar_pd&expand=4183)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermilpd))]
pub unsafe fn _mm_mask_permutevar_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128i) -> __m128d {
    let fallback = src.as_f64x2();
    let shuffled = _mm_permutevar_pd(a, b).as_f64x2();
    // Set mask bits keep the shuffled element, clear bits keep the one from `src`.
    transmute(simd_select_bitmask(k, shuffled, fallback))
}

/// Permutes the double-precision (64-bit) floating-point elements of `a` within 128-bit lanes
/// using the control in `b`, then applies zeromask `k`: an element is zeroed out when its mask
/// bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_permutevar_pd&expand=4184)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermilpd))]
pub unsafe fn _mm_maskz_permutevar_pd(k: __mmask8, a: __m128d, b: __m128i) -> __m128d {
    let zeroed = _mm_setzero_pd().as_f64x2();
    let shuffled = _mm_permutevar_pd(a, b).as_f64x2();
    // Set mask bits keep the shuffled element, clear bits yield zero.
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Permutes the 32-bit integers of `a` across lanes, choosing each result element with the
/// corresponding index in `idx`, and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_epi32&expand=4301)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vperm))] // should be vpermd
pub unsafe fn _mm512_permutexvar_epi32(idx: __m512i, a: __m512i) -> __m512i {
    // The `vpermd` binding is called with the data vector first and the index vector second.
    let data = a.as_i32x16();
    let indices = idx.as_i32x16();
    transmute(vpermd(data, indices))
}

/// Permutes the 32-bit integers of `a` across lanes using the corresponding index in `idx`,
/// then merges with `src` under writemask `k`: an element is copied from `src` when its mask
/// bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutexvar_epi32&expand=4299)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermd))]
pub unsafe fn _mm512_mask_permutexvar_epi32(
    src: __m512i,
    k: __mmask16,
    idx: __m512i,
    a: __m512i,
) -> __m512i {
    let fallback = src.as_i32x16();
    let shuffled = _mm512_permutexvar_epi32(idx, a).as_i32x16();
    // Set mask bits keep the shuffled element, clear bits keep the one from `src`.
    transmute(simd_select_bitmask(k, shuffled, fallback))
}

/// Permutes the 32-bit integers of `a` across lanes using the corresponding index in `idx`,
/// then applies zeromask `k`: an element is zeroed out when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutexvar_epi32&expand=4300)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermd))]
pub unsafe fn _mm512_maskz_permutexvar_epi32(k: __mmask16, idx: __m512i, a: __m512i) -> __m512i {
    let zeroed = _mm512_setzero_si512().as_i32x16();
    let shuffled = _mm512_permutexvar_epi32(idx, a).as_i32x16();
    // Set mask bits keep the shuffled element, clear bits yield zero.
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Permutes the 32-bit integers of `a` across lanes, choosing each result element with the
/// corresponding index in `idx`, and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutexvar_epi32&expand=4298)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm))] // should be vpermd
pub unsafe fn _mm256_permutexvar_epi32(idx: __m256i, a: __m256i) -> __m256i {
    // Delegates to the AVX2 permute (llvm uses llvm.x86.avx2.permd); note the swapped
    // argument order: data first, indices second.
    let permuted = _mm256_permutevar8x32_epi32(a, idx);
    transmute(permuted)
}

/// Permutes the 32-bit integers of `a` across lanes using the corresponding index in `idx`,
/// then merges with `src` under writemask `k`: an element is copied from `src` when its mask
/// bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutexvar_epi32&expand=4296)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermd))]
pub unsafe fn _mm256_mask_permutexvar_epi32(
    src: __m256i,
    k: __mmask8,
    idx: __m256i,
    a: __m256i,
) -> __m256i {
    let fallback = src.as_i32x8();
    let shuffled = _mm256_permutexvar_epi32(idx, a).as_i32x8();
    // Set mask bits keep the shuffled element, clear bits keep the one from `src`.
    transmute(simd_select_bitmask(k, shuffled, fallback))
}

/// Permutes the 32-bit integers of `a` across lanes using the corresponding index in `idx`,
/// then applies zeromask `k`: an element is zeroed out when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutexvar_epi32&expand=4297)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermd))]
pub unsafe fn _mm256_maskz_permutexvar_epi32(k: __mmask8, idx: __m256i, a: __m256i) -> __m256i {
    let zeroed = _mm256_setzero_si256().as_i32x8();
    let shuffled = _mm256_permutexvar_epi32(idx, a).as_i32x8();
    // Set mask bits keep the shuffled element, clear bits yield zero.
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Permutes the 64-bit integers of `a` across lanes, choosing each result element with the
/// corresponding index in `idx`, and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_epi64&expand=4307)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vperm))] // should be vpermq
pub unsafe fn _mm512_permutexvar_epi64(idx: __m512i, a: __m512i) -> __m512i {
    // The `vpermq` binding is called with the data vector first and the index vector second.
    let data = a.as_i64x8();
    let indices = idx.as_i64x8();
    transmute(vpermq(data, indices))
}

/// Permutes the 64-bit integers of `a` across lanes using the corresponding index in `idx`,
/// then merges with `src` under writemask `k`: an element is copied from `src` when its mask
/// bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutexvar_epi64&expand=4305)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermq))]
pub unsafe fn _mm512_mask_permutexvar_epi64(
    src: __m512i,
    k: __mmask8,
    idx: __m512i,
    a: __m512i,
) -> __m512i {
    let fallback = src.as_i64x8();
    let shuffled = _mm512_permutexvar_epi64(idx, a).as_i64x8();
    // Set mask bits keep the shuffled element, clear bits keep the one from `src`.
    transmute(simd_select_bitmask(k, shuffled, fallback))
}

/// Permutes the 64-bit integers of `a` across lanes using the corresponding index in `idx`,
/// then applies zeromask `k`: an element is zeroed out when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutexvar_epi64&expand=4306)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermq))]
pub unsafe fn _mm512_maskz_permutexvar_epi64(k: __mmask8, idx: __m512i, a: __m512i) -> __m512i {
    let zeroed = _mm512_setzero_si512().as_i64x8();
    let shuffled = _mm512_permutexvar_epi64(idx, a).as_i64x8();
    // Set mask bits keep the shuffled element, clear bits yield zero.
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Permutes the 64-bit integers of `a` across lanes, choosing each result element with the
/// corresponding index in `idx`, and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutexvar_epi64&expand=4304)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm))] // should be vpermq
pub unsafe fn _mm256_permutexvar_epi64(idx: __m256i, a: __m256i) -> __m256i {
    // The `vpermq256` binding is called with the data vector first and the index vector second.
    let data = a.as_i64x4();
    let indices = idx.as_i64x4();
    transmute(vpermq256(data, indices))
}

/// Permutes the 64-bit integers of `a` across lanes using the corresponding index in `idx`,
/// then merges with `src` under writemask `k`: an element is copied from `src` when its mask
/// bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutexvar_epi64&expand=4302)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermq))]
pub unsafe fn _mm256_mask_permutexvar_epi64(
    src: __m256i,
    k: __mmask8,
    idx: __m256i,
    a: __m256i,
) -> __m256i {
    let fallback = src.as_i64x4();
    let shuffled = _mm256_permutexvar_epi64(idx, a).as_i64x4();
    // Set mask bits keep the shuffled element, clear bits keep the one from `src`.
    transmute(simd_select_bitmask(k, shuffled, fallback))
}

/// Permutes the 64-bit integers of `a` across lanes using the corresponding index in `idx`,
/// then applies zeromask `k`: an element is zeroed out when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutexvar_epi64&expand=4303)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermq))]
pub unsafe fn _mm256_maskz_permutexvar_epi64(k: __mmask8, idx: __m256i, a: __m256i) -> __m256i {
    let zeroed = _mm256_setzero_si256().as_i64x4();
    let shuffled = _mm256_permutexvar_epi64(idx, a).as_i64x4();
    // Set mask bits keep the shuffled element, clear bits yield zero.
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Permutes the single-precision (32-bit) floating-point elements of `a` across lanes,
/// choosing each result element with the corresponding index in `idx`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_ps&expand=4328)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermps))]
pub unsafe fn _mm512_permutexvar_ps(idx: __m512i, a: __m512) -> __m512 {
    // The `vpermps` binding is called with the data vector first and the index vector second.
    let data = a.as_f32x16();
    let indices = idx.as_i32x16();
    transmute(vpermps(data, indices))
}

/// Permutes the single-precision (32-bit) floating-point elements of `a` across lanes using
/// the corresponding index in `idx`, then merges with `src` under writemask `k`: an element is
/// copied from `src` when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutexvar_ps&expand=4326)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermps))]
pub unsafe fn _mm512_mask_permutexvar_ps(
    src: __m512,
    k: __mmask16,
    idx: __m512i,
    a: __m512,
) -> __m512 {
    let fallback = src.as_f32x16();
    let shuffled = _mm512_permutexvar_ps(idx, a).as_f32x16();
    // Set mask bits keep the shuffled element, clear bits keep the one from `src`.
    transmute(simd_select_bitmask(k, shuffled, fallback))
}

/// Permutes the single-precision (32-bit) floating-point elements of `a` across lanes using
/// the corresponding index in `idx`, then applies zeromask `k`: an element is zeroed out when
/// its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutexvar_ps&expand=4327)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermps))]
pub unsafe fn _mm512_maskz_permutexvar_ps(k: __mmask16, idx: __m512i, a: __m512) -> __m512 {
    let zeroed = _mm512_setzero_ps().as_f32x16();
    let shuffled = _mm512_permutexvar_ps(idx, a).as_f32x16();
    // Set mask bits keep the shuffled element, clear bits yield zero.
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Permutes the single-precision (32-bit) floating-point elements of `a` across lanes,
/// choosing each result element with the corresponding index in `idx`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutexvar_ps&expand=4325)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermps))]
pub unsafe fn _mm256_permutexvar_ps(idx: __m256i, a: __m256) -> __m256 {
    // Delegates to the AVX2 permute (llvm.x86.avx2.permps); note the swapped argument
    // order: data first, indices second.
    let permuted = _mm256_permutevar8x32_ps(a, idx);
    transmute(permuted)
}

/// Permutes the single-precision (32-bit) floating-point elements of `a` across lanes using
/// the corresponding index in `idx`, then merges with `src` under writemask `k`: an element is
/// copied from `src` when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutexvar_ps&expand=4323)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermps))]
pub unsafe fn _mm256_mask_permutexvar_ps(
    src: __m256,
    k: __mmask8,
    idx: __m256i,
    a: __m256,
) -> __m256 {
    let fallback = src.as_f32x8();
    let shuffled = _mm256_permutexvar_ps(idx, a).as_f32x8();
    // Set mask bits keep the shuffled element, clear bits keep the one from `src`.
    transmute(simd_select_bitmask(k, shuffled, fallback))
}

/// Permutes the single-precision (32-bit) floating-point elements of `a` across lanes using
/// the corresponding index in `idx`, then applies zeromask `k`: an element is zeroed out when
/// its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutexvar_ps&expand=4324)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermps))]
pub unsafe fn _mm256_maskz_permutexvar_ps(k: __mmask8, idx: __m256i, a: __m256) -> __m256 {
    let zeroed = _mm256_setzero_ps().as_f32x8();
    let shuffled = _mm256_permutexvar_ps(idx, a).as_f32x8();
    // Set mask bits keep the shuffled element, clear bits yield zero.
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Permutes the double-precision (64-bit) floating-point elements of `a` across lanes,
/// choosing each result element with the corresponding index in `idx`, and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutexvar_pd&expand=4322)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermpd))]
pub unsafe fn _mm512_permutexvar_pd(idx: __m512i, a: __m512d) -> __m512d {
    // The `vpermpd` binding is called with the data vector first and the index vector second.
    let data = a.as_f64x8();
    let indices = idx.as_i64x8();
    transmute(vpermpd(data, indices))
}

/// Permutes the double-precision (64-bit) floating-point elements of `a` across lanes using
/// the corresponding index in `idx`, then merges with `src` under writemask `k`: an element is
/// copied from `src` when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutexvar_pd&expand=4320)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermpd))]
pub unsafe fn _mm512_mask_permutexvar_pd(
    src: __m512d,
    k: __mmask8,
    idx: __m512i,
    a: __m512d,
) -> __m512d {
    let fallback = src.as_f64x8();
    let shuffled = _mm512_permutexvar_pd(idx, a).as_f64x8();
    // Set mask bits keep the shuffled element, clear bits keep the one from `src`.
    transmute(simd_select_bitmask(k, shuffled, fallback))
}

/// Permutes the double-precision (64-bit) floating-point elements of `a` across lanes using
/// the corresponding index in `idx`, then applies zeromask `k`: an element is zeroed out when
/// its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutexvar_pd&expand=4321)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermpd))]
pub unsafe fn _mm512_maskz_permutexvar_pd(k: __mmask8, idx: __m512i, a: __m512d) -> __m512d {
    let zeroed = _mm512_setzero_pd().as_f64x8();
    let shuffled = _mm512_permutexvar_pd(idx, a).as_f64x8();
    // Set mask bits keep the shuffled element, clear bits yield zero.
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Permutes the double-precision (64-bit) floating-point elements of `a` across lanes,
/// choosing each result element with the corresponding index in `idx`, and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutexvar_pd&expand=4319)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermpd))]
pub unsafe fn _mm256_permutexvar_pd(idx: __m256i, a: __m256d) -> __m256d {
    // The `vpermpd256` binding is called with the data vector first and the index vector second.
    let data = a.as_f64x4();
    let indices = idx.as_i64x4();
    transmute(vpermpd256(data, indices))
}

/// Permutes the double-precision (64-bit) floating-point elements of `a` across lanes using
/// the corresponding index in `idx`, then merges with `src` under writemask `k`: an element is
/// copied from `src` when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutexvar_pd&expand=4317)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermpd))]
pub unsafe fn _mm256_mask_permutexvar_pd(
    src: __m256d,
    k: __mmask8,
    idx: __m256i,
    a: __m256d,
) -> __m256d {
    let fallback = src.as_f64x4();
    let shuffled = _mm256_permutexvar_pd(idx, a).as_f64x4();
    // Set mask bits keep the shuffled element, clear bits keep the one from `src`.
    transmute(simd_select_bitmask(k, shuffled, fallback))
}

/// Permutes the double-precision (64-bit) floating-point elements of `a` across lanes using
/// the corresponding index in `idx`, then applies zeromask `k`: an element is zeroed out when
/// its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutexvar_pd&expand=4318)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermpd))]
pub unsafe fn _mm256_maskz_permutexvar_pd(k: __mmask8, idx: __m256i, a: __m256d) -> __m256d {
    let zeroed = _mm256_setzero_pd().as_f64x4();
    let shuffled = _mm256_permutexvar_pd(idx, a).as_f64x4();
    // Set mask bits keep the shuffled element, clear bits yield zero.
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Permutes 32-bit integers drawn from both `a` and `b` across lanes, using the corresponding
/// selector and index in `idx`, and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_epi32&expand=4238)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vperm))] // vpermi2d or vpermt2d
pub unsafe fn _mm512_permutex2var_epi32(a: __m512i, idx: __m512i, b: __m512i) -> __m512i {
    let first = a.as_i32x16();
    let selector = idx.as_i32x16();
    let second = b.as_i32x16();
    transmute(vpermi2d(first, selector, second))
}

/// Permutes 32-bit integers drawn from both `a` and `b` across lanes, using the corresponding
/// selector and index in `idx`, then merges under writemask `k`: an element is copied from `a`
/// when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex2var_epi32&expand=4235)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermt2d))]
pub unsafe fn _mm512_mask_permutex2var_epi32(
    a: __m512i,
    k: __mmask16,
    idx: __m512i,
    b: __m512i,
) -> __m512i {
    // For this variant the pass-through operand is `a` itself.
    let fallback = a.as_i32x16();
    let shuffled = _mm512_permutex2var_epi32(a, idx, b).as_i32x16();
    transmute(simd_select_bitmask(k, shuffled, fallback))
}

/// Permutes 32-bit integers drawn from both `a` and `b` across lanes, using the corresponding
/// selector and index in `idx`, then applies zeromask `k`: an element is zeroed out when its
/// mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex2var_epi32&expand=4237)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vperm))] // vpermi2d or vpermt2d
pub unsafe fn _mm512_maskz_permutex2var_epi32(
    k: __mmask16,
    a: __m512i,
    idx: __m512i,
    b: __m512i,
) -> __m512i {
    let zeroed = _mm512_setzero_si512().as_i32x16();
    let shuffled = _mm512_permutex2var_epi32(a, idx, b).as_i32x16();
    // Set mask bits keep the shuffled element, clear bits yield zero.
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Permutes 32-bit integers drawn from both `a` and `b` across lanes, using the corresponding
/// selector and index in `idx`, then merges under writemask `k`: an element is copied from
/// `idx` when its mask bit is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_epi32&expand=4236)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermi2d))]
pub unsafe fn _mm512_mask2_permutex2var_epi32(
    a: __m512i,
    idx: __m512i,
    k: __mmask16,
    b: __m512i,
) -> __m512i {
    // For the `mask2` variant the pass-through operand is the index vector, not `a`.
    let fallback = idx.as_i32x16();
    let shuffled = _mm512_permutex2var_epi32(a, idx, b).as_i32x16();
    transmute(simd_select_bitmask(k, shuffled, fallback))
}

/// Permutes 32-bit integers drawn from both `a` and `b` across lanes, using the corresponding
/// selector and index in `idx`, and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex2var_epi32&expand=4234)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm))] // vpermi2d or vpermt2d
pub unsafe fn _mm256_permutex2var_epi32(a: __m256i, idx: __m256i, b: __m256i) -> __m256i {
    let first = a.as_i32x8();
    let selector = idx.as_i32x8();
    let second = b.as_i32x8();
    transmute(vpermi2d256(first, selector, second))
}

/// Shuffles 32-bit integers from `a` and `b` across lanes using the corresponding selector and
/// index in `idx`, then applies writemask `k`: positions whose mask bit is not set keep the
/// element of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutex2var_epi32&expand=4231)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermt2d))]
pub unsafe fn _mm256_mask_permutex2var_epi32(
    a: __m256i,
    k: __mmask8,
    idx: __m256i,
    b: __m256i,
) -> __m256i {
    let shuffled = _mm256_permutex2var_epi32(a, idx, b).as_i32x8();
    // Unselected positions fall back to the first source vector.
    let fallback = a.as_i32x8();
    let blended = simd_select_bitmask(k, shuffled, fallback);
    transmute(blended)
}

/// Shuffles 32-bit integers from `a` and `b` across lanes using the corresponding selector and
/// index in `idx`, then applies zeromask `k`: positions whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutex2var_epi32&expand=4233)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm))] // vpermi2d or vpermt2d
pub unsafe fn _mm256_maskz_permutex2var_epi32(
    k: __mmask8,
    a: __m256i,
    idx: __m256i,
    b: __m256i,
) -> __m256i {
    // Unmasked two-source permute; the zeromask is applied afterwards.
    let shuffled = _mm256_permutex2var_epi32(a, idx, b).as_i32x8();
    let zeros = _mm256_setzero_si256().as_i32x8();
    let blended = simd_select_bitmask(k, shuffled, zeros);
    transmute(blended)
}

/// Shuffles 32-bit integers from `a` and `b` across lanes using the corresponding selector and
/// index in `idx`, then applies writemask `k`: positions whose mask bit is not set keep the
/// element of `idx`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask2_permutex2var_epi32&expand=4232)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermi2d))]
pub unsafe fn _mm256_mask2_permutex2var_epi32(
    a: __m256i,
    idx: __m256i,
    k: __mmask8,
    b: __m256i,
) -> __m256i {
    let shuffled = _mm256_permutex2var_epi32(a, idx, b).as_i32x8();
    // Unselected positions fall back to the index vector itself.
    let fallback = idx.as_i32x8();
    let blended = simd_select_bitmask(k, shuffled, fallback);
    transmute(blended)
}

/// Shuffles 32-bit integers from `a` and `b` across lanes using the corresponding selector and
/// index in `idx`, and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutex2var_epi32&expand=4230)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm))] // vpermi2d or vpermt2d
pub unsafe fn _mm_permutex2var_epi32(a: __m128i, idx: __m128i, b: __m128i) -> __m128i {
    let (lanes_a, lanes_idx, lanes_b) = (a.as_i32x4(), idx.as_i32x4(), b.as_i32x4());
    transmute(vpermi2d128(lanes_a, lanes_idx, lanes_b))
}

/// Shuffles 32-bit integers from `a` and `b` across lanes using the corresponding selector and
/// index in `idx`, then applies writemask `k`: positions whose mask bit is not set keep the
/// element of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_permutex2var_epi32&expand=4227)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermt2d))]
pub unsafe fn _mm_mask_permutex2var_epi32(
    a: __m128i,
    k: __mmask8,
    idx: __m128i,
    b: __m128i,
) -> __m128i {
    let shuffled = _mm_permutex2var_epi32(a, idx, b).as_i32x4();
    // Unselected positions fall back to the first source vector.
    let fallback = a.as_i32x4();
    let blended = simd_select_bitmask(k, shuffled, fallback);
    transmute(blended)
}

/// Shuffles 32-bit integers from `a` and `b` across lanes using the corresponding selector and
/// index in `idx`, then applies zeromask `k`: positions whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_permutex2var_epi32&expand=4229)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm))] // vpermi2d or vpermt2d
pub unsafe fn _mm_maskz_permutex2var_epi32(
    k: __mmask8,
    a: __m128i,
    idx: __m128i,
    b: __m128i,
) -> __m128i {
    // Unmasked two-source permute; the zeromask is applied afterwards.
    let shuffled = _mm_permutex2var_epi32(a, idx, b).as_i32x4();
    let zeros = _mm_setzero_si128().as_i32x4();
    let blended = simd_select_bitmask(k, shuffled, zeros);
    transmute(blended)
}

/// Shuffles 32-bit integers from `a` and `b` across lanes using the corresponding selector and
/// index in `idx`, then applies writemask `k`: positions whose mask bit is not set keep the
/// element of `idx`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask2_permutex2var_epi32&expand=4228)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermi2d))]
pub unsafe fn _mm_mask2_permutex2var_epi32(
    a: __m128i,
    idx: __m128i,
    k: __mmask8,
    b: __m128i,
) -> __m128i {
    let shuffled = _mm_permutex2var_epi32(a, idx, b).as_i32x4();
    // Unselected positions fall back to the index vector itself.
    let fallback = idx.as_i32x4();
    let blended = simd_select_bitmask(k, shuffled, fallback);
    transmute(blended)
}

/// Shuffles 64-bit integers from `a` and `b` across lanes using the corresponding selector and
/// index in `idx`, and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_epi64&expand=4250)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vperm))] // vpermi2q or vpermt2q
pub unsafe fn _mm512_permutex2var_epi64(a: __m512i, idx: __m512i, b: __m512i) -> __m512i {
    let (lanes_a, lanes_idx, lanes_b) = (a.as_i64x8(), idx.as_i64x8(), b.as_i64x8());
    transmute(vpermi2q(lanes_a, lanes_idx, lanes_b))
}

/// Shuffles 64-bit integers from `a` and `b` across lanes using the corresponding selector and
/// index in `idx`, then applies writemask `k`: positions whose mask bit is not set keep the
/// element of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex2var_epi64&expand=4247)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermt2q))]
pub unsafe fn _mm512_mask_permutex2var_epi64(
    a: __m512i,
    k: __mmask8,
    idx: __m512i,
    b: __m512i,
) -> __m512i {
    let shuffled = _mm512_permutex2var_epi64(a, idx, b).as_i64x8();
    // Unselected positions fall back to the first source vector.
    let fallback = a.as_i64x8();
    let blended = simd_select_bitmask(k, shuffled, fallback);
    transmute(blended)
}

/// Shuffles 64-bit integers from `a` and `b` across lanes using the corresponding selector and
/// index in `idx`, then applies zeromask `k`: positions whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex2var_epi64&expand=4249)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vperm))] // vpermi2q or vpermt2q
pub unsafe fn _mm512_maskz_permutex2var_epi64(
    k: __mmask8,
    a: __m512i,
    idx: __m512i,
    b: __m512i,
) -> __m512i {
    // Unmasked two-source permute; the zeromask is applied afterwards.
    let shuffled = _mm512_permutex2var_epi64(a, idx, b).as_i64x8();
    let zeros = _mm512_setzero_si512().as_i64x8();
    let blended = simd_select_bitmask(k, shuffled, zeros);
    transmute(blended)
}

/// Shuffles 64-bit integers from `a` and `b` across lanes using the corresponding selector and
/// index in `idx`, then applies writemask `k`: positions whose mask bit is not set keep the
/// element of `idx`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_epi64&expand=4248)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermi2q))]
pub unsafe fn _mm512_mask2_permutex2var_epi64(
    a: __m512i,
    idx: __m512i,
    k: __mmask8,
    b: __m512i,
) -> __m512i {
    let shuffled = _mm512_permutex2var_epi64(a, idx, b).as_i64x8();
    // Unselected positions fall back to the index vector itself.
    let fallback = idx.as_i64x8();
    let blended = simd_select_bitmask(k, shuffled, fallback);
    transmute(blended)
}

/// Shuffles 64-bit integers from `a` and `b` across lanes using the corresponding selector and
/// index in `idx`, and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex2var_epi64&expand=4246)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm))] // vpermi2q or vpermt2q
pub unsafe fn _mm256_permutex2var_epi64(a: __m256i, idx: __m256i, b: __m256i) -> __m256i {
    let (lanes_a, lanes_idx, lanes_b) = (a.as_i64x4(), idx.as_i64x4(), b.as_i64x4());
    transmute(vpermi2q256(lanes_a, lanes_idx, lanes_b))
}

/// Shuffles 64-bit integers from `a` and `b` across lanes using the corresponding selector and
/// index in `idx`, then applies writemask `k`: positions whose mask bit is not set keep the
/// element of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutex2var_epi64&expand=4243)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermt2q))]
pub unsafe fn _mm256_mask_permutex2var_epi64(
    a: __m256i,
    k: __mmask8,
    idx: __m256i,
    b: __m256i,
) -> __m256i {
    let shuffled = _mm256_permutex2var_epi64(a, idx, b).as_i64x4();
    // Unselected positions fall back to the first source vector.
    let fallback = a.as_i64x4();
    let blended = simd_select_bitmask(k, shuffled, fallback);
    transmute(blended)
}

/// Shuffles 64-bit integers from `a` and `b` across lanes using the corresponding selector and
/// index in `idx`, then applies zeromask `k`: positions whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutex2var_epi64&expand=4245)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm))] // vpermi2q or vpermt2q
pub unsafe fn _mm256_maskz_permutex2var_epi64(
    k: __mmask8,
    a: __m256i,
    idx: __m256i,
    b: __m256i,
) -> __m256i {
    // Unmasked two-source permute; the zeromask is applied afterwards.
    let shuffled = _mm256_permutex2var_epi64(a, idx, b).as_i64x4();
    let zeros = _mm256_setzero_si256().as_i64x4();
    let blended = simd_select_bitmask(k, shuffled, zeros);
    transmute(blended)
}

/// Shuffles 64-bit integers from `a` and `b` across lanes using the corresponding selector and
/// index in `idx`, then applies writemask `k`: positions whose mask bit is not set keep the
/// element of `idx`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask2_permutex2var_epi64&expand=4244)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermi2q))]
pub unsafe fn _mm256_mask2_permutex2var_epi64(
    a: __m256i,
    idx: __m256i,
    k: __mmask8,
    b: __m256i,
) -> __m256i {
    let shuffled = _mm256_permutex2var_epi64(a, idx, b).as_i64x4();
    // Unselected positions fall back to the index vector itself.
    let fallback = idx.as_i64x4();
    let blended = simd_select_bitmask(k, shuffled, fallback);
    transmute(blended)
}

/// Shuffles 64-bit integers from `a` and `b` across lanes using the corresponding selector and
/// index in `idx`, and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutex2var_epi64&expand=4242)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm))] // vpermi2q or vpermt2q
pub unsafe fn _mm_permutex2var_epi64(a: __m128i, idx: __m128i, b: __m128i) -> __m128i {
    let (lanes_a, lanes_idx, lanes_b) = (a.as_i64x2(), idx.as_i64x2(), b.as_i64x2());
    transmute(vpermi2q128(lanes_a, lanes_idx, lanes_b))
}

/// Shuffles 64-bit integers from `a` and `b` across lanes using the corresponding selector and
/// index in `idx`, then applies writemask `k`: positions whose mask bit is not set keep the
/// element of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_permutex2var_epi64&expand=4239)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermt2q))]
pub unsafe fn _mm_mask_permutex2var_epi64(
    a: __m128i,
    k: __mmask8,
    idx: __m128i,
    b: __m128i,
) -> __m128i {
    let shuffled = _mm_permutex2var_epi64(a, idx, b).as_i64x2();
    // Unselected positions fall back to the first source vector.
    let fallback = a.as_i64x2();
    let blended = simd_select_bitmask(k, shuffled, fallback);
    transmute(blended)
}

/// Shuffles 64-bit integers from `a` and `b` across lanes using the corresponding selector and
/// index in `idx`, then applies zeromask `k`: positions whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_permutex2var_epi64&expand=4241)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm))] // vpermi2q or vpermt2q
pub unsafe fn _mm_maskz_permutex2var_epi64(
    k: __mmask8,
    a: __m128i,
    idx: __m128i,
    b: __m128i,
) -> __m128i {
    // Unmasked two-source permute; the zeromask is applied afterwards.
    let shuffled = _mm_permutex2var_epi64(a, idx, b).as_i64x2();
    let zeros = _mm_setzero_si128().as_i64x2();
    let blended = simd_select_bitmask(k, shuffled, zeros);
    transmute(blended)
}

/// Shuffles 64-bit integers from `a` and `b` across lanes using the corresponding selector and
/// index in `idx`, then applies writemask `k`: positions whose mask bit is not set keep the
/// element of `idx`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask2_permutex2var_epi64&expand=4240)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermi2q))]
pub unsafe fn _mm_mask2_permutex2var_epi64(
    a: __m128i,
    idx: __m128i,
    k: __mmask8,
    b: __m128i,
) -> __m128i {
    let shuffled = _mm_permutex2var_epi64(a, idx, b).as_i64x2();
    // Unselected positions fall back to the index vector itself.
    let fallback = idx.as_i64x2();
    let blended = simd_select_bitmask(k, shuffled, fallback);
    transmute(blended)
}

/// Shuffles single-precision (32-bit) floating-point elements from `a` and `b` across lanes using
/// the corresponding selector and index in `idx`, and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_ps&expand=4286)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vperm))] // vpermi2ps or vpermt2ps
pub unsafe fn _mm512_permutex2var_ps(a: __m512, idx: __m512i, b: __m512) -> __m512 {
    // The data operands are float lanes; the index operand stays an integer vector.
    let (lanes_a, lanes_idx, lanes_b) = (a.as_f32x16(), idx.as_i32x16(), b.as_f32x16());
    transmute(vpermi2ps(lanes_a, lanes_idx, lanes_b))
}

/// Shuffles single-precision (32-bit) floating-point elements from `a` and `b` across lanes using
/// the corresponding selector and index in `idx`, then applies writemask `k`: positions whose
/// mask bit is not set keep the element of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex2var_ps&expand=4283)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermt2ps))]
pub unsafe fn _mm512_mask_permutex2var_ps(
    a: __m512,
    k: __mmask16,
    idx: __m512i,
    b: __m512,
) -> __m512 {
    let shuffled = _mm512_permutex2var_ps(a, idx, b).as_f32x16();
    // Unselected positions fall back to the first source vector.
    let fallback = a.as_f32x16();
    let blended = simd_select_bitmask(k, shuffled, fallback);
    transmute(blended)
}

/// Shuffles single-precision (32-bit) floating-point elements from `a` and `b` across lanes using
/// the corresponding selector and index in `idx`, then applies zeromask `k`: positions whose
/// mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex2var_ps&expand=4285)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vperm))] // vpermi2ps or vpermt2ps
pub unsafe fn _mm512_maskz_permutex2var_ps(
    k: __mmask16,
    a: __m512,
    idx: __m512i,
    b: __m512,
) -> __m512 {
    // Unmasked two-source permute; the zeromask is applied afterwards.
    let shuffled = _mm512_permutex2var_ps(a, idx, b).as_f32x16();
    let zeros = _mm512_setzero_ps().as_f32x16();
    let blended = simd_select_bitmask(k, shuffled, zeros);
    transmute(blended)
}

/// Shuffles single-precision (32-bit) floating-point elements from `a` and `b` across lanes using
/// the corresponding selector and index in `idx`, then applies writemask `k`: positions whose
/// mask bit is not set keep the element of `idx`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_ps&expand=4284)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vperm))] // expected vpermi2ps, but vpermt2ps shows up
pub unsafe fn _mm512_mask2_permutex2var_ps(
    a: __m512,
    idx: __m512i,
    k: __mmask16,
    b: __m512,
) -> __m512 {
    let shuffled = _mm512_permutex2var_ps(a, idx, b).as_f32x16();
    // The fallback operand is `idx` passed through the integer-to-float cast intrinsic so that
    // it has the same lane type as the shuffle result.
    let fallback = _mm512_castsi512_ps(idx).as_f32x16();
    let blended = simd_select_bitmask(k, shuffled, fallback);
    transmute(blended)
}

/// Shuffles single-precision (32-bit) floating-point elements from `a` and `b` across lanes using
/// the corresponding selector and index in `idx`, and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex2var_ps&expand=4282)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm))] // vpermi2ps or vpermt2ps
pub unsafe fn _mm256_permutex2var_ps(a: __m256, idx: __m256i, b: __m256) -> __m256 {
    // The data operands are float lanes; the index operand stays an integer vector.
    let (lanes_a, lanes_idx, lanes_b) = (a.as_f32x8(), idx.as_i32x8(), b.as_f32x8());
    transmute(vpermi2ps256(lanes_a, lanes_idx, lanes_b))
}

/// Shuffles single-precision (32-bit) floating-point elements from `a` and `b` across lanes using
/// the corresponding selector and index in `idx`, then applies writemask `k`: positions whose
/// mask bit is not set keep the element of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutex2var_ps&expand=4279)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermt2ps))]
pub unsafe fn _mm256_mask_permutex2var_ps(
    a: __m256,
    k: __mmask8,
    idx: __m256i,
    b: __m256,
) -> __m256 {
    let shuffled = _mm256_permutex2var_ps(a, idx, b).as_f32x8();
    // Unselected positions fall back to the first source vector.
    let fallback = a.as_f32x8();
    let blended = simd_select_bitmask(k, shuffled, fallback);
    transmute(blended)
}

/// Shuffles single-precision (32-bit) floating-point elements from `a` and `b` across lanes using
/// the corresponding selector and index in `idx`, then applies zeromask `k`: positions whose
/// mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutex2var_ps&expand=4281)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm))] // vpermi2ps or vpermt2ps
pub unsafe fn _mm256_maskz_permutex2var_ps(
    k: __mmask8,
    a: __m256,
    idx: __m256i,
    b: __m256,
) -> __m256 {
    // Unmasked two-source permute; the zeromask is applied afterwards.
    let shuffled = _mm256_permutex2var_ps(a, idx, b).as_f32x8();
    let zeros = _mm256_setzero_ps().as_f32x8();
    let blended = simd_select_bitmask(k, shuffled, zeros);
    transmute(blended)
}

/// Shuffles single-precision (32-bit) floating-point elements from `a` and `b` across lanes using
/// the corresponding selector and index in `idx`, then applies writemask `k`: positions whose
/// mask bit is not set keep the element of `idx`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask2_permutex2var_ps&expand=4280)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm))] // expected vpermi2ps, but vpermt2ps shows up
pub unsafe fn _mm256_mask2_permutex2var_ps(
    a: __m256,
    idx: __m256i,
    k: __mmask8,
    b: __m256,
) -> __m256 {
    let shuffled = _mm256_permutex2var_ps(a, idx, b).as_f32x8();
    // The fallback operand is `idx` passed through the integer-to-float cast intrinsic so that
    // it has the same lane type as the shuffle result.
    let fallback = _mm256_castsi256_ps(idx).as_f32x8();
    let blended = simd_select_bitmask(k, shuffled, fallback);
    transmute(blended)
}

/// Shuffles single-precision (32-bit) floating-point elements from `a` and `b` across lanes using
/// the corresponding selector and index in `idx`, and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutex2var_ps&expand=4278)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm))] // vpermi2ps or vpermt2ps
pub unsafe fn _mm_permutex2var_ps(a: __m128, idx: __m128i, b: __m128) -> __m128 {
    // The data operands are float lanes; the index operand stays an integer vector.
    let (lanes_a, lanes_idx, lanes_b) = (a.as_f32x4(), idx.as_i32x4(), b.as_f32x4());
    transmute(vpermi2ps128(lanes_a, lanes_idx, lanes_b))
}

/// Shuffles single-precision (32-bit) floating-point elements from `a` and `b` across lanes using
/// the corresponding selector and index in `idx`, then applies writemask `k`: positions whose
/// mask bit is not set keep the element of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_permutex2var_ps&expand=4275)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermt2ps))]
pub unsafe fn _mm_mask_permutex2var_ps(a: __m128, k: __mmask8, idx: __m128i, b: __m128) -> __m128 {
    let shuffled = _mm_permutex2var_ps(a, idx, b).as_f32x4();
    // Unselected positions fall back to the first source vector.
    let fallback = a.as_f32x4();
    let blended = simd_select_bitmask(k, shuffled, fallback);
    transmute(blended)
}

/// Shuffles single-precision (32-bit) floating-point elements from `a` and `b` across lanes using
/// the corresponding selector and index in `idx`, then applies zeromask `k`: positions whose
/// mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_permutex2var_ps&expand=4277)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm))] // vpermi2ps or vpermt2ps
pub unsafe fn _mm_maskz_permutex2var_ps(k: __mmask8, a: __m128, idx: __m128i, b: __m128) -> __m128 {
    // Unmasked two-source permute; the zeromask is applied afterwards.
    let shuffled = _mm_permutex2var_ps(a, idx, b).as_f32x4();
    let zeros = _mm_setzero_ps().as_f32x4();
    let blended = simd_select_bitmask(k, shuffled, zeros);
    transmute(blended)
}

/// Shuffles single-precision (32-bit) floating-point elements from `a` and `b` across lanes using
/// the corresponding selector and index in `idx`, then applies writemask `k`: positions whose
/// mask bit is not set keep the element of `idx`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask2_permutex2var_ps&expand=4276)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm))] // expected vpermi2ps, but vpermt2ps shows up
pub unsafe fn _mm_mask2_permutex2var_ps(a: __m128, idx: __m128i, k: __mmask8, b: __m128) -> __m128 {
    let shuffled = _mm_permutex2var_ps(a, idx, b).as_f32x4();
    // The fallback operand is `idx` passed through the integer-to-float cast intrinsic so that
    // it has the same lane type as the shuffle result.
    let fallback = _mm_castsi128_ps(idx).as_f32x4();
    let blended = simd_select_bitmask(k, shuffled, fallback);
    transmute(blended)
}

/// Shuffles double-precision (64-bit) floating-point elements from `a` and `b` across lanes using
/// the corresponding selector and index in `idx`, and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_permutex2var_pd&expand=4274)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vperm))] // vpermi2pd or vpermt2pd
pub unsafe fn _mm512_permutex2var_pd(a: __m512d, idx: __m512i, b: __m512d) -> __m512d {
    // The data operands are float lanes; the index operand stays an integer vector.
    let (lanes_a, lanes_idx, lanes_b) = (a.as_f64x8(), idx.as_i64x8(), b.as_f64x8());
    transmute(vpermi2pd(lanes_a, lanes_idx, lanes_b))
}

/// Shuffles double-precision (64-bit) floating-point elements from `a` and `b` across lanes using
/// the corresponding selector and index in `idx`, then applies writemask `k`: positions whose
/// mask bit is not set keep the element of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_permutex2var_pd&expand=4271)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermt2pd))]
pub unsafe fn _mm512_mask_permutex2var_pd(
    a: __m512d,
    k: __mmask8,
    idx: __m512i,
    b: __m512d,
) -> __m512d {
    let shuffled = _mm512_permutex2var_pd(a, idx, b).as_f64x8();
    // Unselected positions fall back to the first source vector.
    let fallback = a.as_f64x8();
    let blended = simd_select_bitmask(k, shuffled, fallback);
    transmute(blended)
}

/// Shuffles double-precision (64-bit) floating-point elements from `a` and `b` across lanes using
/// the corresponding selector and index in `idx`, then applies zeromask `k`: positions whose
/// mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_permutex2var_pd&expand=4273)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vperm))] // vpermi2pd or vpermt2pd
pub unsafe fn _mm512_maskz_permutex2var_pd(
    k: __mmask8,
    a: __m512d,
    idx: __m512i,
    b: __m512d,
) -> __m512d {
    // Unmasked two-source permute; the zeromask is applied afterwards.
    let shuffled = _mm512_permutex2var_pd(a, idx, b).as_f64x8();
    let zeros = _mm512_setzero_pd().as_f64x8();
    let blended = simd_select_bitmask(k, shuffled, zeros);
    transmute(blended)
}

/// Shuffles double-precision (64-bit) floating-point elements from `a` and `b` across lanes using
/// the corresponding selector and index in `idx`, then applies writemask `k`: positions whose
/// mask bit is not set keep the element of `idx`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask2_permutex2var_pd&expand=4272)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vperm))] // expected vpermi2pd, but vpermt2pd shows up
pub unsafe fn _mm512_mask2_permutex2var_pd(
    a: __m512d,
    idx: __m512i,
    k: __mmask8,
    b: __m512d,
) -> __m512d {
    let shuffled = _mm512_permutex2var_pd(a, idx, b).as_f64x8();
    // The fallback operand is `idx` passed through the integer-to-float cast intrinsic so that
    // it has the same lane type as the shuffle result.
    let fallback = _mm512_castsi512_pd(idx).as_f64x8();
    let blended = simd_select_bitmask(k, shuffled, fallback);
    transmute(blended)
}

/// Shuffles double-precision (64-bit) floating-point elements from `a` and `b` across lanes using
/// the corresponding selector and index in `idx`, and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_permutex2var_pd&expand=4270)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm))] // vpermi2pd or vpermt2pd
pub unsafe fn _mm256_permutex2var_pd(a: __m256d, idx: __m256i, b: __m256d) -> __m256d {
    // The data operands are float lanes; the index operand stays an integer vector.
    let (lanes_a, lanes_idx, lanes_b) = (a.as_f64x4(), idx.as_i64x4(), b.as_f64x4());
    transmute(vpermi2pd256(lanes_a, lanes_idx, lanes_b))
}

/// Shuffles double-precision (64-bit) floating-point elements from `a` and `b` across lanes using
/// the corresponding selector and index in `idx`, then applies writemask `k`: positions whose
/// mask bit is not set keep the element of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_permutex2var_pd&expand=4267)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermt2pd))]
pub unsafe fn _mm256_mask_permutex2var_pd(
    a: __m256d,
    k: __mmask8,
    idx: __m256i,
    b: __m256d,
) -> __m256d {
    let shuffled = _mm256_permutex2var_pd(a, idx, b).as_f64x4();
    // Unselected positions fall back to the first source vector.
    let fallback = a.as_f64x4();
    let blended = simd_select_bitmask(k, shuffled, fallback);
    transmute(blended)
}

/// Shuffles double-precision (64-bit) floating-point elements from `a` and `b` across lanes using
/// the corresponding selector and index in `idx`, then applies zeromask `k`: positions whose
/// mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_permutex2var_pd&expand=4269)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm))] // vpermi2pd or vpermt2pd
pub unsafe fn _mm256_maskz_permutex2var_pd(
    k: __mmask8,
    a: __m256d,
    idx: __m256i,
    b: __m256d,
) -> __m256d {
    // Unmasked two-source permute; the zeromask is applied afterwards.
    let shuffled = _mm256_permutex2var_pd(a, idx, b).as_f64x4();
    let zeros = _mm256_setzero_pd().as_f64x4();
    let blended = simd_select_bitmask(k, shuffled, zeros);
    transmute(blended)
}

/// Shuffles double-precision (64-bit) floating-point elements from `a` and `b` across lanes using
/// the corresponding selector and index in `idx`, then applies writemask `k`: positions whose
/// mask bit is not set keep the element of `idx`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask2_permutex2var_pd&expand=4268)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm))] // expected vpermi2pd, but vpermt2pd shows up
pub unsafe fn _mm256_mask2_permutex2var_pd(
    a: __m256d,
    idx: __m256i,
    k: __mmask8,
    b: __m256d,
) -> __m256d {
    let shuffled = _mm256_permutex2var_pd(a, idx, b).as_f64x4();
    // The fallback operand is `idx` passed through the integer-to-float cast intrinsic so that
    // it has the same lane type as the shuffle result.
    let fallback = _mm256_castsi256_pd(idx).as_f64x4();
    let blended = simd_select_bitmask(k, shuffled, fallback);
    transmute(blended)
}

/// Shuffles double-precision (64-bit) floating-point elements from `a` and `b` across lanes using
/// the corresponding selector and index in `idx`, and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_permutex2var_pd&expand=4266)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm))] // vpermi2pd or vpermt2pd
pub unsafe fn _mm_permutex2var_pd(a: __m128d, idx: __m128i, b: __m128d) -> __m128d {
    // The data operands are float lanes; the index operand stays an integer vector.
    let (lanes_a, lanes_idx, lanes_b) = (a.as_f64x2(), idx.as_i64x2(), b.as_f64x2());
    transmute(vpermi2pd128(lanes_a, lanes_idx, lanes_b))
}

/// Shuffles double-precision (64-bit) floating-point elements from `a` and `b` across lanes using
/// the corresponding selector and index in `idx`, then applies writemask `k`: positions whose
/// mask bit is not set keep the element of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_permutex2var_pd&expand=4263)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpermt2pd))]
pub unsafe fn _mm_mask_permutex2var_pd(
    a: __m128d,
    k: __mmask8,
    idx: __m128i,
    b: __m128d,
) -> __m128d {
    let shuffled = _mm_permutex2var_pd(a, idx, b).as_f64x2();
    // Unselected positions fall back to the first source vector.
    let fallback = a.as_f64x2();
    let blended = simd_select_bitmask(k, shuffled, fallback);
    transmute(blended)
}

/// Shuffles double-precision (64-bit) floating-point elements from `a` and `b` across lanes using
/// the corresponding selector and index in `idx`, then applies zeromask `k`: positions whose
/// mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_permutex2var_pd&expand=4265)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm))] // vpermi2pd or vpermt2pd
pub unsafe fn _mm_maskz_permutex2var_pd(
    k: __mmask8,
    a: __m128d,
    idx: __m128i,
    b: __m128d,
) -> __m128d {
    // Unmasked two-source permute; the zeromask is applied afterwards.
    let shuffled = _mm_permutex2var_pd(a, idx, b).as_f64x2();
    let zeros = _mm_setzero_pd().as_f64x2();
    let blended = simd_select_bitmask(k, shuffled, zeros);
    transmute(blended)
}

/// Shuffles double-precision (64-bit) floating-point elements from `a` and `b` across lanes using
/// the corresponding selector and index in `idx`, then applies writemask `k`: positions whose
/// mask bit is not set keep the element of `idx`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask2_permutex2var_pd&expand=4264)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm))] // expected vpermi2pd, but vpermt2pd shows up
pub unsafe fn _mm_mask2_permutex2var_pd(
    a: __m128d,
    idx: __m128i,
    k: __mmask8,
    b: __m128d,
) -> __m128d {
    let shuffled = _mm_permutex2var_pd(a, idx, b).as_f64x2();
    // The fallback operand is `idx` passed through the integer-to-float cast intrinsic so that
    // it has the same lane type as the shuffle result.
    let fallback = _mm_castsi128_pd(idx).as_f64x2();
    let blended = simd_select_bitmask(k, shuffled, fallback);
    transmute(blended)
}

/// Shuffle 32-bit integers in a within 128-bit lanes using the control in imm8, and store the results in dst.
///
/// `MASK` is read as four 2-bit fields: bits 1:0 pick the source element for the first position
/// of a lane, bits 3:2 for the second, bits 5:4 for the third and bits 7:6 for the fourth. The
/// same four selections are repeated for each group of four elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_epi32&expand=5150)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpermilps, MASK = 9))] //should be vpshufd
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_shuffle_epi32<const MASK: _MM_PERM_ENUM>(a: __m512i) -> __m512i {
    static_assert_imm8!(MASK);
    // Both shuffle inputs are `a`, so every selected element comes from `a`.
    let r: i32x16 = simd_shuffle16!(
        a.as_i32x16(),
        a.as_i32x16(),
        <const MASK: _MM_PERM_ENUM> [
            // Elements 0..=3: the raw 2-bit selectors.
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            (MASK as u32 >> 4) & 0b11,
            (MASK as u32 >> 6) & 0b11,
            // Elements 4..=7: same selectors, offset by 4 to stay in the second group.
            (MASK as u32 & 0b11) + 4,
            ((MASK as u32 >> 2) & 0b11) + 4,
            ((MASK as u32 >> 4) & 0b11) + 4,
            ((MASK as u32 >> 6) & 0b11) + 4,
            // Elements 8..=11: same selectors, offset by 8.
            (MASK as u32 & 0b11) + 8,
            ((MASK as u32 >> 2) & 0b11) + 8,
            ((MASK as u32 >> 4) & 0b11) + 8,
            ((MASK as u32 >> 6) & 0b11) + 8,
            // Elements 12..=15: same selectors, offset by 12.
            (MASK as u32 & 0b11) + 12,
            ((MASK as u32 >> 2) & 0b11) + 12,
            ((MASK as u32 >> 4) & 0b11) + 12,
            ((MASK as u32 >> 6) & 0b11) + 12,
        ],
    );
    transmute(r)
}

/// Shuffle 32-bit integers in `a` within 128-bit lanes using the control in imm8, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// Implemented as `_mm512_shuffle_epi32::<MASK>(a)` followed by a per-element select against `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_epi32&expand=5148)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_shuffle_epi32<const MASK: _MM_PERM_ENUM>(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
) -> __m512i {
    static_assert_imm8!(MASK);
    let shuffled = _mm512_shuffle_epi32::<MASK>(a).as_i32x16();
    let passthrough = src.as_i32x16();
    transmute(simd_select_bitmask(k, shuffled, passthrough))
}

/// Shuffle 32-bit integers in `a` within 128-bit lanes using the control in imm8, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// Implemented as `_mm512_shuffle_epi32::<MASK>(a)` followed by a per-element select against an all-zero vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_epi32&expand=5149)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_shuffle_epi32<const MASK: _MM_PERM_ENUM>(
    k: __mmask16,
    a: __m512i,
) -> __m512i {
    static_assert_imm8!(MASK);
    let shuffled = _mm512_shuffle_epi32::<MASK>(a).as_i32x16();
    let zeroed = _mm512_setzero_si512().as_i32x16();
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Shuffle 32-bit integers in `a` within 128-bit lanes using the control in imm8, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// Implemented as `_mm256_shuffle_epi32::<MASK>(a)` followed by a per-element select against `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_epi32&expand=5145)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_mask_shuffle_epi32<const MASK: _MM_PERM_ENUM>(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
) -> __m256i {
    static_assert_imm8!(MASK);
    let shuffled = _mm256_shuffle_epi32::<MASK>(a).as_i32x8();
    let passthrough = src.as_i32x8();
    transmute(simd_select_bitmask(k, shuffled, passthrough))
}

/// Shuffle 32-bit integers in `a` within 128-bit lanes using the control in imm8, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// Implemented as `_mm256_shuffle_epi32::<MASK>(a)` followed by a per-element select against an all-zero vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_epi32&expand=5146)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_maskz_shuffle_epi32<const MASK: _MM_PERM_ENUM>(
    k: __mmask8,
    a: __m256i,
) -> __m256i {
    static_assert_imm8!(MASK);
    let shuffled = _mm256_shuffle_epi32::<MASK>(a).as_i32x8();
    let zeroed = _mm256_setzero_si256().as_i32x8();
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Shuffle 32-bit integers in `a` using the control in imm8, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// Implemented as `_mm_shuffle_epi32::<MASK>(a)` followed by a per-element select against `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shuffle_epi32&expand=5142)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_mask_shuffle_epi32<const MASK: _MM_PERM_ENUM>(
    src: __m128i,
    k: __mmask8,
    a: __m128i,
) -> __m128i {
    static_assert_imm8!(MASK);
    let shuffled = _mm_shuffle_epi32::<MASK>(a).as_i32x4();
    let passthrough = src.as_i32x4();
    transmute(simd_select_bitmask(k, shuffled, passthrough))
}

/// Shuffle 32-bit integers in `a` using the control in imm8, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// Implemented as `_mm_shuffle_epi32::<MASK>(a)` followed by a per-element select against an all-zero vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shuffle_epi32&expand=5143)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpshufd, MASK = 9))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_maskz_shuffle_epi32<const MASK: _MM_PERM_ENUM>(
    k: __mmask8,
    a: __m128i,
) -> __m128i {
    static_assert_imm8!(MASK);
    let shuffled = _mm_shuffle_epi32::<MASK>(a).as_i32x4();
    let zeroed = _mm_setzero_si128().as_i32x4();
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Shuffle single-precision (32-bit) floating-point elements in `a` and `b` within 128-bit lanes using the control in imm8, and store the results in dst.
///
/// `MASK` holds four 2-bit selectors. In every 128-bit lane the two low result elements are picked from `a` (selectors in bits 1:0 and 3:2) and the two high result elements are picked from `b` (selectors in bits 5:4 and 7:6).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_ps&expand=5203)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_shuffle_ps<const MASK: i32>(a: __m512, b: __m512) -> __m512 {
    static_assert_imm8!(MASK);
    // Indices 0..=15 address `a`; indices 16..=31 address `b`.
    let shuffled: __m512 = simd_shuffle16!(
        a,
        b,
        <const MASK: i32> [
            // lane 0
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            ((MASK as u32 >> 4) & 0b11) + 16,
            ((MASK as u32 >> 6) & 0b11) + 16,
            // lane 1
            (MASK as u32 & 0b11) + 4,
            ((MASK as u32 >> 2) & 0b11) + 4,
            ((MASK as u32 >> 4) & 0b11) + 20,
            ((MASK as u32 >> 6) & 0b11) + 20,
            // lane 2
            (MASK as u32 & 0b11) + 8,
            ((MASK as u32 >> 2) & 0b11) + 8,
            ((MASK as u32 >> 4) & 0b11) + 24,
            ((MASK as u32 >> 6) & 0b11) + 24,
            // lane 3
            (MASK as u32 & 0b11) + 12,
            ((MASK as u32 >> 2) & 0b11) + 12,
            ((MASK as u32 >> 4) & 0b11) + 28,
            ((MASK as u32 >> 6) & 0b11) + 28,
        ],
    );
    shuffled
}

/// Shuffle single-precision (32-bit) floating-point elements in `a` and `b` within 128-bit lanes using the control in imm8, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// Implemented as `_mm512_shuffle_ps::<MASK>(a, b)` followed by a per-element select against `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_ps&expand=5201)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_shuffle_ps<const MASK: i32>(
    src: __m512,
    k: __mmask16,
    a: __m512,
    b: __m512,
) -> __m512 {
    static_assert_imm8!(MASK);
    let shuffled = _mm512_shuffle_ps::<MASK>(a, b).as_f32x16();
    let passthrough = src.as_f32x16();
    transmute(simd_select_bitmask(k, shuffled, passthrough))
}

/// Shuffle single-precision (32-bit) floating-point elements in `a` and `b` within 128-bit lanes using the control in imm8, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// Implemented as `_mm512_shuffle_ps::<MASK>(a, b)` followed by a per-element select against an all-zero vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_ps&expand=5202)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_shuffle_ps<const MASK: i32>(
    k: __mmask16,
    a: __m512,
    b: __m512,
) -> __m512 {
    static_assert_imm8!(MASK);
    let shuffled = _mm512_shuffle_ps::<MASK>(a, b).as_f32x16();
    let zeroed = _mm512_setzero_ps().as_f32x16();
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Shuffle single-precision (32-bit) floating-point elements in `a` and `b` within 128-bit lanes using the control in imm8, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// Implemented as `_mm256_shuffle_ps::<MASK>(a, b)` followed by a per-element select against `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_ps&expand=5198)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_mask_shuffle_ps<const MASK: i32>(
    src: __m256,
    k: __mmask8,
    a: __m256,
    b: __m256,
) -> __m256 {
    static_assert_imm8!(MASK);
    let shuffled = _mm256_shuffle_ps::<MASK>(a, b).as_f32x8();
    let passthrough = src.as_f32x8();
    transmute(simd_select_bitmask(k, shuffled, passthrough))
}

/// Shuffle single-precision (32-bit) floating-point elements in `a` and `b` within 128-bit lanes using the control in imm8, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// Implemented as `_mm256_shuffle_ps::<MASK>(a, b)` followed by a per-element select against an all-zero vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_ps&expand=5199)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_maskz_shuffle_ps<const MASK: i32>(
    k: __mmask8,
    a: __m256,
    b: __m256,
) -> __m256 {
    static_assert_imm8!(MASK);
    let shuffled = _mm256_shuffle_ps::<MASK>(a, b).as_f32x8();
    let zeroed = _mm256_setzero_ps().as_f32x8();
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Shuffle single-precision (32-bit) floating-point elements in `a` and `b` using the control in imm8, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// Implemented as `_mm_shuffle_ps::<MASK>(a, b)` followed by a per-element select against `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shuffle_ps&expand=5195)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_shuffle_ps<const MASK: i32>(
    src: __m128,
    k: __mmask8,
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_imm8!(MASK);
    let shuffled = _mm_shuffle_ps::<MASK>(a, b).as_f32x4();
    let passthrough = src.as_f32x4();
    transmute(simd_select_bitmask(k, shuffled, passthrough))
}

/// Shuffle single-precision (32-bit) floating-point elements in `a` and `b` using the control in imm8, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// Implemented as `_mm_shuffle_ps::<MASK>(a, b)` followed by a per-element select against an all-zero vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shuffle_ps&expand=5196)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vshufps, MASK = 3))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_shuffle_ps<const MASK: i32>(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    static_assert_imm8!(MASK);
    let shuffled = _mm_shuffle_ps::<MASK>(a, b).as_f32x4();
    let zeroed = _mm_setzero_ps().as_f32x4();
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Shuffle double-precision (64-bit) floating-point elements in `a` and `b` within 128-bit lanes using the control in imm8, and store the results in dst.
///
/// Bit `i` of `MASK` controls result element `i`: even result elements are taken from `a`, odd result elements from `b`, and the bit chooses the low (0) or high (1) element of the matching 128-bit lane.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_pd&expand=5192)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_shuffle_pd<const MASK: i32>(a: __m512d, b: __m512d) -> __m512d {
    static_assert_imm8!(MASK);
    // Indices 0..=7 address `a`; indices 8..=15 address `b`.
    let shuffled: __m512d = simd_shuffle8!(
        a,
        b,
        <const MASK: i32> [
            // lane 0
            MASK as u32 & 0b1,
            ((MASK as u32 >> 1) & 0b1) + 8,
            // lane 1
            ((MASK as u32 >> 2) & 0b1) + 2,
            ((MASK as u32 >> 3) & 0b1) + 10,
            // lane 2
            ((MASK as u32 >> 4) & 0b1) + 4,
            ((MASK as u32 >> 5) & 0b1) + 12,
            // lane 3
            ((MASK as u32 >> 6) & 0b1) + 6,
            ((MASK as u32 >> 7) & 0b1) + 14,
        ],
    );
    shuffled
}

/// Shuffle double-precision (64-bit) floating-point elements in `a` and `b` within 128-bit lanes using the control in imm8, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// Implemented as `_mm512_shuffle_pd::<MASK>(a, b)` followed by a per-element select against `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_pd&expand=5190)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_shuffle_pd<const MASK: i32>(
    src: __m512d,
    k: __mmask8,
    a: __m512d,
    b: __m512d,
) -> __m512d {
    static_assert_imm8!(MASK);
    let shuffled = _mm512_shuffle_pd::<MASK>(a, b).as_f64x8();
    let passthrough = src.as_f64x8();
    transmute(simd_select_bitmask(k, shuffled, passthrough))
}

/// Shuffle double-precision (64-bit) floating-point elements in `a` and `b` within 128-bit lanes using the control in imm8, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// Implemented as `_mm512_shuffle_pd::<MASK>(a, b)` followed by a per-element select against an all-zero vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_pd&expand=5191)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_shuffle_pd<const MASK: i32>(
    k: __mmask8,
    a: __m512d,
    b: __m512d,
) -> __m512d {
    static_assert_imm8!(MASK);
    let shuffled = _mm512_shuffle_pd::<MASK>(a, b).as_f64x8();
    let zeroed = _mm512_setzero_pd().as_f64x8();
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Shuffle double-precision (64-bit) floating-point elements in `a` and `b` within 128-bit lanes using the control in imm8, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// Implemented as `_mm256_shuffle_pd::<MASK>(a, b)` followed by a per-element select against `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_pd&expand=5187)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_mask_shuffle_pd<const MASK: i32>(
    src: __m256d,
    k: __mmask8,
    a: __m256d,
    b: __m256d,
) -> __m256d {
    static_assert_imm8!(MASK);
    let shuffled = _mm256_shuffle_pd::<MASK>(a, b).as_f64x4();
    let passthrough = src.as_f64x4();
    transmute(simd_select_bitmask(k, shuffled, passthrough))
}

/// Shuffle double-precision (64-bit) floating-point elements in `a` and `b` within 128-bit lanes using the control in imm8, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// Implemented as `_mm256_shuffle_pd::<MASK>(a, b)` followed by a per-element select against an all-zero vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_pd&expand=5188)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vshufpd, MASK = 3))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_maskz_shuffle_pd<const MASK: i32>(
    k: __mmask8,
    a: __m256d,
    b: __m256d,
) -> __m256d {
    static_assert_imm8!(MASK);
    let shuffled = _mm256_shuffle_pd::<MASK>(a, b).as_f64x4();
    let zeroed = _mm256_setzero_pd().as_f64x4();
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Shuffle double-precision (64-bit) floating-point elements in `a` and `b` using the control in imm8, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// Implemented as `_mm_shuffle_pd::<MASK>(a, b)` followed by a per-element select against `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_shuffle_pd&expand=5184)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vshufpd, MASK = 1))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_shuffle_pd<const MASK: i32>(
    src: __m128d,
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_imm8!(MASK);
    let shuffled = _mm_shuffle_pd::<MASK>(a, b).as_f64x2();
    let passthrough = src.as_f64x2();
    transmute(simd_select_bitmask(k, shuffled, passthrough))
}

/// Shuffle double-precision (64-bit) floating-point elements in `a` and `b` using the control in imm8, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// Implemented as `_mm_shuffle_pd::<MASK>(a, b)` followed by a per-element select against an all-zero vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_shuffle_pd&expand=5185)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vshufpd, MASK = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_shuffle_pd<const MASK: i32>(
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_imm8!(MASK);
    let shuffled = _mm_shuffle_pd::<MASK>(a, b).as_f64x2();
    let zeroed = _mm_setzero_pd().as_f64x2();
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from `a` and `b`, and store the results in dst.
///
/// `MASK` holds four 2-bit selectors, each naming one of four 128-bit chunks. The two low chunks of the result come from `a` (selectors in bits 1:0 and 3:2); the two high chunks come from `b` (selectors in bits 5:4 and 7:6).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_i32x4&expand=5177)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_01_01_01))] //should be vshufi32x4
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_shuffle_i32x4<const MASK: i32>(a: __m512i, b: __m512i) -> __m512i {
    static_assert_imm8!(MASK);
    // Indices 0..=15 address `a`; indices 16..=31 address `b`.
    let shuffled: i32x16 = simd_shuffle16!(
        a.as_i32x16(),
        b.as_i32x16(),
        <const MASK: i32> [
            // result chunk 0 <- chunk of `a` named by bits 1:0
            (MASK as u32 & 0b11) * 4 + 0,
            (MASK as u32 & 0b11) * 4 + 1,
            (MASK as u32 & 0b11) * 4 + 2,
            (MASK as u32 & 0b11) * 4 + 3,
            // result chunk 1 <- chunk of `a` named by bits 3:2
            ((MASK as u32 >> 2) & 0b11) * 4 + 0,
            ((MASK as u32 >> 2) & 0b11) * 4 + 1,
            ((MASK as u32 >> 2) & 0b11) * 4 + 2,
            ((MASK as u32 >> 2) & 0b11) * 4 + 3,
            // result chunk 2 <- chunk of `b` named by bits 5:4
            ((MASK as u32 >> 4) & 0b11) * 4 + 0 + 16,
            ((MASK as u32 >> 4) & 0b11) * 4 + 1 + 16,
            ((MASK as u32 >> 4) & 0b11) * 4 + 2 + 16,
            ((MASK as u32 >> 4) & 0b11) * 4 + 3 + 16,
            // result chunk 3 <- chunk of `b` named by bits 7:6
            ((MASK as u32 >> 6) & 0b11) * 4 + 0 + 16,
            ((MASK as u32 >> 6) & 0b11) * 4 + 1 + 16,
            ((MASK as u32 >> 6) & 0b11) * 4 + 2 + 16,
            ((MASK as u32 >> 6) & 0b11) * 4 + 3 + 16,
        ],
    );
    transmute(shuffled)
}

/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from `a` and `b`, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// Implemented as `_mm512_shuffle_i32x4::<MASK>(a, b)` followed by a per-element (32-bit) select against `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_i32x4&expand=5175)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b10_11_01_01))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_shuffle_i32x4<const MASK: i32>(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    static_assert_imm8!(MASK);
    let shuffled = _mm512_shuffle_i32x4::<MASK>(a, b).as_i32x16();
    let passthrough = src.as_i32x16();
    transmute(simd_select_bitmask(k, shuffled, passthrough))
}

/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from `a` and `b`, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// Implemented as `_mm512_shuffle_i32x4::<MASK>(a, b)` followed by a per-element (32-bit) select against an all-zero vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_i32x4&expand=5176)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b10_11_01_01))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_shuffle_i32x4<const MASK: i32>(
    k: __mmask16,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    static_assert_imm8!(MASK);
    let shuffled = _mm512_shuffle_i32x4::<MASK>(a, b).as_i32x16();
    let zeroed = _mm512_setzero_si512().as_i32x16();
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from `a` and `b`, and store the results in dst.
///
/// Bit 0 of `MASK` picks which 128-bit half of `a` becomes the low half of the result; bit 1 picks which 128-bit half of `b` becomes the high half.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_i32x4&expand=5174)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm, MASK = 0b11))] //should be vshufi32x4
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_shuffle_i32x4<const MASK: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_imm8!(MASK);
    // Indices 0..=7 address `a`; indices 8..=15 address `b`.
    let shuffled: i32x8 = simd_shuffle8!(
        a.as_i32x8(),
        b.as_i32x8(),
        <const MASK: i32> [
            // low half <- half of `a` named by bit 0
            (MASK as u32 & 0b1) * 4 + 0,
            (MASK as u32 & 0b1) * 4 + 1,
            (MASK as u32 & 0b1) * 4 + 2,
            (MASK as u32 & 0b1) * 4 + 3,
            // high half <- half of `b` named by bit 1
            ((MASK as u32 >> 1) & 0b1) * 4 + 0 + 8,
            ((MASK as u32 >> 1) & 0b1) * 4 + 1 + 8,
            ((MASK as u32 >> 1) & 0b1) * 4 + 2 + 8,
            ((MASK as u32 >> 1) & 0b1) * 4 + 3 + 8,
        ],
    );
    transmute(shuffled)
}

/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from `a` and `b`, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// Implemented as `_mm256_shuffle_i32x4::<MASK>(a, b)` followed by a per-element (32-bit) select against `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_i32x4&expand=5172)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b11))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_mask_shuffle_i32x4<const MASK: i32>(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    static_assert_imm8!(MASK);
    let shuffled = _mm256_shuffle_i32x4::<MASK>(a, b).as_i32x8();
    let passthrough = src.as_i32x8();
    transmute(simd_select_bitmask(k, shuffled, passthrough))
}

/// Shuffle 128-bits (composed of 4 32-bit integers) selected by imm8 from `a` and `b`, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// Implemented as `_mm256_shuffle_i32x4::<MASK>(a, b)` followed by a per-element (32-bit) select against an all-zero vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_i32x4&expand=5173)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vshufi32x4, MASK = 0b11))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_maskz_shuffle_i32x4<const MASK: i32>(
    k: __mmask8,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    static_assert_imm8!(MASK);
    let shuffled = _mm256_shuffle_i32x4::<MASK>(a, b).as_i32x8();
    let zeroed = _mm256_setzero_si256().as_i32x8();
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from `a` and `b`, and store the results in dst.
///
/// `MASK` holds four 2-bit selectors, each naming one of four 128-bit chunks. The two low chunks of the result come from `a` (selectors in bits 1:0 and 3:2); the two high chunks come from `b` (selectors in bits 5:4 and 7:6).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_i64x2&expand=5183)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_11_11_11))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_shuffle_i64x2<const MASK: i32>(a: __m512i, b: __m512i) -> __m512i {
    static_assert_imm8!(MASK);
    // Indices 0..=7 address `a`; indices 8..=15 address `b`.
    let shuffled: i64x8 = simd_shuffle8!(
        a.as_i64x8(),
        b.as_i64x8(),
        <const MASK: i32> [
            // result chunk 0 <- chunk of `a` named by bits 1:0
            (MASK as u32 & 0b11) * 2 + 0,
            (MASK as u32 & 0b11) * 2 + 1,
            // result chunk 1 <- chunk of `a` named by bits 3:2
            ((MASK as u32 >> 2) & 0b11) * 2 + 0,
            ((MASK as u32 >> 2) & 0b11) * 2 + 1,
            // result chunk 2 <- chunk of `b` named by bits 5:4
            ((MASK as u32 >> 4) & 0b11) * 2 + 0 + 8,
            ((MASK as u32 >> 4) & 0b11) * 2 + 1 + 8,
            // result chunk 3 <- chunk of `b` named by bits 7:6
            ((MASK as u32 >> 6) & 0b11) * 2 + 0 + 8,
            ((MASK as u32 >> 6) & 0b11) * 2 + 1 + 8,
        ],
    );
    transmute(shuffled)
}

/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from `a` and `b`, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// Implemented as `_mm512_shuffle_i64x2::<MASK>(a, b)` followed by a per-element (64-bit) select against `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_i64x2&expand=5181)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_11_11_11))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_shuffle_i64x2<const MASK: i32>(
    src: __m512i,
    k: __mmask8,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    static_assert_imm8!(MASK);
    let shuffled = _mm512_shuffle_i64x2::<MASK>(a, b).as_i64x8();
    let passthrough = src.as_i64x8();
    transmute(simd_select_bitmask(k, shuffled, passthrough))
}

/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from `a` and `b`, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// Implemented as `_mm512_shuffle_i64x2::<MASK>(a, b)` followed by a per-element (64-bit) select against an all-zero vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_i64x2&expand=5182)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b10_11_11_11))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_shuffle_i64x2<const MASK: i32>(
    k: __mmask8,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    static_assert_imm8!(MASK);
    let shuffled = _mm512_shuffle_i64x2::<MASK>(a, b).as_i64x8();
    let zeroed = _mm512_setzero_si512().as_i64x8();
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from `a` and `b`, and store the results in dst.
///
/// Bit 0 of `MASK` picks which 128-bit half of `a` becomes the low half of the result; bit 1 picks which 128-bit half of `b` becomes the high half.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_i64x2&expand=5180)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm, MASK = 0b01))] //should be vshufi64x2
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_shuffle_i64x2<const MASK: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_imm8!(MASK);
    // Indices 0..=3 address `a`; indices 4..=7 address `b`.
    let shuffled: i64x4 = simd_shuffle4!(
        a.as_i64x4(),
        b.as_i64x4(),
        <const MASK: i32> [
            // low half <- half of `a` named by bit 0
            (MASK as u32 & 0b1) * 2 + 0,
            (MASK as u32 & 0b1) * 2 + 1,
            // high half <- half of `b` named by bit 1
            ((MASK as u32 >> 1) & 0b1) * 2 + 0 + 4,
            ((MASK as u32 >> 1) & 0b1) * 2 + 1 + 4,
        ],
    );
    transmute(shuffled)
}

/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from `a` and `b`, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// Implemented as `_mm256_shuffle_i64x2::<MASK>(a, b)` followed by a per-element (64-bit) select against `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_i64x2&expand=5178)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b11))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_mask_shuffle_i64x2<const MASK: i32>(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    static_assert_imm8!(MASK);
    let shuffled = _mm256_shuffle_i64x2::<MASK>(a, b).as_i64x4();
    let passthrough = src.as_i64x4();
    transmute(simd_select_bitmask(k, shuffled, passthrough))
}

/// Shuffle 128-bits (composed of 2 64-bit integers) selected by imm8 from `a` and `b`, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// Implemented as `_mm256_shuffle_i64x2::<MASK>(a, b)` followed by a per-element (64-bit) select against an all-zero vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_i64x2&expand=5179)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vshufi64x2, MASK = 0b11))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_maskz_shuffle_i64x2<const MASK: i32>(
    k: __mmask8,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    static_assert_imm8!(MASK);
    let shuffled = _mm256_shuffle_i64x2::<MASK>(a, b).as_i64x4();
    let zeroed = _mm256_setzero_si256().as_i64x4();
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from `a` and `b`, and store the results in dst.
///
/// `MASK` holds four 2-bit selectors, each naming one of four 128-bit chunks. The two low chunks of the result come from `a` (selectors in bits 1:0 and 3:2); the two high chunks come from `b` (selectors in bits 5:4 and 7:6).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_f32x4&expand=5165)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b1011))] //should be vshuff32x4, but generate vshuff64x2
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_shuffle_f32x4<const MASK: i32>(a: __m512, b: __m512) -> __m512 {
    static_assert_imm8!(MASK);
    // Indices 0..=15 address `a`; indices 16..=31 address `b`.
    let shuffled: f32x16 = simd_shuffle16!(
        a.as_f32x16(),
        b.as_f32x16(),
        <const MASK: i32> [
            // result chunk 0 <- chunk of `a` named by bits 1:0
            (MASK as u32 & 0b11) * 4 + 0,
            (MASK as u32 & 0b11) * 4 + 1,
            (MASK as u32 & 0b11) * 4 + 2,
            (MASK as u32 & 0b11) * 4 + 3,
            // result chunk 1 <- chunk of `a` named by bits 3:2
            ((MASK as u32 >> 2) & 0b11) * 4 + 0,
            ((MASK as u32 >> 2) & 0b11) * 4 + 1,
            ((MASK as u32 >> 2) & 0b11) * 4 + 2,
            ((MASK as u32 >> 2) & 0b11) * 4 + 3,
            // result chunk 2 <- chunk of `b` named by bits 5:4
            ((MASK as u32 >> 4) & 0b11) * 4 + 0 + 16,
            ((MASK as u32 >> 4) & 0b11) * 4 + 1 + 16,
            ((MASK as u32 >> 4) & 0b11) * 4 + 2 + 16,
            ((MASK as u32 >> 4) & 0b11) * 4 + 3 + 16,
            // result chunk 3 <- chunk of `b` named by bits 7:6
            ((MASK as u32 >> 6) & 0b11) * 4 + 0 + 16,
            ((MASK as u32 >> 6) & 0b11) * 4 + 1 + 16,
            ((MASK as u32 >> 6) & 0b11) * 4 + 2 + 16,
            ((MASK as u32 >> 6) & 0b11) * 4 + 3 + 16,
        ],
    );
    transmute(shuffled)
}

/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from `a` and `b`, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// Implemented as `_mm512_shuffle_f32x4::<MASK>(a, b)` followed by a per-element (32-bit) select against `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f32x4&expand=5163)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b1011))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_shuffle_f32x4<const MASK: i32>(
    src: __m512,
    k: __mmask16,
    a: __m512,
    b: __m512,
) -> __m512 {
    static_assert_imm8!(MASK);
    let shuffled = _mm512_shuffle_f32x4::<MASK>(a, b).as_f32x16();
    let passthrough = src.as_f32x16();
    transmute(simd_select_bitmask(k, shuffled, passthrough))
}

/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from `a` and `b`, and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// Implemented as `_mm512_shuffle_f32x4::<MASK>(a, b)` followed by a per-element (32-bit) select against an all-zero vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_f32x4&expand=5164)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b1011))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_shuffle_f32x4<const MASK: i32>(
    k: __mmask16,
    a: __m512,
    b: __m512,
) -> __m512 {
    static_assert_imm8!(MASK);
    let shuffled = _mm512_shuffle_f32x4::<MASK>(a, b).as_f32x16();
    let zeroed = _mm512_setzero_ps().as_f32x16();
    transmute(simd_select_bitmask(k, shuffled, zeroed))
}

/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from `a` and `b`, and store the results in dst.
///
/// Bit 0 of `MASK` picks which 128-bit half of `a` becomes the low half of the result; bit 1 picks which 128-bit half of `b` becomes the high half.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_f32x4&expand=5162)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm, MASK = 0b01))] //should be vshuff32x4
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_shuffle_f32x4<const MASK: i32>(a: __m256, b: __m256) -> __m256 {
    static_assert_imm8!(MASK);
    // Indices 0..=7 address `a`; indices 8..=15 address `b`.
    let shuffled: f32x8 = simd_shuffle8!(
        a.as_f32x8(),
        b.as_f32x8(),
        <const MASK: i32> [
            // low half <- half of `a` named by bit 0
            (MASK as u32 & 0b1) * 4 + 0,
            (MASK as u32 & 0b1) * 4 + 1,
            (MASK as u32 & 0b1) * 4 + 2,
            (MASK as u32 & 0b1) * 4 + 3,
            // high half <- half of `b` named by bit 1
            ((MASK as u32 >> 1) & 0b1) * 4 + 0 + 8,
            ((MASK as u32 >> 1) & 0b1) * 4 + 1 + 8,
            ((MASK as u32 >> 1) & 0b1) * 4 + 2 + 8,
            ((MASK as u32 >> 1) & 0b1) * 4 + 3 + 8,
        ],
    );
    transmute(shuffled)
}

/// Shuffle 128-bits (composed of 4 single-precision (32-bit) floating-point elements) selected by imm8 from `a` and `b`, and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// Implemented as `_mm256_shuffle_f32x4::<MASK>(a, b)` followed by a per-element (32-bit) select against `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_f32x4&expand=5160)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b11))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_mask_shuffle_f32x4<const MASK: i32>(
    src: __m256,
    k: __mmask8,
    a: __m256,
    b: __m256,
) -> __m256 {
    static_assert_imm8!(MASK);
    let shuffled = _mm256_shuffle_f32x4::<MASK>(a, b).as_f32x8();
    let passthrough = src.as_f32x8();
    transmute(simd_select_bitmask(k, shuffled, passthrough))
}

/// Shuffles 128-bit lanes (4 single-precision (32-bit) floating-point elements each) selected by
/// `MASK` from `a` and `b`, then applies zeromask `k`: elements whose mask bit is not set are
/// zeroed in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_f32x4&expand=5161)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vshuff32x4, MASK = 0b11))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_maskz_shuffle_f32x4<const MASK: i32>(
    k: __mmask8,
    a: __m256,
    b: __m256,
) -> __m256 {
    static_assert_imm8!(MASK);
    let shuffled = _mm256_shuffle_f32x4::<MASK>(a, b).as_f32x8();
    let zeroes = _mm256_setzero_ps().as_f32x8();
    transmute(simd_select_bitmask(k, shuffled, zeroes))
}

/// Shuffles 128-bit lanes (2 double-precision (64-bit) floating-point elements each) selected by
/// `MASK` from `a` and `b`, and returns the result.
///
/// `MASK` is read as four 2-bit fields: bits 1:0 and 3:2 pick the lanes of `a` that fill the two
/// low result lanes, bits 5:4 and 7:6 pick the lanes of `b` that fill the two high result lanes.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_shuffle_f64x2&expand=5171)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b10_11_11_11))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_shuffle_f64x2<const MASK: i32>(a: __m512d, b: __m512d) -> __m512d {
    static_assert_imm8!(MASK);
    let lo_src = a.as_f64x8();
    let hi_src = b.as_f64x8();
    // The leading `8 +` steers the upper two lanes' indices into the second operand.
    let shuffled: f64x8 = simd_shuffle8!(
        lo_src,
        hi_src,
        <const MASK: i32> [
            (MASK as u32 & 0b11) * 2,
            (MASK as u32 & 0b11) * 2 + 1,
            ((MASK as u32 >> 2) & 0b11) * 2,
            ((MASK as u32 >> 2) & 0b11) * 2 + 1,
            8 + ((MASK as u32 >> 4) & 0b11) * 2,
            8 + ((MASK as u32 >> 4) & 0b11) * 2 + 1,
            8 + ((MASK as u32 >> 6) & 0b11) * 2,
            8 + ((MASK as u32 >> 6) & 0b11) * 2 + 1,
        ],
    );
    transmute(shuffled)
}

/// Shuffles 128-bit lanes (2 double-precision (64-bit) floating-point elements each) selected by
/// `MASK` from `a` and `b`, merging with `src` under writemask `k`: elements whose mask bit is
/// not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_shuffle_f64x2&expand=5169)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b10_11_11_11))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_shuffle_f64x2<const MASK: i32>(
    src: __m512d,
    k: __mmask8,
    a: __m512d,
    b: __m512d,
) -> __m512d {
    static_assert_imm8!(MASK);
    let shuffled = _mm512_shuffle_f64x2::<MASK>(a, b).as_f64x8();
    let fallback = src.as_f64x8();
    transmute(simd_select_bitmask(k, shuffled, fallback))
}

/// Shuffles 128-bit lanes (2 double-precision (64-bit) floating-point elements each) selected by
/// `MASK` from `a` and `b`, then applies zeromask `k`: elements whose mask bit is not set are
/// zeroed in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_shuffle_f64x2&expand=5170)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b10_11_11_11))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_shuffle_f64x2<const MASK: i32>(
    k: __mmask8,
    a: __m512d,
    b: __m512d,
) -> __m512d {
    static_assert_imm8!(MASK);
    let shuffled = _mm512_shuffle_f64x2::<MASK>(a, b).as_f64x8();
    let zeroes = _mm512_setzero_pd().as_f64x8();
    transmute(simd_select_bitmask(k, shuffled, zeroes))
}

/// Shuffles 128-bit lanes (2 double-precision (64-bit) floating-point elements each) selected by
/// `MASK` from `a` and `b`, and returns the result.
///
/// Bit 0 of `MASK` chooses which 128-bit half of `a` becomes the low lane; bit 1 chooses which
/// 128-bit half of `b` becomes the high lane.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_shuffle_f64x2&expand=5168)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vperm, MASK = 0b01))] //should be vshuff64x2
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_shuffle_f64x2<const MASK: i32>(a: __m256d, b: __m256d) -> __m256d {
    static_assert_imm8!(MASK);
    let lo_src = a.as_f64x4();
    let hi_src = b.as_f64x4();
    // The leading `4 +` steers the upper lane's indices into the second operand.
    let shuffled: f64x4 = simd_shuffle4!(
        lo_src,
        hi_src,
        <const MASK: i32> [
            (MASK as u32 & 0b1) * 2,
            (MASK as u32 & 0b1) * 2 + 1,
            4 + ((MASK as u32 >> 1) & 0b1) * 2,
            4 + ((MASK as u32 >> 1) & 0b1) * 2 + 1,
        ],
    );
    transmute(shuffled)
}

/// Shuffles 128-bit lanes (2 double-precision (64-bit) floating-point elements each) selected by
/// `MASK` from `a` and `b`, merging with `src` under writemask `k`: elements whose mask bit is
/// not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_shuffle_f64x2&expand=5166)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b11))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_mask_shuffle_f64x2<const MASK: i32>(
    src: __m256d,
    k: __mmask8,
    a: __m256d,
    b: __m256d,
) -> __m256d {
    static_assert_imm8!(MASK);
    let shuffled = _mm256_shuffle_f64x2::<MASK>(a, b).as_f64x4();
    let fallback = src.as_f64x4();
    transmute(simd_select_bitmask(k, shuffled, fallback))
}

/// Shuffles 128-bit lanes (2 double-precision (64-bit) floating-point elements each) selected by
/// `MASK` from `a` and `b`, then applies zeromask `k`: elements whose mask bit is not set are
/// zeroed in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_shuffle_f64x2&expand=5167)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vshuff64x2, MASK = 0b11))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_maskz_shuffle_f64x2<const MASK: i32>(
    k: __mmask8,
    a: __m256d,
    b: __m256d,
) -> __m256d {
    static_assert_imm8!(MASK);
    let shuffled = _mm256_shuffle_f64x2::<MASK>(a, b).as_f64x4();
    let zeroes = _mm256_setzero_pd().as_f64x4();
    transmute(simd_select_bitmask(k, shuffled, zeroes))
}

/// Extract 128 bits (composed of 4 packed single-precision (32-bit) floating-point elements) from a, selected with imm8, and store the result in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_extractf32x4_ps&expand=2442)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextractf32x4, IMM8 = 3)
)]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_extractf32x4_ps<const IMM8: i32>(a: __m512) -> __m128 {
    static_assert_imm2!(IMM8);
    match IMM8 & 0x3 {
        0 => simd_shuffle4!(a, _mm512_undefined_ps(), [0, 1, 2, 3]),
        1 => simd_shuffle4!(a, _mm512_undefined_ps(), [4, 5, 6, 7]),
        2 => simd_shuffle4!(a, _mm512_undefined_ps(), [8, 9, 10, 11]),
        _ => simd_shuffle4!(a, _mm512_undefined_ps(), [12, 13, 14, 15]),
    }
}

/// Extracts the 128-bit lane (4 packed single-precision (32-bit) floating-point elements) of `a`
/// selected by `IMM8`, merging with `src` under writemask `k`: elements whose mask bit is not
/// set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_extractf32x4_ps&expand=2443)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextractf32x4, IMM8 = 3)
)]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_extractf32x4_ps<const IMM8: i32>(
    src: __m128,
    k: __mmask8,
    a: __m512,
) -> __m128 {
    static_assert_imm2!(IMM8);
    let lane = _mm512_extractf32x4_ps::<IMM8>(a).as_f32x4();
    let fallback = src.as_f32x4();
    transmute(simd_select_bitmask(k, lane, fallback))
}

/// Extracts the 128-bit lane (4 packed single-precision (32-bit) floating-point elements) of `a`
/// selected by `IMM8`, then applies zeromask `k`: elements whose mask bit is not set are zeroed
/// in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_extractf32x4_ps&expand=2444)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextractf32x4, IMM8 = 3)
)]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_extractf32x4_ps<const IMM8: i32>(k: __mmask8, a: __m512) -> __m128 {
    static_assert_imm2!(IMM8);
    let lane = _mm512_extractf32x4_ps::<IMM8>(a).as_f32x4();
    let zeroes = _mm_setzero_ps().as_f32x4();
    transmute(simd_select_bitmask(k, lane, zeroes))
}

/// Extracts one 128-bit lane (4 packed single-precision (32-bit) floating-point elements) from
/// `a` and returns it. The low bit of `IMM8` selects the lane: 0 yields elements 0..=3 and
/// 1 yields elements 4..=7.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extractf32x4_ps&expand=2439)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextract, IMM8 = 1) //should be vextractf32x4
)]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm256_extractf32x4_ps<const IMM8: i32>(a: __m256) -> __m128 {
    static_assert_imm1!(IMM8);
    // Second shuffle operand: every index used below is less than 8, i.e. refers to `a`.
    let filler = _mm256_undefined_ps();
    if (IMM8 & 0x1) == 0 {
        simd_shuffle4!(a, filler, [0, 1, 2, 3])
    } else {
        simd_shuffle4!(a, filler, [4, 5, 6, 7])
    }
}

/// Extracts the 128-bit lane (4 packed single-precision (32-bit) floating-point elements) of `a`
/// selected by `IMM8`, merging with `src` under writemask `k`: elements whose mask bit is not
/// set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_extractf32x4_ps&expand=2440)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextractf32x4, IMM8 = 1)
)]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_mask_extractf32x4_ps<const IMM8: i32>(
    src: __m128,
    k: __mmask8,
    a: __m256,
) -> __m128 {
    static_assert_imm1!(IMM8);
    let lane = _mm256_extractf32x4_ps::<IMM8>(a).as_f32x4();
    let fallback = src.as_f32x4();
    transmute(simd_select_bitmask(k, lane, fallback))
}

/// Extracts the 128-bit lane (4 packed single-precision (32-bit) floating-point elements) of `a`
/// selected by `IMM8`, then applies zeromask `k`: elements whose mask bit is not set are zeroed
/// in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_extractf32x4_ps&expand=2441)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextractf32x4, IMM8 = 1)
)]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_maskz_extractf32x4_ps<const IMM8: i32>(k: __mmask8, a: __m256) -> __m128 {
    static_assert_imm1!(IMM8);
    let lane = _mm256_extractf32x4_ps::<IMM8>(a).as_f32x4();
    let zeroes = _mm_setzero_ps().as_f32x4();
    transmute(simd_select_bitmask(k, lane, zeroes))
}

/// Extract 256 bits (composed of 4 packed 64-bit integers) from a, selected with IMM1, and store the result in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_extracti64x4_epi64&expand=2473)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextractf64x4, IMM1 = 1) //should be vextracti64x4
)]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_extracti64x4_epi64<const IMM1: i32>(a: __m512i) -> __m256i {
    static_assert_imm1!(IMM1);
    match IMM1 {
        0 => simd_shuffle4!(a, _mm512_set1_epi64(0), [0, 1, 2, 3]),
        _ => simd_shuffle4!(a, _mm512_set1_epi64(0), [4, 5, 6, 7]),
    }
}

/// Extracts the 256-bit half (4 packed 64-bit integers) of `a` selected by `IMM1`, merging with
/// `src` under writemask `k`: elements whose mask bit is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_extracti64x4_epi64&expand=2474)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextracti64x4, IMM1 = 1)
)]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_extracti64x4_epi64<const IMM1: i32>(
    src: __m256i,
    k: __mmask8,
    a: __m512i,
) -> __m256i {
    static_assert_imm1!(IMM1);
    let half = _mm512_extracti64x4_epi64::<IMM1>(a).as_i64x4();
    let fallback = src.as_i64x4();
    transmute(simd_select_bitmask(k, half, fallback))
}

/// Extracts the 256-bit half (4 packed 64-bit integers) of `a` selected by `IMM1`, then applies
/// zeromask `k`: elements whose mask bit is not set are zeroed in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_extracti64x4_epi64&expand=2475)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextracti64x4, IMM1 = 1)
)]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_extracti64x4_epi64<const IMM1: i32>(k: __mmask8, a: __m512i) -> __m256i {
    static_assert_imm1!(IMM1);
    let half = _mm512_extracti64x4_epi64::<IMM1>(a).as_i64x4();
    let zeroes = _mm256_setzero_si256().as_i64x4();
    transmute(simd_select_bitmask(k, half, zeroes))
}

/// Extract 256 bits (composed of 4 packed double-precision (64-bit) floating-point elements) from a, selected with imm8, and store the result in dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_extractf64x4_pd&expand=2454)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextractf64x4, IMM8 = 1)
)]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_extractf64x4_pd<const IMM8: i32>(a: __m512d) -> __m256d {
    static_assert_imm1!(IMM8);
    match IMM8 & 0x1 {
        0 => simd_shuffle4!(a, _mm512_undefined_pd(), [0, 1, 2, 3]),
        _ => simd_shuffle4!(a, _mm512_undefined_pd(), [4, 5, 6, 7]),
    }
}

/// Extracts the 256-bit half (4 packed double-precision (64-bit) floating-point elements) of `a`
/// selected by `IMM8`, merging with `src` under writemask `k`: elements whose mask bit is not
/// set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_extractf64x4_pd&expand=2455)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextractf64x4, IMM8 = 1)
)]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_extractf64x4_pd<const IMM8: i32>(
    src: __m256d,
    k: __mmask8,
    a: __m512d,
) -> __m256d {
    static_assert_imm1!(IMM8);
    let half = _mm512_extractf64x4_pd::<IMM8>(a).as_f64x4();
    let fallback = src.as_f64x4();
    transmute(simd_select_bitmask(k, half, fallback))
}

/// Extracts the 256-bit half (4 packed double-precision (64-bit) floating-point elements) of `a`
/// selected by `IMM8`, then applies zeromask `k`: elements whose mask bit is not set are zeroed
/// in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_extractf64x4_pd&expand=2456)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextractf64x4, IMM8 = 1)
)]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_extractf64x4_pd<const IMM8: i32>(k: __mmask8, a: __m512d) -> __m256d {
    static_assert_imm1!(IMM8);
    let half = _mm512_extractf64x4_pd::<IMM8>(a).as_f64x4();
    let zeroes = _mm256_setzero_pd().as_f64x4();
    transmute(simd_select_bitmask(k, half, zeroes))
}

/// Extracts one 128-bit lane (4 packed 32-bit integers) from `a` and returns it. `IMM2` selects
/// the lane: 0 yields elements 0..=3, 1 yields 4..=7, 2 yields 8..=11 and any other accepted
/// value yields 12..=15.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_extracti32x4_epi32&expand=2461)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextractf32x4, IMM2 = 3) //should be vextracti32x4
)]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm512_extracti32x4_epi32<const IMM2: i32>(a: __m512i) -> __m128i {
    static_assert_imm2!(IMM2);
    let source = a.as_i32x16();
    // Second shuffle operand: every index used below is less than 16, i.e. refers to `source`.
    let filler = _mm512_undefined_epi32().as_i32x16();
    let lane: i32x4 = match IMM2 {
        0 => simd_shuffle4!(source, filler, [0, 1, 2, 3]),
        1 => simd_shuffle4!(source, filler, [4, 5, 6, 7]),
        2 => simd_shuffle4!(source, filler, [8, 9, 10, 11]),
        _ => simd_shuffle4!(source, filler, [12, 13, 14, 15]),
    };
    transmute(lane)
}

/// Extracts the 128-bit lane (4 packed 32-bit integers) of `a` selected by `IMM2`, merging with
/// `src` under writemask `k`: elements whose mask bit is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_extracti32x4_epi32&expand=2462)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextracti32x4, IMM2 = 3)
)]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_mask_extracti32x4_epi32<const IMM2: i32>(
    src: __m128i,
    k: __mmask8,
    a: __m512i,
) -> __m128i {
    static_assert_imm2!(IMM2);
    let lane = _mm512_extracti32x4_epi32::<IMM2>(a).as_i32x4();
    let fallback = src.as_i32x4();
    transmute(simd_select_bitmask(k, lane, fallback))
}

/// Extracts the 128-bit lane (4 packed 32-bit integers) of `a` selected by `IMM2`, then applies
/// zeromask `k`: elements whose mask bit is not set are zeroed in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_extracti32x4_epi32&expand=2463)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextracti32x4, IMM2 = 3)
)]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_maskz_extracti32x4_epi32<const IMM2: i32>(k: __mmask8, a: __m512i) -> __m128i {
    static_assert_imm2!(IMM2);
    let lane = _mm512_extracti32x4_epi32::<IMM2>(a).as_i32x4();
    let zeroes = _mm_setzero_si128().as_i32x4();
    transmute(simd_select_bitmask(k, lane, zeroes))
}

/// Extracts one 128-bit lane (4 packed 32-bit integers) from `a` and returns it. `IMM1` selects
/// the lane: 0 yields elements 0..=3, any other accepted value yields elements 4..=7.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_extracti32x4_epi32&expand=2458)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextract, IMM1 = 1) //should be vextracti32x4
)]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm256_extracti32x4_epi32<const IMM1: i32>(a: __m256i) -> __m128i {
    static_assert_imm1!(IMM1);
    let source = a.as_i32x8();
    // Second shuffle operand: every index used below is less than 8, i.e. refers to `source`.
    let filler = _mm256_undefined_si256().as_i32x8();
    let lane: i32x4 = if IMM1 == 0 {
        simd_shuffle4!(source, filler, [0, 1, 2, 3])
    } else {
        simd_shuffle4!(source, filler, [4, 5, 6, 7])
    };
    transmute(lane)
}

/// Extracts the 128-bit lane (4 packed 32-bit integers) of `a` selected by `IMM1`, merging with
/// `src` under writemask `k`: elements whose mask bit is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_extracti32x4_epi32&expand=2459)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextracti32x4, IMM1 = 1)
)]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_mask_extracti32x4_epi32<const IMM1: i32>(
    src: __m128i,
    k: __mmask8,
    a: __m256i,
) -> __m128i {
    static_assert_imm1!(IMM1);
    let lane = _mm256_extracti32x4_epi32::<IMM1>(a).as_i32x4();
    let fallback = src.as_i32x4();
    transmute(simd_select_bitmask(k, lane, fallback))
}

/// Extracts the 128-bit lane (4 packed 32-bit integers) of `a` selected by `IMM1`, then applies
/// zeromask `k`: elements whose mask bit is not set are zeroed in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_extracti32x4_epi32&expand=2460)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vextracti32x4, IMM1 = 1)
)]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_maskz_extracti32x4_epi32<const IMM1: i32>(k: __mmask8, a: __m256i) -> __m128i {
    static_assert_imm1!(IMM1);
    let lane = _mm256_extracti32x4_epi32::<IMM1>(a).as_i32x4();
    let zeroes = _mm_setzero_si128().as_i32x4();
    transmute(simd_select_bitmask(k, lane, zeroes))
}

/// Duplicates each even-indexed single-precision (32-bit) floating-point element of `a` into the
/// odd-indexed slot that follows it, and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_moveldup_ps&expand=3862)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovsldup))]
pub unsafe fn _mm512_moveldup_ps(a: __m512) -> __m512 {
    // Every output pair (2i, 2i + 1) reads input element 2i.
    let dup: f32x16 = simd_shuffle16!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
    transmute(dup)
}

/// Duplicates even-indexed single-precision (32-bit) floating-point elements of `a`, merging
/// with `src` under writemask `k`: elements whose mask bit is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_moveldup_ps&expand=3860)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovsldup))]
pub unsafe fn _mm512_mask_moveldup_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
    // Every output pair (2i, 2i + 1) reads input element 2i.
    let dup: f32x16 = simd_shuffle16!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
    let fallback = src.as_f32x16();
    transmute(simd_select_bitmask(k, dup, fallback))
}

/// Duplicates even-indexed single-precision (32-bit) floating-point elements of `a`, then
/// applies zeromask `k`: elements whose mask bit is not set are zeroed in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_moveldup_ps&expand=3861)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovsldup))]
pub unsafe fn _mm512_maskz_moveldup_ps(k: __mmask16, a: __m512) -> __m512 {
    // Every output pair (2i, 2i + 1) reads input element 2i.
    let dup: f32x16 = simd_shuffle16!(a, a, [0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14]);
    let zeroes = _mm512_setzero_ps().as_f32x16();
    transmute(simd_select_bitmask(k, dup, zeroes))
}

/// Duplicates even-indexed single-precision (32-bit) floating-point elements of `a`, merging
/// with `src` under writemask `k`: elements whose mask bit is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_moveldup_ps&expand=3857)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovsldup))]
pub unsafe fn _mm256_mask_moveldup_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 {
    let dup = _mm256_moveldup_ps(a).as_f32x8();
    let fallback = src.as_f32x8();
    transmute(simd_select_bitmask(k, dup, fallback))
}

/// Duplicates even-indexed single-precision (32-bit) floating-point elements of `a`, then
/// applies zeromask `k`: elements whose mask bit is not set are zeroed in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_moveldup_ps&expand=3858)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovsldup))]
pub unsafe fn _mm256_maskz_moveldup_ps(k: __mmask8, a: __m256) -> __m256 {
    let dup = _mm256_moveldup_ps(a).as_f32x8();
    let zeroes = _mm256_setzero_ps().as_f32x8();
    transmute(simd_select_bitmask(k, dup, zeroes))
}

/// Duplicates even-indexed single-precision (32-bit) floating-point elements of `a`, merging
/// with `src` under writemask `k`: elements whose mask bit is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_moveldup_ps&expand=3854)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovsldup))]
pub unsafe fn _mm_mask_moveldup_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 {
    let dup = _mm_moveldup_ps(a).as_f32x4();
    let fallback = src.as_f32x4();
    transmute(simd_select_bitmask(k, dup, fallback))
}

/// Duplicates even-indexed single-precision (32-bit) floating-point elements of `a`, then
/// applies zeromask `k`: elements whose mask bit is not set are zeroed in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_moveldup_ps&expand=3855)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovsldup))]
pub unsafe fn _mm_maskz_moveldup_ps(k: __mmask8, a: __m128) -> __m128 {
    let dup = _mm_moveldup_ps(a).as_f32x4();
    let zeroes = _mm_setzero_ps().as_f32x4();
    transmute(simd_select_bitmask(k, dup, zeroes))
}

/// Duplicates each odd-indexed single-precision (32-bit) floating-point element of `a` into the
/// even-indexed slot that precedes it, and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_movehdup_ps&expand=3852)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovshdup))]
pub unsafe fn _mm512_movehdup_ps(a: __m512) -> __m512 {
    // Every output pair (2i, 2i + 1) reads input element 2i + 1.
    let dup: f32x16 = simd_shuffle16!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
    transmute(dup)
}

/// Duplicates odd-indexed single-precision (32-bit) floating-point elements of `a`, merging with
/// `src` under writemask `k`: elements whose mask bit is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_movehdup_ps&expand=3850)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovshdup))]
pub unsafe fn _mm512_mask_movehdup_ps(src: __m512, k: __mmask16, a: __m512) -> __m512 {
    // Every output pair (2i, 2i + 1) reads input element 2i + 1.
    let dup: f32x16 = simd_shuffle16!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
    let fallback = src.as_f32x16();
    transmute(simd_select_bitmask(k, dup, fallback))
}

/// Duplicates odd-indexed single-precision (32-bit) floating-point elements of `a`, then applies
/// zeromask `k`: elements whose mask bit is not set are zeroed in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_movehdup_ps&expand=3851)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovshdup))]
pub unsafe fn _mm512_maskz_movehdup_ps(k: __mmask16, a: __m512) -> __m512 {
    // Every output pair (2i, 2i + 1) reads input element 2i + 1.
    let dup: f32x16 = simd_shuffle16!(a, a, [1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15]);
    let zeroes = _mm512_setzero_ps().as_f32x16();
    transmute(simd_select_bitmask(k, dup, zeroes))
}

/// Duplicates odd-indexed single-precision (32-bit) floating-point elements of `a`, merging with
/// `src` under writemask `k`: elements whose mask bit is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_movehdup_ps&expand=3847)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovshdup))]
pub unsafe fn _mm256_mask_movehdup_ps(src: __m256, k: __mmask8, a: __m256) -> __m256 {
    let dup = _mm256_movehdup_ps(a).as_f32x8();
    let fallback = src.as_f32x8();
    transmute(simd_select_bitmask(k, dup, fallback))
}

/// Duplicates odd-indexed single-precision (32-bit) floating-point elements of `a`, then applies
/// zeromask `k`: elements whose mask bit is not set are zeroed in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_movehdup_ps&expand=3848)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovshdup))]
pub unsafe fn _mm256_maskz_movehdup_ps(k: __mmask8, a: __m256) -> __m256 {
    let dup = _mm256_movehdup_ps(a).as_f32x8();
    let zeroes = _mm256_setzero_ps().as_f32x8();
    transmute(simd_select_bitmask(k, dup, zeroes))
}

/// Duplicates odd-indexed single-precision (32-bit) floating-point elements of `a`, merging with
/// `src` under writemask `k`: elements whose mask bit is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_movehdup_ps&expand=3844)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovshdup))]
pub unsafe fn _mm_mask_movehdup_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 {
    let dup = _mm_movehdup_ps(a).as_f32x4();
    let fallback = src.as_f32x4();
    transmute(simd_select_bitmask(k, dup, fallback))
}

/// Duplicates odd-indexed single-precision (32-bit) floating-point elements of `a`, then applies
/// zeromask `k`: elements whose mask bit is not set are zeroed in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_movehdup_ps&expand=3845)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovshdup))]
pub unsafe fn _mm_maskz_movehdup_ps(k: __mmask8, a: __m128) -> __m128 {
    let dup = _mm_movehdup_ps(a).as_f32x4();
    let zeroes = _mm_setzero_ps().as_f32x4();
    transmute(simd_select_bitmask(k, dup, zeroes))
}

/// Duplicates each even-indexed double-precision (64-bit) floating-point element of `a` into the
/// odd-indexed slot that follows it, and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_movedup_pd&expand=3843)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovddup))]
pub unsafe fn _mm512_movedup_pd(a: __m512d) -> __m512d {
    // Every output pair (2i, 2i + 1) reads input element 2i.
    let dup: f64x8 = simd_shuffle8!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
    transmute(dup)
}

/// Duplicates even-indexed double-precision (64-bit) floating-point elements of `a`, merging
/// with `src` under writemask `k`: elements whose mask bit is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_movedup_pd&expand=3841)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovddup))]
pub unsafe fn _mm512_mask_movedup_pd(src: __m512d, k: __mmask8, a: __m512d) -> __m512d {
    // Every output pair (2i, 2i + 1) reads input element 2i.
    let dup: f64x8 = simd_shuffle8!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
    let fallback = src.as_f64x8();
    transmute(simd_select_bitmask(k, dup, fallback))
}

/// Duplicates even-indexed double-precision (64-bit) floating-point elements of `a`, then
/// applies zeromask `k`: elements whose mask bit is not set are zeroed in the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_movedup_pd&expand=3842)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovddup))]
pub unsafe fn _mm512_maskz_movedup_pd(k: __mmask8, a: __m512d) -> __m512d {
    // Every output pair (2i, 2i + 1) reads input element 2i.
    let dup: f64x8 = simd_shuffle8!(a, a, [0, 0, 2, 2, 4, 4, 6, 6]);
    let zeroes = _mm512_setzero_pd().as_f64x8();
    transmute(simd_select_bitmask(k, dup, zeroes))
}

/// Duplicates the even-indexed double-precision (64-bit) floating-point elements of `a`, then
/// applies writemask `k`: a result element whose mask bit is clear is taken from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_movedup_pd&expand=3838)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovddup))]
pub unsafe fn _mm256_mask_movedup_pd(src: __m256d, k: __mmask8, a: __m256d) -> __m256d {
    let duplicated = _mm256_movedup_pd(a).as_f64x4();
    let fallback = src.as_f64x4();
    transmute(simd_select_bitmask(k, duplicated, fallback))
}

/// Duplicates the even-indexed double-precision (64-bit) floating-point elements of `a`, then
/// applies zeromask `k`: a result element whose mask bit is clear is written as zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_movedup_pd&expand=3839)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovddup))]
pub unsafe fn _mm256_maskz_movedup_pd(k: __mmask8, a: __m256d) -> __m256d {
    let duplicated = _mm256_movedup_pd(a).as_f64x4();
    let zeroed = _mm256_setzero_pd().as_f64x4();
    transmute(simd_select_bitmask(k, duplicated, zeroed))
}

/// Duplicates the even-indexed double-precision (64-bit) floating-point elements of `a`, then
/// applies writemask `k`: a result element whose mask bit is clear is taken from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_movedup_pd&expand=3835)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovddup))]
pub unsafe fn _mm_mask_movedup_pd(src: __m128d, k: __mmask8, a: __m128d) -> __m128d {
    let duplicated = _mm_movedup_pd(a).as_f64x2();
    let fallback = src.as_f64x2();
    transmute(simd_select_bitmask(k, duplicated, fallback))
}

/// Duplicates the even-indexed double-precision (64-bit) floating-point elements of `a`, then
/// applies zeromask `k`: a result element whose mask bit is clear is written as zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_movedup_pd&expand=3836)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovddup))]
pub unsafe fn _mm_maskz_movedup_pd(k: __mmask8, a: __m128d) -> __m128d {
    let duplicated = _mm_movedup_pd(a).as_f64x2();
    let zeroed = _mm_setzero_pd().as_f64x2();
    transmute(simd_select_bitmask(k, duplicated, zeroed))
}

/// Returns a copy of `a` in which one 128-bit group of four packed 32-bit integers has been
/// replaced by `b`. The two low bits of `IMM8` select which of the four groups is overwritten.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_inserti32x4&expand=3174)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 2))] //should be vinserti32x4
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_inserti32x4<const IMM8: i32>(a: __m512i, b: __m128i) -> __m512i {
    static_assert_imm2!(IMM8);
    let base = a.as_i32x16();
    // Once widened, the four elements of `b` are addressed by shuffle indices 16..=19.
    let patch = _mm512_castsi128_si512(b).as_i32x16();
    let merged: i32x16 = match IMM8 & 0b11 {
        // Group 0: elements 0..=3 come from `b`.
        0 => simd_shuffle16!(
            base,
            patch,
            [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
        ),
        // Group 1: elements 4..=7 come from `b`.
        1 => simd_shuffle16!(
            base,
            patch,
            [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15],
        ),
        // Group 2: elements 8..=11 come from `b`.
        2 => simd_shuffle16!(
            base,
            patch,
            [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15],
        ),
        // Group 3: elements 12..=15 come from `b`.
        _ => simd_shuffle16!(base, patch, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
    };
    transmute(merged)
}

/// Builds a temporary copy of `a` with the 128-bit group (four packed 32-bit integers) chosen
/// by `IMM8` replaced by `b`, then applies writemask `k`: an element whose mask bit is clear is
/// taken from `src` instead of from the temporary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_inserti32x4&expand=3175)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vinserti32x4, IMM8 = 2))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_inserti32x4<const IMM8: i32>(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
    b: __m128i,
) -> __m512i {
    static_assert_imm2!(IMM8);
    let inserted = _mm512_inserti32x4::<IMM8>(a, b).as_i32x16();
    let fallback = src.as_i32x16();
    transmute(simd_select_bitmask(k, inserted, fallback))
}

/// Builds a temporary copy of `a` with the 128-bit group (four packed 32-bit integers) chosen
/// by `IMM8` replaced by `b`, then applies zeromask `k`: an element whose mask bit is clear is
/// written as zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_inserti32x4&expand=3176)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vinserti32x4, IMM8 = 2))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_inserti32x4<const IMM8: i32>(
    k: __mmask16,
    a: __m512i,
    b: __m128i,
) -> __m512i {
    static_assert_imm2!(IMM8);
    let inserted = _mm512_inserti32x4::<IMM8>(a, b).as_i32x16();
    let zeroed = _mm512_setzero_si512().as_i32x16();
    transmute(simd_select_bitmask(k, inserted, zeroed))
}

/// Returns a copy of `a` in which one 128-bit half (four packed 32-bit integers) has been
/// replaced by `b`. The low bit of `IMM8` selects the half: 0 for the lower, 1 for the upper.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_inserti32x4&expand=3171)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vinsert, IMM8 = 1) //should be vinserti32x4
)]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_inserti32x4<const IMM8: i32>(a: __m256i, b: __m128i) -> __m256i {
    static_assert_imm1!(IMM8);
    let base = a.as_i32x8();
    // Once widened, the four elements of `b` are addressed by shuffle indices 8..=11.
    let patch = _mm256_castsi128_si256(b).as_i32x8();
    let merged: i32x8 = if IMM8 & 0b1 == 0 {
        // Lower half from `b`, upper half kept from `a`.
        simd_shuffle8!(base, patch, [8, 9, 10, 11, 4, 5, 6, 7])
    } else {
        // Lower half kept from `a`, upper half from `b`.
        simd_shuffle8!(base, patch, [0, 1, 2, 3, 8, 9, 10, 11])
    };
    transmute(merged)
}

/// Builds a temporary copy of `a` with the 128-bit half (four packed 32-bit integers) chosen
/// by `IMM8` replaced by `b`, then applies writemask `k`: an element whose mask bit is clear is
/// taken from `src` instead of from the temporary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_inserti32x4&expand=3172)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vinserti32x4, IMM8 = 1)
)]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_mask_inserti32x4<const IMM8: i32>(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    b: __m128i,
) -> __m256i {
    static_assert_imm1!(IMM8);
    let inserted = _mm256_inserti32x4::<IMM8>(a, b).as_i32x8();
    let fallback = src.as_i32x8();
    transmute(simd_select_bitmask(k, inserted, fallback))
}

/// Builds a temporary copy of `a` with the 128-bit half (four packed 32-bit integers) chosen
/// by `IMM8` replaced by `b`, then applies zeromask `k`: an element whose mask bit is clear is
/// written as zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_inserti32x4&expand=3173)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vinserti32x4, IMM8 = 1)
)]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_maskz_inserti32x4<const IMM8: i32>(
    k: __mmask8,
    a: __m256i,
    b: __m128i,
) -> __m256i {
    static_assert_imm1!(IMM8);
    let inserted = _mm256_inserti32x4::<IMM8>(a, b).as_i32x8();
    let zeroed = _mm256_setzero_si256().as_i32x8();
    transmute(simd_select_bitmask(k, inserted, zeroed))
}

/// Returns a copy of `a` in which one 256-bit half (four packed 64-bit integers) has been
/// replaced by `b`. The low bit of `IMM8` selects the half: 0 for the lower, 1 for the upper.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_inserti64x4&expand=3186)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vinsertf64x4, IMM8 = 1))] //should be vinserti64x4
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_inserti64x4<const IMM8: i32>(a: __m512i, b: __m256i) -> __m512i {
    static_assert_imm1!(IMM8);
    // Once widened, the four elements of `b` are addressed by shuffle indices 8..=11.
    let patch = _mm512_castsi256_si512(b);
    if IMM8 & 0b1 == 0 {
        // Lower half from `b`, upper half kept from `a`.
        simd_shuffle8!(a, patch, [8, 9, 10, 11, 4, 5, 6, 7])
    } else {
        // Lower half kept from `a`, upper half from `b`.
        simd_shuffle8!(a, patch, [0, 1, 2, 3, 8, 9, 10, 11])
    }
}

/// Builds a temporary copy of `a` with the 256-bit half (four packed 64-bit integers) chosen
/// by `IMM8` replaced by `b`, then applies writemask `k`: an element whose mask bit is clear is
/// taken from `src` instead of from the temporary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_inserti64x4&expand=3187)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vinserti64x4, IMM8 = 1))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_inserti64x4<const IMM8: i32>(
    src: __m512i,
    k: __mmask8,
    a: __m512i,
    b: __m256i,
) -> __m512i {
    static_assert_imm1!(IMM8);
    let inserted = _mm512_inserti64x4::<IMM8>(a, b).as_i64x8();
    let fallback = src.as_i64x8();
    transmute(simd_select_bitmask(k, inserted, fallback))
}

/// Builds a temporary copy of `a` with the 256-bit half (four packed 64-bit integers) chosen
/// by `IMM8` replaced by `b`, then applies zeromask `k`: an element whose mask bit is clear is
/// written as zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_inserti64x4&expand=3188)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vinserti64x4, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_inserti64x4<const IMM8: i32>(
    k: __mmask8,
    a: __m512i,
    b: __m256i,
) -> __m512i {
    static_assert_imm1!(IMM8);
    let inserted = _mm512_inserti64x4::<IMM8>(a, b).as_i64x8();
    let zeroed = _mm512_setzero_si512().as_i64x8();
    transmute(simd_select_bitmask(k, inserted, zeroed))
}

/// Returns a copy of `a` in which one 128-bit group of four packed single-precision (32-bit)
/// floating-point elements has been replaced by `b`. The two low bits of `IMM8` select which of
/// the four groups is overwritten.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_insertf32x4&expand=3155)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 2))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_insertf32x4<const IMM8: i32>(a: __m512, b: __m128) -> __m512 {
    static_assert_imm2!(IMM8);
    // Once widened, the four elements of `b` are addressed by shuffle indices 16..=19.
    let patch = _mm512_castps128_ps512(b);
    match IMM8 & 0b11 {
        // Group 0: elements 0..=3 come from `b`.
        0 => simd_shuffle16!(
            a,
            patch,
            [16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
        ),
        // Group 1: elements 4..=7 come from `b`.
        1 => simd_shuffle16!(
            a,
            patch,
            [0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 12, 13, 14, 15],
        ),
        // Group 2: elements 8..=11 come from `b`.
        2 => simd_shuffle16!(
            a,
            patch,
            [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15],
        ),
        // Group 3: elements 12..=15 come from `b`.
        _ => simd_shuffle16!(a, patch, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19]),
    }
}

/// Builds a temporary copy of `a` with the 128-bit group (four packed single-precision (32-bit)
/// floating-point elements) chosen by `IMM8` replaced by `b`, then applies writemask `k`: an
/// element whose mask bit is clear is taken from `src` instead of from the temporary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_insertf32x4&expand=3156)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 2))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_insertf32x4<const IMM8: i32>(
    src: __m512,
    k: __mmask16,
    a: __m512,
    b: __m128,
) -> __m512 {
    static_assert_imm2!(IMM8);
    let inserted = _mm512_insertf32x4::<IMM8>(a, b).as_f32x16();
    let fallback = src.as_f32x16();
    transmute(simd_select_bitmask(k, inserted, fallback))
}

/// Builds a temporary copy of `a` with the 128-bit group (four packed single-precision (32-bit)
/// floating-point elements) chosen by `IMM8` replaced by `b`, then applies zeromask `k`: an
/// element whose mask bit is clear is written as zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_insertf32x4&expand=3157)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vinsertf32x4, IMM8 = 2))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_insertf32x4<const IMM8: i32>(
    k: __mmask16,
    a: __m512,
    b: __m128,
) -> __m512 {
    static_assert_imm2!(IMM8);
    let inserted = _mm512_insertf32x4::<IMM8>(a, b).as_f32x16();
    let zeroed = _mm512_setzero_ps().as_f32x16();
    transmute(simd_select_bitmask(k, inserted, zeroed))
}

/// Returns a copy of `a` in which one 128-bit half (four packed single-precision (32-bit)
/// floating-point elements) has been replaced by `b`. The low bit of `IMM8` selects the half:
/// 0 for the lower, 1 for the upper.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_insertf32x4&expand=3152)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vinsert, IMM8 = 1) //should be vinsertf32x4
)]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_insertf32x4<const IMM8: i32>(a: __m256, b: __m128) -> __m256 {
    static_assert_imm1!(IMM8);
    // Once widened, the four elements of `b` are addressed by shuffle indices 8..=11.
    let patch = _mm256_castps128_ps256(b);
    if IMM8 & 0b1 == 0 {
        // Lower half from `b`, upper half kept from `a`.
        simd_shuffle8!(a, patch, [8, 9, 10, 11, 4, 5, 6, 7])
    } else {
        // Lower half kept from `a`, upper half from `b`.
        simd_shuffle8!(a, patch, [0, 1, 2, 3, 8, 9, 10, 11])
    }
}

/// Builds a temporary copy of `a` with the 128-bit half (four packed single-precision (32-bit)
/// floating-point elements) chosen by `IMM8` replaced by `b`, then applies writemask `k`: an
/// element whose mask bit is clear is taken from `src` instead of from the temporary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_insertf32x4&expand=3153)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vinsertf32x4, IMM8 = 1)
)]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_mask_insertf32x4<const IMM8: i32>(
    src: __m256,
    k: __mmask8,
    a: __m256,
    b: __m128,
) -> __m256 {
    static_assert_imm1!(IMM8);
    let inserted = _mm256_insertf32x4::<IMM8>(a, b).as_f32x8();
    let fallback = src.as_f32x8();
    transmute(simd_select_bitmask(k, inserted, fallback))
}

/// Builds a temporary copy of `a` with the 128-bit half (four packed single-precision (32-bit)
/// floating-point elements) chosen by `IMM8` replaced by `b`, then applies zeromask `k`: an
/// element whose mask bit is clear is written as zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_insertf32x4&expand=3154)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(
    all(test, not(target_os = "windows")),
    assert_instr(vinsertf32x4, IMM8 = 1)
)]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_maskz_insertf32x4<const IMM8: i32>(
    k: __mmask8,
    a: __m256,
    b: __m128,
) -> __m256 {
    static_assert_imm1!(IMM8);
    let inserted = _mm256_insertf32x4::<IMM8>(a, b).as_f32x8();
    let zeroed = _mm256_setzero_ps().as_f32x8();
    transmute(simd_select_bitmask(k, inserted, zeroed))
}

/// Returns a copy of `a` in which one 256-bit half (four packed double-precision (64-bit)
/// floating-point elements) has been replaced by `b`. The low bit of `IMM8` selects the half:
/// 0 for the lower, 1 for the upper.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_insertf64x4&expand=3167)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vinsertf64x4, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_insertf64x4<const IMM8: i32>(a: __m512d, b: __m256d) -> __m512d {
    static_assert_imm1!(IMM8);
    // Once widened, the four elements of `b` are addressed by shuffle indices 8..=11.
    let patch = _mm512_castpd256_pd512(b);
    if IMM8 & 0b1 == 0 {
        // Lower half from `b`, upper half kept from `a`.
        simd_shuffle8!(a, patch, [8, 9, 10, 11, 4, 5, 6, 7])
    } else {
        // Lower half kept from `a`, upper half from `b`.
        simd_shuffle8!(a, patch, [0, 1, 2, 3, 8, 9, 10, 11])
    }
}

/// Builds a temporary copy of `a` with the 256-bit half (four packed double-precision (64-bit)
/// floating-point elements) chosen by `IMM8` replaced by `b`, then applies writemask `k`: an
/// element whose mask bit is clear is taken from `src` instead of from the temporary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_insertf64x4&expand=3168)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vinsertf64x4, IMM8 = 1))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_insertf64x4<const IMM8: i32>(
    src: __m512d,
    k: __mmask8,
    a: __m512d,
    b: __m256d,
) -> __m512d {
    static_assert_imm1!(IMM8);
    let inserted = _mm512_insertf64x4::<IMM8>(a, b).as_f64x8();
    let fallback = src.as_f64x8();
    transmute(simd_select_bitmask(k, inserted, fallback))
}

/// Builds a temporary copy of `a` with the 256-bit half (four packed double-precision (64-bit)
/// floating-point elements) chosen by `IMM8` replaced by `b`, then applies zeromask `k`: an
/// element whose mask bit is clear is written as zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_insertf64x4&expand=3169)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vinsertf64x4, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_insertf64x4<const IMM8: i32>(
    k: __mmask8,
    a: __m512d,
    b: __m256d,
) -> __m512d {
    static_assert_imm1!(IMM8);
    let inserted = _mm512_insertf64x4::<IMM8>(a, b).as_f64x8();
    let zeroed = _mm512_setzero_pd().as_f64x8();
    transmute(simd_select_bitmask(k, inserted, zeroed))
}

/// Interleaves the 32-bit integers found in the high half of each 128-bit lane of `a` and `b`.
/// Within every lane the result is `a[2], b[2], a[3], b[3]` (indices relative to that lane).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_epi32&expand=6021)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vunpckhps))] //should be vpunpckhdq
pub unsafe fn _mm512_unpackhi_epi32(a: __m512i, b: __m512i) -> __m512i {
    let lhs = a.as_i32x16();
    let rhs = b.as_i32x16();
    // Shuffle indices 0..=15 address `lhs` and 16..=31 address `rhs`; one row per 128-bit lane.
    #[rustfmt::skip]
    let interleaved: i32x16 = simd_shuffle16!(
        lhs, rhs,
        [ 2, 18,  3, 19,
          6, 22,  7, 23,
         10, 26, 11, 27,
         14, 30, 15, 31],
    );
    transmute(interleaved)
}

/// Interleaves the 32-bit integers from the high half of each 128-bit lane of `a` and `b`, then
/// applies writemask `k`: a result element whose mask bit is clear is taken from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi32&expand=6019)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpunpckhdq))]
pub unsafe fn _mm512_mask_unpackhi_epi32(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    let interleaved = _mm512_unpackhi_epi32(a, b).as_i32x16();
    let fallback = src.as_i32x16();
    transmute(simd_select_bitmask(k, interleaved, fallback))
}

/// Interleaves the 32-bit integers from the high half of each 128-bit lane of `a` and `b`, then
/// applies zeromask `k`: a result element whose mask bit is clear is written as zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi32&expand=6020)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpunpckhdq))]
pub unsafe fn _mm512_maskz_unpackhi_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    let zeroed = _mm512_setzero_si512().as_i32x16();
    let interleaved = _mm512_unpackhi_epi32(a, b).as_i32x16();
    transmute(simd_select_bitmask(k, interleaved, zeroed))
}

/// Interleaves the 32-bit integers from the high half of each 128-bit lane of `a` and `b`, then
/// applies writemask `k`: a result element whose mask bit is clear is taken from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpackhi_epi32&expand=6016)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpunpckhdq))]
pub unsafe fn _mm256_mask_unpackhi_epi32(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    let interleaved = _mm256_unpackhi_epi32(a, b).as_i32x8();
    let fallback = src.as_i32x8();
    transmute(simd_select_bitmask(k, interleaved, fallback))
}

/// Interleaves the 32-bit integers from the high half of each 128-bit lane of `a` and `b`, then
/// applies zeromask `k`: a result element whose mask bit is clear is written as zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpackhi_epi32&expand=6017)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpunpckhdq))]
pub unsafe fn _mm256_maskz_unpackhi_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    let zeroed = _mm256_setzero_si256().as_i32x8();
    let interleaved = _mm256_unpackhi_epi32(a, b).as_i32x8();
    transmute(simd_select_bitmask(k, interleaved, zeroed))
}

/// Interleaves the 32-bit integers from the high half of `a` and `b`, then applies writemask
/// `k`: a result element whose mask bit is clear is taken from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpackhi_epi32&expand=6013)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpunpckhdq))]
pub unsafe fn _mm_mask_unpackhi_epi32(
    src: __m128i,
    k: __mmask8,
    a: __m128i,
    b: __m128i,
) -> __m128i {
    let interleaved = _mm_unpackhi_epi32(a, b).as_i32x4();
    let fallback = src.as_i32x4();
    transmute(simd_select_bitmask(k, interleaved, fallback))
}

/// Interleaves the 32-bit integers from the high half of `a` and `b`, then applies zeromask
/// `k`: a result element whose mask bit is clear is written as zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpackhi_epi32&expand=6014)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpunpckhdq))]
pub unsafe fn _mm_maskz_unpackhi_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_i32x4();
    let interleaved = _mm_unpackhi_epi32(a, b).as_i32x4();
    transmute(simd_select_bitmask(k, interleaved, zeroed))
}

/// Interleaves the 64-bit integers found in the high half of each 128-bit lane of `a` and `b`.
/// Within every lane the result is the upper element of `a` followed by the upper element of `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_epi64&expand=6030)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vunpckhpd))] //should be vpunpckhqdq
pub unsafe fn _mm512_unpackhi_epi64(a: __m512i, b: __m512i) -> __m512i {
    // Shuffle indices 0..=7 address `a` and 8..=15 address `b`; each adjacent pair of indices
    // covers one 128-bit lane.
    simd_shuffle8!(a, b, [1, 9, 3, 11, 5, 13, 7, 15])
}

/// Interleaves the 64-bit integers from the high half of each 128-bit lane of `a` and `b`, then
/// applies writemask `k`: a result element whose mask bit is clear is taken from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_epi64&expand=6028)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpunpckhqdq))]
pub unsafe fn _mm512_mask_unpackhi_epi64(
    src: __m512i,
    k: __mmask8,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    let interleaved = _mm512_unpackhi_epi64(a, b).as_i64x8();
    let fallback = src.as_i64x8();
    transmute(simd_select_bitmask(k, interleaved, fallback))
}

/// Interleaves the 64-bit integers from the high half of each 128-bit lane of `a` and `b`, then
/// applies zeromask `k`: a result element whose mask bit is clear is written as zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_epi64&expand=6029)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpunpckhqdq))]
pub unsafe fn _mm512_maskz_unpackhi_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    let zeroed = _mm512_setzero_si512().as_i64x8();
    let interleaved = _mm512_unpackhi_epi64(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, interleaved, zeroed))
}

/// Interleaves the 64-bit integers from the high half of each 128-bit lane of `a` and `b`, then
/// applies writemask `k`: a result element whose mask bit is clear is taken from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpackhi_epi64&expand=6025)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpunpckhqdq))]
pub unsafe fn _mm256_mask_unpackhi_epi64(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    let interleaved = _mm256_unpackhi_epi64(a, b).as_i64x4();
    let fallback = src.as_i64x4();
    transmute(simd_select_bitmask(k, interleaved, fallback))
}

/// Interleaves the 64-bit integers from the high half of each 128-bit lane of `a` and `b`, then
/// applies zeromask `k`: a result element whose mask bit is clear is written as zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpackhi_epi64&expand=6026)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpunpckhqdq))]
pub unsafe fn _mm256_maskz_unpackhi_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    let zeroed = _mm256_setzero_si256().as_i64x4();
    let interleaved = _mm256_unpackhi_epi64(a, b).as_i64x4();
    transmute(simd_select_bitmask(k, interleaved, zeroed))
}

/// Interleaves the 64-bit integers from the high half of `a` and `b`, then applies writemask
/// `k`: a result element whose mask bit is clear is taken from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpackhi_epi64&expand=6022)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpunpckhqdq))]
pub unsafe fn _mm_mask_unpackhi_epi64(
    src: __m128i,
    k: __mmask8,
    a: __m128i,
    b: __m128i,
) -> __m128i {
    let interleaved = _mm_unpackhi_epi64(a, b).as_i64x2();
    let fallback = src.as_i64x2();
    transmute(simd_select_bitmask(k, interleaved, fallback))
}

/// Interleaves the 64-bit integers from the high half of `a` and `b`, then applies zeromask
/// `k`: a result element whose mask bit is clear is written as zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpackhi_epi64&expand=6023)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpunpckhqdq))]
pub unsafe fn _mm_maskz_unpackhi_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let zeroed = _mm_setzero_si128().as_i64x2();
    let interleaved = _mm_unpackhi_epi64(a, b).as_i64x2();
    transmute(simd_select_bitmask(k, interleaved, zeroed))
}

/// Interleaves the single-precision (32-bit) floating-point elements found in the high half of
/// each 128-bit lane of `a` and `b`. Within every lane the result is `a[2], b[2], a[3], b[3]`
/// (indices relative to that lane).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_ps&expand=6060)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vunpckhps))]
pub unsafe fn _mm512_unpackhi_ps(a: __m512, b: __m512) -> __m512 {
    // Shuffle indices 0..=15 address `a` and 16..=31 address `b`; one row per 128-bit lane.
    #[rustfmt::skip]
    simd_shuffle16!(
        a, b,
        [ 2, 18,  3, 19,
          6, 22,  7, 23,
         10, 26, 11, 27,
         14, 30, 15, 31],
    )
}

/// Interleaves the single-precision (32-bit) floating-point elements from the high half of each
/// 128-bit lane of `a` and `b`, then applies writemask `k`: a result element whose mask bit is
/// clear is taken from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_ps&expand=6058)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vunpckhps))]
pub unsafe fn _mm512_mask_unpackhi_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let interleaved = _mm512_unpackhi_ps(a, b).as_f32x16();
    let fallback = src.as_f32x16();
    transmute(simd_select_bitmask(k, interleaved, fallback))
}

/// Interleaves the single-precision (32-bit) floating-point elements from the high half of each
/// 128-bit lane of `a` and `b`, then applies zeromask `k`: a result element whose mask bit is
/// clear is written as zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_ps&expand=6059)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vunpckhps))]
pub unsafe fn _mm512_maskz_unpackhi_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let zeroed = _mm512_setzero_ps().as_f32x16();
    let interleaved = _mm512_unpackhi_ps(a, b).as_f32x16();
    transmute(simd_select_bitmask(k, interleaved, zeroed))
}

/// Interleaves the single-precision (32-bit) floating-point elements from the high half of each
/// 128-bit lane of `a` and `b`, then applies writemask `k`: a result element whose mask bit is
/// clear is taken from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpackhi_ps&expand=6055)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vunpckhps))]
pub unsafe fn _mm256_mask_unpackhi_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 {
    let interleaved = _mm256_unpackhi_ps(a, b).as_f32x8();
    let fallback = src.as_f32x8();
    transmute(simd_select_bitmask(k, interleaved, fallback))
}

/// Interleaves the single-precision (32-bit) floating-point elements from the high half of each
/// 128-bit lane of `a` and `b`, then applies zeromask `k`: a result element whose mask bit is
/// clear is written as zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpackhi_ps&expand=6056)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vunpckhps))]
pub unsafe fn _mm256_maskz_unpackhi_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
    let zeroed = _mm256_setzero_ps().as_f32x8();
    let interleaved = _mm256_unpackhi_ps(a, b).as_f32x8();
    transmute(simd_select_bitmask(k, interleaved, zeroed))
}

/// Interleaves the single-precision (32-bit) floating-point elements from the high half of `a`
/// and `b`, then applies writemask `k`: a result element whose mask bit is clear is taken from
/// `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpackhi_ps&expand=6052)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vunpckhps))]
pub unsafe fn _mm_mask_unpackhi_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let interleaved = _mm_unpackhi_ps(a, b).as_f32x4();
    let fallback = src.as_f32x4();
    transmute(simd_select_bitmask(k, interleaved, fallback))
}

/// Interleaves the single-precision (32-bit) floating-point elements from the high half of
/// `a` and `b`, zeroing every element whose bit in zeromask `k` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpackhi_ps&expand=6053)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vunpckhps))]
pub unsafe fn _mm_maskz_unpackhi_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let all_zero = _mm_setzero_ps().as_f32x4();
    let interleaved = _mm_unpackhi_ps(a, b).as_f32x4();
    let masked = simd_select_bitmask(k, interleaved, all_zero);
    transmute(masked)
}

/// Interleaves the double-precision (64-bit) floating-point elements from the high half of
/// each 128-bit lane of `a` and `b` and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpackhi_pd&expand=6048)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vunpckhpd))]
pub unsafe fn _mm512_unpackhi_pd(a: __m512d, b: __m512d) -> __m512d {
    // Indices 0..=7 address `a`, 8..=15 address `b`. Each pair below is the upper
    // element of one 128-bit lane of `a` followed by the matching element of `b`.
    simd_shuffle8!(a, b, [1, 9, 3, 11, 5, 13, 7, 15])
}

/// Interleaves the double-precision (64-bit) floating-point elements from the high half of
/// each 128-bit lane of `a` and `b`, then blends the result with `src` under writemask `k`
/// (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpackhi_pd&expand=6046)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vunpckhpd))]
pub unsafe fn _mm512_mask_unpackhi_pd(
    src: __m512d,
    k: __mmask8,
    a: __m512d,
    b: __m512d,
) -> __m512d {
    let fallback = src.as_f64x8();
    let interleaved = _mm512_unpackhi_pd(a, b).as_f64x8();
    transmute(simd_select_bitmask(k, interleaved, fallback))
}

/// Interleaves the double-precision (64-bit) floating-point elements from the high half of
/// each 128-bit lane of `a` and `b`, zeroing every element whose bit in zeromask `k` is
/// not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpackhi_pd&expand=6047)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vunpckhpd))]
pub unsafe fn _mm512_maskz_unpackhi_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let all_zero = _mm512_setzero_pd().as_f64x8();
    let interleaved = _mm512_unpackhi_pd(a, b).as_f64x8();
    let masked = simd_select_bitmask(k, interleaved, all_zero);
    transmute(masked)
}

/// Interleaves the double-precision (64-bit) floating-point elements from the high half of
/// each 128-bit lane of `a` and `b`, then blends the result with `src` under writemask `k`
/// (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpackhi_pd&expand=6043)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vunpckhpd))]
pub unsafe fn _mm256_mask_unpackhi_pd(
    src: __m256d,
    k: __mmask8,
    a: __m256d,
    b: __m256d,
) -> __m256d {
    let fallback = src.as_f64x4();
    let interleaved = _mm256_unpackhi_pd(a, b).as_f64x4();
    transmute(simd_select_bitmask(k, interleaved, fallback))
}

/// Interleaves the double-precision (64-bit) floating-point elements from the high half of
/// each 128-bit lane of `a` and `b`, zeroing every element whose bit in zeromask `k` is
/// not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpackhi_pd&expand=6044)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vunpckhpd))]
pub unsafe fn _mm256_maskz_unpackhi_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
    let all_zero = _mm256_setzero_pd().as_f64x4();
    let interleaved = _mm256_unpackhi_pd(a, b).as_f64x4();
    let masked = simd_select_bitmask(k, interleaved, all_zero);
    transmute(masked)
}

/// Interleaves the double-precision (64-bit) floating-point elements from the high half of
/// `a` and `b`, then blends the result with `src` under writemask `k` (an element is
/// copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpackhi_pd&expand=6040)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vunpckhpd))]
pub unsafe fn _mm_mask_unpackhi_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let fallback = src.as_f64x2();
    let interleaved = _mm_unpackhi_pd(a, b).as_f64x2();
    transmute(simd_select_bitmask(k, interleaved, fallback))
}

/// Interleaves the double-precision (64-bit) floating-point elements from the high half of
/// `a` and `b`, zeroing every element whose bit in zeromask `k` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpackhi_pd&expand=6041)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vunpckhpd))]
pub unsafe fn _mm_maskz_unpackhi_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let all_zero = _mm_setzero_pd().as_f64x2();
    let interleaved = _mm_unpackhi_pd(a, b).as_f64x2();
    let masked = simd_select_bitmask(k, interleaved, all_zero);
    transmute(masked)
}

/// Interleaves the 32-bit integers from the low half of each 128-bit lane of `a` and `b`
/// and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_epi32&expand=6078)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vunpcklps))] // should be vpunpckldq
pub unsafe fn _mm512_unpacklo_epi32(a: __m512i, b: __m512i) -> __m512i {
    let lhs = a.as_i32x16();
    let rhs = b.as_i32x16();
    // Indices 0..=15 address `lhs`, 16..=31 address `rhs`. One row per 128-bit lane:
    // the two low elements of that lane, alternating between the two inputs.
    #[rustfmt::skip]
    let interleaved: i32x16 = simd_shuffle16!(
        lhs, rhs,
        [  0, 16,  1, 17,
           4, 20,  5, 21,
           8, 24,  9, 25,
          12, 28, 13, 29],
    );
    transmute(interleaved)
}

/// Interleaves the 32-bit integers from the low half of each 128-bit lane of `a` and `b`,
/// then blends the result with `src` under writemask `k` (an element is copied from `src`
/// when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_epi32&expand=6076)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpunpckldq))]
pub unsafe fn _mm512_mask_unpacklo_epi32(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    let fallback = src.as_i32x16();
    let interleaved = _mm512_unpacklo_epi32(a, b).as_i32x16();
    transmute(simd_select_bitmask(k, interleaved, fallback))
}

/// Interleaves the 32-bit integers from the low half of each 128-bit lane of `a` and `b`,
/// zeroing every element whose bit in zeromask `k` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_epi32&expand=6077)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpunpckldq))]
pub unsafe fn _mm512_maskz_unpacklo_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    let all_zero = _mm512_setzero_si512().as_i32x16();
    let interleaved = _mm512_unpacklo_epi32(a, b).as_i32x16();
    let masked = simd_select_bitmask(k, interleaved, all_zero);
    transmute(masked)
}

/// Interleaves the 32-bit integers from the low half of each 128-bit lane of `a` and `b`,
/// then blends the result with `src` under writemask `k` (an element is copied from `src`
/// when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpacklo_epi32&expand=6073)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpunpckldq))]
pub unsafe fn _mm256_mask_unpacklo_epi32(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    let fallback = src.as_i32x8();
    let interleaved = _mm256_unpacklo_epi32(a, b).as_i32x8();
    transmute(simd_select_bitmask(k, interleaved, fallback))
}

/// Interleaves the 32-bit integers from the low half of each 128-bit lane of `a` and `b`,
/// zeroing every element whose bit in zeromask `k` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpacklo_epi32&expand=6074)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpunpckldq))]
pub unsafe fn _mm256_maskz_unpacklo_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    let all_zero = _mm256_setzero_si256().as_i32x8();
    let interleaved = _mm256_unpacklo_epi32(a, b).as_i32x8();
    let masked = simd_select_bitmask(k, interleaved, all_zero);
    transmute(masked)
}

/// Interleaves the 32-bit integers from the low half of `a` and `b`, then blends the
/// result with `src` under writemask `k` (an element is copied from `src` when its mask
/// bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpacklo_epi32&expand=6070)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpunpckldq))]
pub unsafe fn _mm_mask_unpacklo_epi32(
    src: __m128i,
    k: __mmask8,
    a: __m128i,
    b: __m128i,
) -> __m128i {
    let fallback = src.as_i32x4();
    let interleaved = _mm_unpacklo_epi32(a, b).as_i32x4();
    transmute(simd_select_bitmask(k, interleaved, fallback))
}

/// Interleaves the 32-bit integers from the low half of `a` and `b`, zeroing every
/// element whose bit in zeromask `k` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpacklo_epi32&expand=6071)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpunpckldq))]
pub unsafe fn _mm_maskz_unpacklo_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let all_zero = _mm_setzero_si128().as_i32x4();
    let interleaved = _mm_unpacklo_epi32(a, b).as_i32x4();
    let masked = simd_select_bitmask(k, interleaved, all_zero);
    transmute(masked)
}

/// Interleaves the 64-bit integers from the low half of each 128-bit lane of `a` and `b`
/// and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_epi64&expand=6087)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vunpcklpd))] // should be vpunpcklqdq
pub unsafe fn _mm512_unpacklo_epi64(a: __m512i, b: __m512i) -> __m512i {
    // Indices 0..=7 address `a`, 8..=15 address `b`. Each pair below is the lower
    // element of one 128-bit lane of `a` followed by the matching element of `b`.
    simd_shuffle8!(a, b, [0, 8, 2, 10, 4, 12, 6, 14])
}

/// Interleaves the 64-bit integers from the low half of each 128-bit lane of `a` and `b`,
/// then blends the result with `src` under writemask `k` (an element is copied from `src`
/// when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_epi64&expand=6085)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpunpcklqdq))]
pub unsafe fn _mm512_mask_unpacklo_epi64(
    src: __m512i,
    k: __mmask8,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    let fallback = src.as_i64x8();
    let interleaved = _mm512_unpacklo_epi64(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, interleaved, fallback))
}

/// Interleaves the 64-bit integers from the low half of each 128-bit lane of `a` and `b`,
/// zeroing every element whose bit in zeromask `k` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_epi64&expand=6086)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpunpcklqdq))]
pub unsafe fn _mm512_maskz_unpacklo_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    let all_zero = _mm512_setzero_si512().as_i64x8();
    let interleaved = _mm512_unpacklo_epi64(a, b).as_i64x8();
    let masked = simd_select_bitmask(k, interleaved, all_zero);
    transmute(masked)
}

/// Interleaves the 64-bit integers from the low half of each 128-bit lane of `a` and `b`,
/// then blends the result with `src` under writemask `k` (an element is copied from `src`
/// when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpacklo_epi64&expand=6082)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpunpcklqdq))]
pub unsafe fn _mm256_mask_unpacklo_epi64(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    let fallback = src.as_i64x4();
    let interleaved = _mm256_unpacklo_epi64(a, b).as_i64x4();
    transmute(simd_select_bitmask(k, interleaved, fallback))
}

/// Interleaves the 64-bit integers from the low half of each 128-bit lane of `a` and `b`,
/// zeroing every element whose bit in zeromask `k` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpacklo_epi64&expand=6083)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpunpcklqdq))]
pub unsafe fn _mm256_maskz_unpacklo_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    let all_zero = _mm256_setzero_si256().as_i64x4();
    let interleaved = _mm256_unpacklo_epi64(a, b).as_i64x4();
    let masked = simd_select_bitmask(k, interleaved, all_zero);
    transmute(masked)
}

/// Interleaves the 64-bit integers from the low half of `a` and `b`, then blends the
/// result with `src` under writemask `k` (an element is copied from `src` when its mask
/// bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpacklo_epi64&expand=6079)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpunpcklqdq))]
pub unsafe fn _mm_mask_unpacklo_epi64(
    src: __m128i,
    k: __mmask8,
    a: __m128i,
    b: __m128i,
) -> __m128i {
    let fallback = src.as_i64x2();
    let interleaved = _mm_unpacklo_epi64(a, b).as_i64x2();
    transmute(simd_select_bitmask(k, interleaved, fallback))
}

/// Interleaves the 64-bit integers from the low half of `a` and `b`, zeroing every
/// element whose bit in zeromask `k` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpacklo_epi64&expand=6080)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpunpcklqdq))]
pub unsafe fn _mm_maskz_unpacklo_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let all_zero = _mm_setzero_si128().as_i64x2();
    let interleaved = _mm_unpacklo_epi64(a, b).as_i64x2();
    let masked = simd_select_bitmask(k, interleaved, all_zero);
    transmute(masked)
}

/// Interleaves the single-precision (32-bit) floating-point elements from the low half of
/// each 128-bit lane of `a` and `b` and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_ps&expand=6117)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vunpcklps))]
pub unsafe fn _mm512_unpacklo_ps(a: __m512, b: __m512) -> __m512 {
    // Indices 0..=15 address `a`, 16..=31 address `b`. One row per 128-bit lane:
    // the two low elements of that lane, alternating between the two inputs.
    #[rustfmt::skip]
    simd_shuffle16!(
        a, b,
        [  0, 16,  1, 17,
           4, 20,  5, 21,
           8, 24,  9, 25,
          12, 28, 13, 29],
    )
}

/// Interleaves the single-precision (32-bit) floating-point elements from the low half of
/// each 128-bit lane of `a` and `b`, then blends the result with `src` under writemask `k`
/// (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_ps&expand=6115)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vunpcklps))]
pub unsafe fn _mm512_mask_unpacklo_ps(src: __m512, k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let fallback = src.as_f32x16();
    let interleaved = _mm512_unpacklo_ps(a, b).as_f32x16();
    transmute(simd_select_bitmask(k, interleaved, fallback))
}

/// Interleaves the single-precision (32-bit) floating-point elements from the low half of
/// each 128-bit lane of `a` and `b`, zeroing every element whose bit in zeromask `k` is
/// not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_ps&expand=6116)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vunpcklps))]
pub unsafe fn _mm512_maskz_unpacklo_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let all_zero = _mm512_setzero_ps().as_f32x16();
    let interleaved = _mm512_unpacklo_ps(a, b).as_f32x16();
    let masked = simd_select_bitmask(k, interleaved, all_zero);
    transmute(masked)
}

/// Interleaves the single-precision (32-bit) floating-point elements from the low half of
/// each 128-bit lane of `a` and `b`, then blends the result with `src` under writemask `k`
/// (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpacklo_ps&expand=6112)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vunpcklps))]
pub unsafe fn _mm256_mask_unpacklo_ps(src: __m256, k: __mmask8, a: __m256, b: __m256) -> __m256 {
    let fallback = src.as_f32x8();
    let interleaved = _mm256_unpacklo_ps(a, b).as_f32x8();
    transmute(simd_select_bitmask(k, interleaved, fallback))
}

/// Interleaves the single-precision (32-bit) floating-point elements from the low half of
/// each 128-bit lane of `a` and `b`, zeroing every element whose bit in zeromask `k` is
/// not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpacklo_ps&expand=6113)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vunpcklps))]
pub unsafe fn _mm256_maskz_unpacklo_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
    let all_zero = _mm256_setzero_ps().as_f32x8();
    let interleaved = _mm256_unpacklo_ps(a, b).as_f32x8();
    let masked = simd_select_bitmask(k, interleaved, all_zero);
    transmute(masked)
}

/// Interleaves the single-precision (32-bit) floating-point elements from the low half of
/// `a` and `b`, then blends the result with `src` under writemask `k` (an element is
/// copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpacklo_ps&expand=6109)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vunpcklps))]
pub unsafe fn _mm_mask_unpacklo_ps(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let fallback = src.as_f32x4();
    let interleaved = _mm_unpacklo_ps(a, b).as_f32x4();
    transmute(simd_select_bitmask(k, interleaved, fallback))
}

/// Interleaves the single-precision (32-bit) floating-point elements from the low half of
/// `a` and `b`, zeroing every element whose bit in zeromask `k` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpacklo_ps&expand=6110)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vunpcklps))]
pub unsafe fn _mm_maskz_unpacklo_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let all_zero = _mm_setzero_ps().as_f32x4();
    let interleaved = _mm_unpacklo_ps(a, b).as_f32x4();
    let masked = simd_select_bitmask(k, interleaved, all_zero);
    transmute(masked)
}

/// Interleaves the double-precision (64-bit) floating-point elements from the low half of
/// each 128-bit lane of `a` and `b` and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_unpacklo_pd&expand=6105)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vunpcklpd))]
pub unsafe fn _mm512_unpacklo_pd(a: __m512d, b: __m512d) -> __m512d {
    // Indices 0..=7 address `a`, 8..=15 address `b`. Each pair below is the lower
    // element of one 128-bit lane of `a` followed by the matching element of `b`.
    simd_shuffle8!(a, b, [0, 8, 2, 10, 4, 12, 6, 14])
}

/// Interleaves the double-precision (64-bit) floating-point elements from the low half of
/// each 128-bit lane of `a` and `b`, then blends the result with `src` under writemask `k`
/// (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_unpacklo_pd&expand=6103)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vunpcklpd))]
pub unsafe fn _mm512_mask_unpacklo_pd(
    src: __m512d,
    k: __mmask8,
    a: __m512d,
    b: __m512d,
) -> __m512d {
    let fallback = src.as_f64x8();
    let interleaved = _mm512_unpacklo_pd(a, b).as_f64x8();
    transmute(simd_select_bitmask(k, interleaved, fallback))
}

/// Interleaves the double-precision (64-bit) floating-point elements from the low half of
/// each 128-bit lane of `a` and `b`, zeroing every element whose bit in zeromask `k` is
/// not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_unpacklo_pd&expand=6104)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vunpcklpd))]
pub unsafe fn _mm512_maskz_unpacklo_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let all_zero = _mm512_setzero_pd().as_f64x8();
    let interleaved = _mm512_unpacklo_pd(a, b).as_f64x8();
    let masked = simd_select_bitmask(k, interleaved, all_zero);
    transmute(masked)
}

/// Interleaves the double-precision (64-bit) floating-point elements from the low half of
/// each 128-bit lane of `a` and `b`, then blends the result with `src` under writemask `k`
/// (an element is copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_unpacklo_pd&expand=6100)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vunpcklpd))]
pub unsafe fn _mm256_mask_unpacklo_pd(
    src: __m256d,
    k: __mmask8,
    a: __m256d,
    b: __m256d,
) -> __m256d {
    let fallback = src.as_f64x4();
    let interleaved = _mm256_unpacklo_pd(a, b).as_f64x4();
    transmute(simd_select_bitmask(k, interleaved, fallback))
}

/// Interleaves the double-precision (64-bit) floating-point elements from the low half of
/// each 128-bit lane of `a` and `b`, zeroing every element whose bit in zeromask `k` is
/// not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_unpacklo_pd&expand=6101)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vunpcklpd))]
pub unsafe fn _mm256_maskz_unpacklo_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
    let all_zero = _mm256_setzero_pd().as_f64x4();
    let interleaved = _mm256_unpacklo_pd(a, b).as_f64x4();
    let masked = simd_select_bitmask(k, interleaved, all_zero);
    transmute(masked)
}

/// Interleaves the double-precision (64-bit) floating-point elements from the low half of
/// `a` and `b`, then blends the result with `src` under writemask `k` (an element is
/// copied from `src` when its mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_unpacklo_pd&expand=6097)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vunpcklpd))]
pub unsafe fn _mm_mask_unpacklo_pd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let fallback = src.as_f64x2();
    let interleaved = _mm_unpacklo_pd(a, b).as_f64x2();
    transmute(simd_select_bitmask(k, interleaved, fallback))
}

/// Interleaves the double-precision (64-bit) floating-point elements from the low half of
/// `a` and `b`, zeroing every element whose bit in zeromask `k` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_unpacklo_pd&expand=6098)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vunpcklpd))]
pub unsafe fn _mm_maskz_unpacklo_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let all_zero = _mm_setzero_pd().as_f64x2();
    let interleaved = _mm_unpacklo_pd(a, b).as_f64x2();
    let masked = simd_select_bitmask(k, interleaved, all_zero);
    transmute(masked)
}

/// Cast vector of type __m128 to type __m512; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps128_ps512&expand=621)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castps128_ps512(a: __m128) -> __m512 {
    simd_shuffle16!(
        a,
        _mm_set1_ps(-1.),
        [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4],
    )
}

/// Cast vector of type __m256 to type __m512; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps256_ps512&expand=623)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castps256_ps512(a: __m256) -> __m512 {
    simd_shuffle16!(
        a,
        _mm256_set1_ps(-1.),
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
    )
}

/// Cast vector of type __m128 to type __m512; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_zextps128_ps512&expand=6196)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_zextps128_ps512(a: __m128) -> __m512 {
    simd_shuffle16!(
        a,
        _mm_set1_ps(0.),
        [0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4],
    )
}

/// Cast vector of type __m256 to type __m512; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_zextps256_ps512&expand=6197)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_zextps256_ps512(a: __m256) -> __m512 {
    simd_shuffle16!(
        a,
        _mm256_set1_ps(0.),
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8],
    )
}

/// Casts a vector of type `__m512` to type `__m128`, keeping the four lowest
/// single-precision elements of `a`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps512_ps128&expand=624)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castps512_ps128(a: __m512) -> __m128 {
    // `a` is passed as both shuffle operands; only indices 0..=3 are requested.
    simd_shuffle4!(a, a, [0, 1, 2, 3])
}

/// Casts a vector of type `__m512` to type `__m256`, keeping the eight lowest
/// single-precision elements of `a`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps512_ps256&expand=625)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castps512_ps256(a: __m512) -> __m256 {
    // `a` is passed as both shuffle operands; only indices 0..=7 are requested.
    simd_shuffle8!(a, a, [0, 1, 2, 3, 4, 5, 6, 7])
}

/// Reinterprets the bits of a `__m512` as a `__m512d`. This intrinsic is only used for
/// compilation and does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps_pd&expand=616)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castps_pd(a: __m512) -> __m512d {
    let bits = a.as_m512();
    transmute(bits)
}

/// Reinterprets the bits of a `__m512` as a `__m512i`. This intrinsic is only used for
/// compilation and does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castps_si512&expand=619)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castps_si512(a: __m512) -> __m512i {
    let bits = a.as_m512();
    transmute(bits)
}

/// Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castpd128_pd512&expand=609)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castpd128_pd512(a: __m128d) -> __m512d {
    simd_shuffle8!(a, _mm_set1_pd(-1.), [0, 1, 2, 2, 2, 2, 2, 2])
}

/// Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castpd256_pd512&expand=611)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castpd256_pd512(a: __m256d) -> __m512d {
    simd_shuffle8!(a, _mm256_set1_pd(-1.), [0, 1, 2, 3, 4, 4, 4, 4])
}

/// Cast vector of type __m128d to type __m512d; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_zextpd128_pd512&expand=6193)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_zextpd128_pd512(a: __m128d) -> __m512d {
    simd_shuffle8!(a, _mm_set1_pd(0.), [0, 1, 2, 2, 2, 2, 2, 2])
}

/// Cast vector of type __m256d to type __m512d; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_zextpd256_pd512&expand=6194)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_zextpd256_pd512(a: __m256d) -> __m512d {
    simd_shuffle8!(a, _mm256_set1_pd(0.), [0, 1, 2, 3, 4, 4, 4, 4])
}

/// Casts a vector of type `__m512d` to type `__m128d`, keeping the two lowest
/// double-precision elements of `a`. This intrinsic is only used for compilation and
/// does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castpd512_pd128&expand=612)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castpd512_pd128(a: __m512d) -> __m128d {
    // `a` is passed as both shuffle operands; only indices 0..=1 are requested.
    simd_shuffle2!(a, a, [0, 1])
}

/// Narrows a `__m512d` to a `__m256d` by keeping only the four lowest lanes of `a`. Intel
/// documents this as a cast that is only used for compilation, generates no instructions, and
/// thus has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castpd512_pd256&expand=613)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castpd512_pd256(a: __m512d) -> __m256d {
    // Both shuffle operands are `a`; indices 0..=3 select its four lowest lanes.
    simd_shuffle4!(a, a, [0, 1, 2, 3])
}

/// Reinterprets the 512 bits of a `__m512d` as a `__m512`; no lane values are converted. Intel
/// documents this as a cast that is only used for compilation, generates no instructions, and
/// thus has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castpd_ps&expand=604)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castpd_ps(a: __m512d) -> __m512 {
    let bits = a.as_m512d();
    transmute(bits)
}

/// Reinterprets the 512 bits of a `__m512d` as a `__m512i`; no lane values are converted. Intel
/// documents this as a cast that is only used for compilation, generates no instructions, and
/// thus has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castpd_si512&expand=607)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castpd_si512(a: __m512d) -> __m512i {
    let bits = a.as_m512d();
    transmute(bits)
}

/// Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi128_si512&expand=629)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castsi128_si512(a: __m128i) -> __m512i {
    simd_shuffle8!(a, _mm_set1_epi64x(-1), [0, 1, 2, 2, 2, 2, 2, 2])
}

/// Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are undefined. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi256_si512&expand=633)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castsi256_si512(a: __m256i) -> __m512i {
    simd_shuffle8!(a, _mm256_set1_epi64x(-1), [0, 1, 2, 3, 4, 4, 4, 4])
}

/// Cast vector of type __m128i to type __m512i; the upper 384 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_zextsi128_si512&expand=6199)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_zextsi128_si512(a: __m128i) -> __m512i {
    simd_shuffle8!(a, _mm_set1_epi64x(0), [0, 1, 2, 2, 2, 2, 2, 2])
}

/// Cast vector of type __m256i to type __m512i; the upper 256 bits of the result are zeroed. This intrinsic is only used for compilation and does not generate any instructions, thus it has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_zextsi256_si512&expand=6200)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_zextsi256_si512(a: __m256i) -> __m512i {
    simd_shuffle8!(a, _mm256_set1_epi64x(0), [0, 1, 2, 3, 4, 4, 4, 4])
}

/// Narrows a `__m512i` to a `__m128i` by keeping only the two lowest lanes (the low 128 bits) of
/// `a`. Intel documents this as a cast that is only used for compilation, generates no
/// instructions, and thus has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi512_si128&expand=636)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castsi512_si128(a: __m512i) -> __m128i {
    // Both shuffle operands are `a`; indices 0 and 1 select its two lowest lanes.
    simd_shuffle2!(a, a, [0, 1])
}

/// Narrows a `__m512i` to a `__m256i` by keeping only the four lowest lanes (the low 256 bits) of
/// `a`. Intel documents this as a cast that is only used for compilation, generates no
/// instructions, and thus has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi512_si256&expand=637)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castsi512_si256(a: __m512i) -> __m256i {
    // Both shuffle operands are `a`; indices 0..=3 select its four lowest lanes.
    simd_shuffle4!(a, a, [0, 1, 2, 3])
}

/// Reinterprets the 512 bits of a `__m512i` as a `__m512`; no lane values are converted. Intel
/// documents this as a cast that is only used for compilation, generates no instructions, and
/// thus has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi512_ps&expand=635)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castsi512_ps(a: __m512i) -> __m512 {
    // Source and destination types are spelled out so the reinterpretation is explicit.
    transmute::<__m512i, __m512>(a)
}

/// Reinterprets the 512 bits of a `__m512i` as a `__m512d`; no lane values are converted. Intel
/// documents this as a cast that is only used for compilation, generates no instructions, and
/// thus has zero latency.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_castsi512_pd&expand=634)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_castsi512_pd(a: __m512i) -> __m512d {
    // Source and destination types are spelled out so the reinterpretation is explicit.
    transmute::<__m512i, __m512d>(a)
}

/// Returns the lowest 32-bit integer of `a` (lane 0 of its sixteen 32-bit lanes).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cvtsi512_si32&expand=1882)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(all(test, not(target_os = "windows")), assert_instr(vmovd))]
pub unsafe fn _mm512_cvtsi512_si32(a: __m512i) -> i32 {
    // The extracted lane is already an `i32` (inferred from the return type), so it is
    // returned directly instead of going through an `i32`-to-`i32` transmute.
    simd_extract(a.as_i32x16(), 0)
}

/// Copies the lowest packed 32-bit integer of `a` into all sixteen 32-bit lanes of the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastd_epi32&expand=545)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vbroadcast))] // should be vpbroadcastd
pub unsafe fn _mm512_broadcastd_epi32(a: __m128i) -> __m512i {
    // Widen first so the shuffle input and output have the same lane count; only lane 0
    // of the widened vector is ever read.
    let wide = _mm512_castsi128_si512(a).as_i32x16();
    let splat: i32x16 = simd_shuffle16!(
        wide,
        wide,
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    );
    transmute(splat)
}

/// Broadcasts the lowest packed 32-bit integer of `a` to every lane of the result under
/// writemask `k`: lanes whose mask bit is not set are copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastd_epi32&expand=546)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastd
pub unsafe fn _mm512_mask_broadcastd_epi32(src: __m512i, k: __mmask16, a: __m128i) -> __m512i {
    let splat = _mm512_broadcastd_epi32(a).as_i32x16();
    let passthrough = src.as_i32x16();
    // Set mask bit -> broadcast value, clear mask bit -> `src` lane.
    let merged = simd_select_bitmask(k, splat, passthrough);
    transmute(merged)
}

/// Broadcasts the lowest packed 32-bit integer of `a` to every lane of the result under
/// zeromask `k`: lanes whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastd_epi32&expand=547)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastd
pub unsafe fn _mm512_maskz_broadcastd_epi32(k: __mmask16, a: __m128i) -> __m512i {
    let splat = _mm512_broadcastd_epi32(a).as_i32x16();
    let zeros = _mm512_setzero_si512().as_i32x16();
    // Set mask bit -> broadcast value, clear mask bit -> zero.
    let merged = simd_select_bitmask(k, splat, zeros);
    transmute(merged)
}

/// Broadcasts the lowest packed 32-bit integer of `a` to every lane of the 256-bit result under
/// writemask `k`: lanes whose mask bit is not set are copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_broadcastd_epi32&expand=543)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastd
pub unsafe fn _mm256_mask_broadcastd_epi32(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
    let splat = _mm256_broadcastd_epi32(a).as_i32x8();
    let passthrough = src.as_i32x8();
    // Set mask bit -> broadcast value, clear mask bit -> `src` lane.
    let merged = simd_select_bitmask(k, splat, passthrough);
    transmute(merged)
}

/// Broadcasts the lowest packed 32-bit integer of `a` to every lane of the 256-bit result under
/// zeromask `k`: lanes whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_broadcastd_epi32&expand=544)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastd
pub unsafe fn _mm256_maskz_broadcastd_epi32(k: __mmask8, a: __m128i) -> __m256i {
    let splat = _mm256_broadcastd_epi32(a).as_i32x8();
    let zeros = _mm256_setzero_si256().as_i32x8();
    // Set mask bit -> broadcast value, clear mask bit -> zero.
    let merged = simd_select_bitmask(k, splat, zeros);
    transmute(merged)
}

/// Broadcasts the lowest packed 32-bit integer of `a` to every lane of the 128-bit result under
/// writemask `k`: lanes whose mask bit is not set are copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_broadcastd_epi32&expand=540)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastd
pub unsafe fn _mm_mask_broadcastd_epi32(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let splat = _mm_broadcastd_epi32(a).as_i32x4();
    let passthrough = src.as_i32x4();
    // Set mask bit -> broadcast value, clear mask bit -> `src` lane.
    let merged = simd_select_bitmask(k, splat, passthrough);
    transmute(merged)
}

/// Broadcasts the lowest packed 32-bit integer of `a` to every lane of the 128-bit result under
/// zeromask `k`: lanes whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_broadcastd_epi32&expand=541)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastd
pub unsafe fn _mm_maskz_broadcastd_epi32(k: __mmask8, a: __m128i) -> __m128i {
    let splat = _mm_broadcastd_epi32(a).as_i32x4();
    let zeros = _mm_setzero_si128().as_i32x4();
    // Set mask bit -> broadcast value, clear mask bit -> zero.
    let merged = simd_select_bitmask(k, splat, zeros);
    transmute(merged)
}

/// Copies the lowest packed 64-bit integer of `a` into all eight 64-bit lanes of the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastq_epi64&expand=560)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vbroadcas))] // should be vpbroadcastq
pub unsafe fn _mm512_broadcastq_epi64(a: __m128i) -> __m512i {
    // Every output lane uses shuffle index 0, i.e. lane 0 of `a`.
    simd_shuffle8!(a, a, [0, 0, 0, 0, 0, 0, 0, 0])
}

/// Broadcasts the lowest packed 64-bit integer of `a` to every lane of the result under
/// writemask `k`: lanes whose mask bit is not set are copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastq_epi64&expand=561)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastq
pub unsafe fn _mm512_mask_broadcastq_epi64(src: __m512i, k: __mmask8, a: __m128i) -> __m512i {
    let splat = _mm512_broadcastq_epi64(a).as_i64x8();
    let passthrough = src.as_i64x8();
    // Set mask bit -> broadcast value, clear mask bit -> `src` lane.
    let merged = simd_select_bitmask(k, splat, passthrough);
    transmute(merged)
}

/// Broadcasts the lowest packed 64-bit integer of `a` to every lane of the result under
/// zeromask `k`: lanes whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastq_epi64&expand=562)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastq
pub unsafe fn _mm512_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m512i {
    let splat = _mm512_broadcastq_epi64(a).as_i64x8();
    let zeros = _mm512_setzero_si512().as_i64x8();
    // Set mask bit -> broadcast value, clear mask bit -> zero.
    let merged = simd_select_bitmask(k, splat, zeros);
    transmute(merged)
}

/// Broadcasts the lowest packed 64-bit integer of `a` to every lane of the 256-bit result under
/// writemask `k`: lanes whose mask bit is not set are copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_broadcastq_epi64&expand=558)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastq
pub unsafe fn _mm256_mask_broadcastq_epi64(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
    let splat = _mm256_broadcastq_epi64(a).as_i64x4();
    let passthrough = src.as_i64x4();
    // Set mask bit -> broadcast value, clear mask bit -> `src` lane.
    let merged = simd_select_bitmask(k, splat, passthrough);
    transmute(merged)
}

/// Broadcasts the lowest packed 64-bit integer of `a` to every lane of the 256-bit result under
/// zeromask `k`: lanes whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_broadcastq_epi64&expand=559)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastq
pub unsafe fn _mm256_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m256i {
    let splat = _mm256_broadcastq_epi64(a).as_i64x4();
    let zeros = _mm256_setzero_si256().as_i64x4();
    // Set mask bit -> broadcast value, clear mask bit -> zero.
    let merged = simd_select_bitmask(k, splat, zeros);
    transmute(merged)
}

/// Broadcasts the lowest packed 64-bit integer of `a` to every lane of the 128-bit result under
/// writemask `k`: lanes whose mask bit is not set are copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_broadcastq_epi64&expand=555)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastq
pub unsafe fn _mm_mask_broadcastq_epi64(src: __m128i, k: __mmask8, a: __m128i) -> __m128i {
    let splat = _mm_broadcastq_epi64(a).as_i64x2();
    let passthrough = src.as_i64x2();
    // Set mask bit -> broadcast value, clear mask bit -> `src` lane.
    let merged = simd_select_bitmask(k, splat, passthrough);
    transmute(merged)
}

/// Broadcasts the lowest packed 64-bit integer of `a` to every lane of the 128-bit result under
/// zeromask `k`: lanes whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_broadcastq_epi64&expand=556)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpbroadcast))] // should be vpbroadcastq
pub unsafe fn _mm_maskz_broadcastq_epi64(k: __mmask8, a: __m128i) -> __m128i {
    let splat = _mm_broadcastq_epi64(a).as_i64x2();
    let zeros = _mm_setzero_si128().as_i64x2();
    // Set mask bit -> broadcast value, clear mask bit -> zero.
    let merged = simd_select_bitmask(k, splat, zeros);
    transmute(merged)
}

/// Copies the lowest single-precision (32-bit) floating-point element of `a` into all sixteen
/// lanes of the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastss_ps&expand=578)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
pub unsafe fn _mm512_broadcastss_ps(a: __m128) -> __m512 {
    // Every output lane uses shuffle index 0, i.e. lane 0 of `a`.
    simd_shuffle16!(a, a, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
}

/// Broadcasts the lowest single-precision (32-bit) floating-point element of `a` to every lane
/// of the result under writemask `k`: lanes whose mask bit is not set are copied from `src`
/// instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastss_ps&expand=579)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
pub unsafe fn _mm512_mask_broadcastss_ps(src: __m512, k: __mmask16, a: __m128) -> __m512 {
    let splat = _mm512_broadcastss_ps(a).as_f32x16();
    let passthrough = src.as_f32x16();
    // Set mask bit -> broadcast value, clear mask bit -> `src` lane.
    let merged = simd_select_bitmask(k, splat, passthrough);
    transmute(merged)
}

/// Broadcasts the lowest single-precision (32-bit) floating-point element of `a` to every lane
/// of the result under zeromask `k`: lanes whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastss_ps&expand=580)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
pub unsafe fn _mm512_maskz_broadcastss_ps(k: __mmask16, a: __m128) -> __m512 {
    let splat = _mm512_broadcastss_ps(a).as_f32x16();
    let zeros = _mm512_setzero_ps().as_f32x16();
    // Set mask bit -> broadcast value, clear mask bit -> zero.
    let merged = simd_select_bitmask(k, splat, zeros);
    transmute(merged)
}

/// Broadcasts the lowest single-precision (32-bit) floating-point element of `a` to every lane
/// of the 256-bit result under writemask `k`: lanes whose mask bit is not set are copied from
/// `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_broadcastss_ps&expand=576)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
pub unsafe fn _mm256_mask_broadcastss_ps(src: __m256, k: __mmask8, a: __m128) -> __m256 {
    let splat = _mm256_broadcastss_ps(a).as_f32x8();
    let passthrough = src.as_f32x8();
    // Set mask bit -> broadcast value, clear mask bit -> `src` lane.
    let merged = simd_select_bitmask(k, splat, passthrough);
    transmute(merged)
}

/// Broadcasts the lowest single-precision (32-bit) floating-point element of `a` to every lane
/// of the 256-bit result under zeromask `k`: lanes whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_broadcastss_ps&expand=577)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
pub unsafe fn _mm256_maskz_broadcastss_ps(k: __mmask8, a: __m128) -> __m256 {
    let splat = _mm256_broadcastss_ps(a).as_f32x8();
    let zeros = _mm256_setzero_ps().as_f32x8();
    // Set mask bit -> broadcast value, clear mask bit -> zero.
    let merged = simd_select_bitmask(k, splat, zeros);
    transmute(merged)
}

/// Broadcasts the lowest single-precision (32-bit) floating-point element of `a` to every lane
/// of the 128-bit result under writemask `k`: lanes whose mask bit is not set are copied from
/// `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_broadcastss_ps&expand=573)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
pub unsafe fn _mm_mask_broadcastss_ps(src: __m128, k: __mmask8, a: __m128) -> __m128 {
    let splat = _mm_broadcastss_ps(a).as_f32x4();
    let passthrough = src.as_f32x4();
    // Set mask bit -> broadcast value, clear mask bit -> `src` lane.
    let merged = simd_select_bitmask(k, splat, passthrough);
    transmute(merged)
}

/// Broadcasts the lowest single-precision (32-bit) floating-point element of `a` to every lane
/// of the 128-bit result under zeromask `k`: lanes whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_broadcastss_ps&expand=574)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vbroadcastss))]
pub unsafe fn _mm_maskz_broadcastss_ps(k: __mmask8, a: __m128) -> __m128 {
    let splat = _mm_broadcastss_ps(a).as_f32x4();
    let zeros = _mm_setzero_ps().as_f32x4();
    // Set mask bit -> broadcast value, clear mask bit -> zero.
    let merged = simd_select_bitmask(k, splat, zeros);
    transmute(merged)
}

/// Copies the lowest double-precision (64-bit) floating-point element of `a` into all eight
/// lanes of the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcastsd_pd&expand=567)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
pub unsafe fn _mm512_broadcastsd_pd(a: __m128d) -> __m512d {
    // Every output lane uses shuffle index 0, i.e. lane 0 of `a`.
    simd_shuffle8!(a, a, [0, 0, 0, 0, 0, 0, 0, 0])
}

/// Broadcasts the lowest double-precision (64-bit) floating-point element of `a` to every lane
/// of the result under writemask `k`: lanes whose mask bit is not set are copied from `src`
/// instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcastsd_pd&expand=568)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
pub unsafe fn _mm512_mask_broadcastsd_pd(src: __m512d, k: __mmask8, a: __m128d) -> __m512d {
    let splat = _mm512_broadcastsd_pd(a).as_f64x8();
    let passthrough = src.as_f64x8();
    // Set mask bit -> broadcast value, clear mask bit -> `src` lane.
    let merged = simd_select_bitmask(k, splat, passthrough);
    transmute(merged)
}

/// Broadcasts the lowest double-precision (64-bit) floating-point element of `a` to every lane
/// of the result under zeromask `k`: lanes whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcastsd_pd&expand=569)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
pub unsafe fn _mm512_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m512d {
    let splat = _mm512_broadcastsd_pd(a).as_f64x8();
    let zeros = _mm512_setzero_pd().as_f64x8();
    // Set mask bit -> broadcast value, clear mask bit -> zero.
    let merged = simd_select_bitmask(k, splat, zeros);
    transmute(merged)
}

/// Broadcasts the lowest double-precision (64-bit) floating-point element of `a` to every lane
/// of the 256-bit result under writemask `k`: lanes whose mask bit is not set are copied from
/// `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_broadcastsd_pd&expand=565)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
pub unsafe fn _mm256_mask_broadcastsd_pd(src: __m256d, k: __mmask8, a: __m128d) -> __m256d {
    let splat = _mm256_broadcastsd_pd(a).as_f64x4();
    let passthrough = src.as_f64x4();
    // Set mask bit -> broadcast value, clear mask bit -> `src` lane.
    let merged = simd_select_bitmask(k, splat, passthrough);
    transmute(merged)
}

/// Broadcasts the lowest double-precision (64-bit) floating-point element of `a` to every lane
/// of the 256-bit result under zeromask `k`: lanes whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_broadcastsd_pd&expand=566)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vbroadcastsd))]
pub unsafe fn _mm256_maskz_broadcastsd_pd(k: __mmask8, a: __m128d) -> __m256d {
    let splat = _mm256_broadcastsd_pd(a).as_f64x4();
    let zeros = _mm256_setzero_pd().as_f64x4();
    // Set mask bit -> broadcast value, clear mask bit -> zero.
    let merged = simd_select_bitmask(k, splat, zeros);
    transmute(merged)
}

/// Repeats the four packed 32-bit integers of `a` across the result, so that each group of four
/// consecutive lanes is a copy of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_i32x4&expand=510)
#[inline]
#[target_feature(enable = "avx512f")] // msvc: vbroadcasti32x4, linux: vshuf
pub unsafe fn _mm512_broadcast_i32x4(a: __m128i) -> __m512i {
    let quad = a.as_i32x4();
    // The index pattern 0,1,2,3 is repeated four times to tile `quad` over all 16 lanes.
    let tiled: i32x16 = simd_shuffle16!(
        quad,
        quad,
        [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]
    );
    transmute(tiled)
}

/// Repeats the four packed 32-bit integers of `a` across the result under writemask `k`: lanes
/// whose mask bit is not set are copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_i32x4&expand=511)
#[inline]
#[target_feature(enable = "avx512f")] // msvc: vbroadcasti32x4, linux: vshuf
pub unsafe fn _mm512_mask_broadcast_i32x4(src: __m512i, k: __mmask16, a: __m128i) -> __m512i {
    let tiled = _mm512_broadcast_i32x4(a).as_i32x16();
    let passthrough = src.as_i32x16();
    // Set mask bit -> broadcast value, clear mask bit -> `src` lane.
    let merged = simd_select_bitmask(k, tiled, passthrough);
    transmute(merged)
}

/// Repeats the four packed 32-bit integers of `a` across the result under zeromask `k`: lanes
/// whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_i32x4&expand=512)
#[inline]
#[target_feature(enable = "avx512f")] // msvc: vbroadcasti32x4, linux: vshuf
pub unsafe fn _mm512_maskz_broadcast_i32x4(k: __mmask16, a: __m128i) -> __m512i {
    let tiled = _mm512_broadcast_i32x4(a).as_i32x16();
    let zeros = _mm512_setzero_si512().as_i32x16();
    // Set mask bit -> broadcast value, clear mask bit -> zero.
    let merged = simd_select_bitmask(k, tiled, zeros);
    transmute(merged)
}

/// Repeats the four packed 32-bit integers of `a` across the 256-bit result, so that both
/// 128-bit halves are a copy of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcast_i32x4&expand=507)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")] // msvc: vbroadcasti32x4, linux: vshuf
pub unsafe fn _mm256_broadcast_i32x4(a: __m128i) -> __m256i {
    let quad = a.as_i32x4();
    // The index pattern 0,1,2,3 is repeated twice to tile `quad` over all 8 lanes.
    let tiled: i32x8 = simd_shuffle8!(quad, quad, [0, 1, 2, 3, 0, 1, 2, 3]);
    transmute(tiled)
}

/// Repeats the four packed 32-bit integers of `a` across the 256-bit result under writemask `k`:
/// lanes whose mask bit is not set are copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_broadcast_i32x4&expand=508)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")] // msvc: vbroadcasti32x4, linux: vshuf
pub unsafe fn _mm256_mask_broadcast_i32x4(src: __m256i, k: __mmask8, a: __m128i) -> __m256i {
    let tiled = _mm256_broadcast_i32x4(a).as_i32x8();
    let passthrough = src.as_i32x8();
    // Set mask bit -> broadcast value, clear mask bit -> `src` lane.
    let merged = simd_select_bitmask(k, tiled, passthrough);
    transmute(merged)
}

/// Repeats the four packed 32-bit integers of `a` across the 256-bit result under zeromask `k`:
/// lanes whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_broadcast_i32x4&expand=509)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")] // msvc: vbroadcasti32x4, linux: vshuf
pub unsafe fn _mm256_maskz_broadcast_i32x4(k: __mmask8, a: __m128i) -> __m256i {
    let tiled = _mm256_broadcast_i32x4(a).as_i32x8();
    let zeros = _mm256_setzero_si256().as_i32x8();
    // Set mask bit -> broadcast value, clear mask bit -> zero.
    let merged = simd_select_bitmask(k, tiled, zeros);
    transmute(merged)
}

/// Repeats the four packed 64-bit integers of `a` across the result, so that both 256-bit halves
/// are a copy of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_i64x4&expand=522)
#[inline]
#[target_feature(enable = "avx512f")] // msvc: vbroadcasti64x4, linux: vperm
pub unsafe fn _mm512_broadcast_i64x4(a: __m256i) -> __m512i {
    // The index pattern 0,1,2,3 is repeated twice to tile `a` over all 8 lanes.
    simd_shuffle8!(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
}

/// Repeats the four packed 64-bit integers of `a` across the result under writemask `k`: lanes
/// whose mask bit is not set are copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_i64x4&expand=523)
#[inline]
#[target_feature(enable = "avx512f")] // msvc: vbroadcasti64x4, linux: vperm
pub unsafe fn _mm512_mask_broadcast_i64x4(src: __m512i, k: __mmask8, a: __m256i) -> __m512i {
    let tiled = _mm512_broadcast_i64x4(a).as_i64x8();
    let passthrough = src.as_i64x8();
    // Set mask bit -> broadcast value, clear mask bit -> `src` lane.
    let merged = simd_select_bitmask(k, tiled, passthrough);
    transmute(merged)
}

/// Repeats the four packed 64-bit integers of `a` across the result under zeromask `k`: lanes
/// whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_i64x4&expand=524)
#[inline]
#[target_feature(enable = "avx512f")] // msvc: vbroadcasti64x4, linux: vperm
pub unsafe fn _mm512_maskz_broadcast_i64x4(k: __mmask8, a: __m256i) -> __m512i {
    let tiled = _mm512_broadcast_i64x4(a).as_i64x8();
    let zeros = _mm512_setzero_si512().as_i64x8();
    // Set mask bit -> broadcast value, clear mask bit -> zero.
    let merged = simd_select_bitmask(k, tiled, zeros);
    transmute(merged)
}

/// Repeats the four packed single-precision (32-bit) floating-point elements of `a` across the
/// result, so that each group of four consecutive lanes is a copy of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_broadcast_f32x4&expand=483)
#[inline]
#[target_feature(enable = "avx512f")] // msvc: vbroadcastf32x4, linux: vshuf
pub unsafe fn _mm512_broadcast_f32x4(a: __m128) -> __m512 {
    // The index pattern 0,1,2,3 is repeated four times to tile `a` over all 16 lanes.
    simd_shuffle16!(a, a, [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3])
}

/// Repeats the four packed single-precision (32-bit) floating-point elements of `a` across the
/// result under writemask `k`: lanes whose mask bit is not set are copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_broadcast_f32x4&expand=484)
#[inline]
#[target_feature(enable = "avx512f")] // msvc: vbroadcastf32x4, linux: vshuf
pub unsafe fn _mm512_mask_broadcast_f32x4(src: __m512, k: __mmask16, a: __m128) -> __m512 {
    let tiled = _mm512_broadcast_f32x4(a).as_f32x16();
    let passthrough = src.as_f32x16();
    // Set mask bit -> broadcast value, clear mask bit -> `src` lane.
    let merged = simd_select_bitmask(k, tiled, passthrough);
    transmute(merged)
}

/// Repeats the four packed single-precision (32-bit) floating-point elements of `a` across the
/// result under zeromask `k`: lanes whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_broadcast_f32x4&expand=485)
#[inline]
#[target_feature(enable = "avx512f")] // msvc: vbroadcastf32x4, linux: vshuf
pub unsafe fn _mm512_maskz_broadcast_f32x4(k: __mmask16, a: __m128) -> __m512 {
    let tiled = _mm512_broadcast_f32x4(a).as_f32x16();
    let zeros = _mm512_setzero_ps().as_f32x16();
    // Set mask bit -> broadcast value, clear mask bit -> zero.
    let merged = simd_select_bitmask(k, tiled, zeros);
    transmute(merged)
}

/// Repeats the four packed single-precision (32-bit) floating-point elements of `a` across the
/// 256-bit result, so that both 128-bit halves are a copy of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_broadcast_f32x4&expand=480)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")] // msvc: vbroadcastf32x4, linux: vshuf
pub unsafe fn _mm256_broadcast_f32x4(a: __m128) -> __m256 {
    // The index pattern 0,1,2,3 is repeated twice to tile `a` over all 8 lanes.
    simd_shuffle8!(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
}

/// Repeats the four packed single-precision (32-bit) floating-point elements of `a` across the
/// 256-bit result under writemask `k`: lanes whose mask bit is not set are copied from `src`
/// instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_broadcast_f32x4&expand=481)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")] // msvc: vbroadcastf32x4, linux: vshuf
pub unsafe fn _mm256_mask_broadcast_f32x4(src: __m256, k: __mmask8, a: __m128) -> __m256 {
    let tiled = _mm256_broadcast_f32x4(a).as_f32x8();
    let passthrough = src.as_f32x8();
    // Set mask bit -> broadcast value, clear mask bit -> `src` lane.
    let merged = simd_select_bitmask(k, tiled, passthrough);
    transmute(merged)
}

/// Repeats the four packed single-precision (32-bit) floating-point elements of `a` across the
/// 256-bit result under zeromask `k`: lanes whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_broadcast_f32x4&expand=482)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")] // msvc: vbroadcastf32x4, linux: vshuf
pub unsafe fn _mm256_maskz_broadcast_f32x4(k: __mmask8, a: __m128) -> __m256 {
    let tiled = _mm256_broadcast_f32x4(a).as_f32x8();
    let zeros = _mm256_setzero_ps().as_f32x8();
    // Set mask bit -> broadcast value, clear mask bit -> zero.
    let merged = simd_select_bitmask(k, tiled, zeros);
    transmute(merged)
}

/// Repeats the four packed double-precision (64-bit) floating-point elements of `a` across the
/// result, so that both 256-bit halves are a copy of `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_broadcast_f64x4&expand=495)
#[inline]
#[target_feature(enable = "avx512f")] // msvc: vbroadcastf64x4, linux: vperm
pub unsafe fn _mm512_broadcast_f64x4(a: __m256d) -> __m512d {
    // The index pattern 0,1,2,3 is repeated twice to tile `a` over all 8 lanes.
    simd_shuffle8!(a, a, [0, 1, 2, 3, 0, 1, 2, 3])
}

/// Repeats the four packed double-precision (64-bit) floating-point elements of `a` across the
/// result under writemask `k`: lanes whose mask bit is not set are copied from `src` instead.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_broadcast_f64x4&expand=496)
#[inline]
#[target_feature(enable = "avx512f")] // msvc: vbroadcastf64x4, linux: vperm
pub unsafe fn _mm512_mask_broadcast_f64x4(src: __m512d, k: __mmask8, a: __m256d) -> __m512d {
    let tiled = _mm512_broadcast_f64x4(a).as_f64x8();
    let passthrough = src.as_f64x8();
    // Set mask bit -> broadcast value, clear mask bit -> `src` lane.
    let merged = simd_select_bitmask(k, tiled, passthrough);
    transmute(merged)
}

/// Repeats the four packed double-precision (64-bit) floating-point elements of `a` across the
/// result under zeromask `k`: lanes whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_broadcast_f64x4&expand=497)
#[inline]
#[target_feature(enable = "avx512f")] // msvc: vbroadcastf64x4, linux: vperm
pub unsafe fn _mm512_maskz_broadcast_f64x4(k: __mmask8, a: __m256d) -> __m512d {
    let tiled = _mm512_broadcast_f64x4(a).as_f64x8();
    let zeros = _mm512_setzero_pd().as_f64x8();
    // Set mask bit -> broadcast value, clear mask bit -> zero.
    let merged = simd_select_bitmask(k, tiled, zeros);
    transmute(merged)
}

/// Blends packed 32-bit integers from `a` and `b` under control mask `k`: each element comes
/// from `b` when its mask bit is set and from `a` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_epi32&expand=435)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovdqa32))] //should be vpblendmd
pub unsafe fn _mm512_mask_blend_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    let when_set = b.as_i32x16();
    let when_clear = a.as_i32x16();
    transmute(simd_select_bitmask(k, when_set, when_clear))
}

/// Blends packed 32-bit integers from `a` and `b` under control mask `k`: each element comes
/// from `b` when its mask bit is set and from `a` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_epi32&expand=434)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovdqa32))] //should be vpblendmd
pub unsafe fn _mm256_mask_blend_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    let when_set = b.as_i32x8();
    let when_clear = a.as_i32x8();
    transmute(simd_select_bitmask(k, when_set, when_clear))
}

/// Blends packed 32-bit integers from `a` and `b` under control mask `k`: each element comes
/// from `b` when its mask bit is set and from `a` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_epi32&expand=432)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovdqa32))] //should be vpblendmd
pub unsafe fn _mm_mask_blend_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let when_set = b.as_i32x4();
    let when_clear = a.as_i32x4();
    transmute(simd_select_bitmask(k, when_set, when_clear))
}

/// Blends packed 64-bit integers from `a` and `b` under control mask `k`: each element comes
/// from `b` when its mask bit is set and from `a` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_epi64&expand=438)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovdqa64))] //should be vpblendmq
pub unsafe fn _mm512_mask_blend_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    let when_set = b.as_i64x8();
    let when_clear = a.as_i64x8();
    transmute(simd_select_bitmask(k, when_set, when_clear))
}

/// Blends packed 64-bit integers from `a` and `b` under control mask `k`: each element comes
/// from `b` when its mask bit is set and from `a` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_epi64&expand=437)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovdqa64))] //should be vpblendmq
pub unsafe fn _mm256_mask_blend_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    let when_set = b.as_i64x4();
    let when_clear = a.as_i64x4();
    transmute(simd_select_bitmask(k, when_set, when_clear))
}

/// Blends packed 64-bit integers from `a` and `b` under control mask `k`: each element comes
/// from `b` when its mask bit is set and from `a` otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_epi64&expand=436)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovdqa64))] //should be vpblendmq
pub unsafe fn _mm_mask_blend_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let when_set = b.as_i64x2();
    let when_clear = a.as_i64x2();
    transmute(simd_select_bitmask(k, when_set, when_clear))
}

/// Blends packed single-precision (32-bit) floating-point elements from `a` and `b` under
/// control mask `k`: each element comes from `b` when its mask bit is set and from `a`
/// otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_ps&expand=451)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovaps))] //should be vpblendmps
pub unsafe fn _mm512_mask_blend_ps(k: __mmask16, a: __m512, b: __m512) -> __m512 {
    let when_set = b.as_f32x16();
    let when_clear = a.as_f32x16();
    transmute(simd_select_bitmask(k, when_set, when_clear))
}

/// Blends packed single-precision (32-bit) floating-point elements from `a` and `b` under
/// control mask `k`: each element comes from `b` when its mask bit is set and from `a`
/// otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_ps&expand=450)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovaps))] //should be vpblendmps
pub unsafe fn _mm256_mask_blend_ps(k: __mmask8, a: __m256, b: __m256) -> __m256 {
    let when_set = b.as_f32x8();
    let when_clear = a.as_f32x8();
    transmute(simd_select_bitmask(k, when_set, when_clear))
}

/// Blends packed single-precision (32-bit) floating-point elements from `a` and `b` under
/// control mask `k`: each element comes from `b` when its mask bit is set and from `a`
/// otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_ps&expand=448)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovaps))] //should be vpblendmps
pub unsafe fn _mm_mask_blend_ps(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let when_set = b.as_f32x4();
    let when_clear = a.as_f32x4();
    transmute(simd_select_bitmask(k, when_set, when_clear))
}

/// Blends packed double-precision (64-bit) floating-point elements from `a` and `b` under
/// control mask `k`: each element comes from `b` when its mask bit is set and from `a`
/// otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_blend_pd&expand=446)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovapd))] //should be vpblendmpd
pub unsafe fn _mm512_mask_blend_pd(k: __mmask8, a: __m512d, b: __m512d) -> __m512d {
    let when_set = b.as_f64x8();
    let when_clear = a.as_f64x8();
    transmute(simd_select_bitmask(k, when_set, when_clear))
}

/// Blends packed double-precision (64-bit) floating-point elements from `a` and `b` under
/// control mask `k`: each element comes from `b` when its mask bit is set and from `a`
/// otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_blend_pd&expand=445)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovapd))] //should be vpblendmpd
pub unsafe fn _mm256_mask_blend_pd(k: __mmask8, a: __m256d, b: __m256d) -> __m256d {
    let when_set = b.as_f64x4();
    let when_clear = a.as_f64x4();
    transmute(simd_select_bitmask(k, when_set, when_clear))
}

/// Blends packed double-precision (64-bit) floating-point elements from `a` and `b` under
/// control mask `k`: each element comes from `b` when its mask bit is set and from `a`
/// otherwise.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_blend_pd&expand=443)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovapd))] //should be vpblendmpd
pub unsafe fn _mm_mask_blend_pd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let when_set = b.as_f64x2();
    let when_clear = a.as_f64x2();
    transmute(simd_select_bitmask(k, when_set, when_clear))
}

/// Concatenates `a` (upper half) and `b` (lower half) into a 128-byte intermediate result,
/// shifts it right by `IMM8 % 16` 32-bit elements, and returns the low 64 bytes (16 elements).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_alignr_epi32&expand=245)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_alignr_epi32<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
    static_assert_imm8!(IMM8);
    let hi = a.as_i32x16();
    let lo = b.as_i32x16();
    // Shuffle indices 0..=15 address `hi` (from `a`), 16..=31 address `lo` (from `b`).
    // Arm `n` starts `n` elements into `lo` and wraps into `hi` once `lo` is exhausted.
    let shifted: i32x16 = match IMM8 % 16 {
        0 => simd_shuffle16!(
            hi,
            lo,
            [16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
        ),
        1 => simd_shuffle16!(
            hi,
            lo,
            [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0],
        ),
        2 => simd_shuffle16!(
            hi,
            lo,
            [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1],
        ),
        3 => simd_shuffle16!(
            hi,
            lo,
            [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2],
        ),
        4 => simd_shuffle16!(
            hi,
            lo,
            [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3],
        ),
        5 => simd_shuffle16!(
            hi,
            lo,
            [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4],
        ),
        6 => simd_shuffle16!(
            hi,
            lo,
            [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5],
        ),
        7 => simd_shuffle16!(
            hi,
            lo,
            [23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6],
        ),
        8 => simd_shuffle16!(
            hi,
            lo,
            [24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7],
        ),
        9 => simd_shuffle16!(
            hi,
            lo,
            [25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8],
        ),
        10 => simd_shuffle16!(hi, lo, [26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
        11 => simd_shuffle16!(hi, lo, [27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
        12 => simd_shuffle16!(hi, lo, [28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
        13 => simd_shuffle16!(hi, lo, [29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]),
        14 => simd_shuffle16!(hi, lo, [30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]),
        // Remaining residue is 15.
        _ => simd_shuffle16!(hi, lo, [31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]),
    };
    transmute(shifted)
}

/// Computes `_mm512_alignr_epi32::<IMM8>(a, b)` (concatenate `a` and `b`, shift right by 32-bit
/// elements, keep the low 16 elements) and applies writemask `k`: an element whose mask bit is
/// not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_alignr_epi32&expand=246)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_alignr_epi32<const IMM8: i32>(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    static_assert_imm8!(IMM8);
    let shifted = _mm512_alignr_epi32::<IMM8>(a, b).as_i32x16();
    // Value used for every lane whose mask bit is clear.
    let passthrough = src.as_i32x16();
    transmute(simd_select_bitmask(k, shifted, passthrough))
}

/// Computes `_mm512_alignr_epi32::<IMM8>(a, b)` (concatenate `a` and `b`, shift right by 32-bit
/// elements, keep the low 16 elements) and applies zeromask `k`: an element whose mask bit is
/// not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_alignr_epi32&expand=247)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_alignr_epi32<const IMM8: i32>(
    k: __mmask16,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    static_assert_imm8!(IMM8);
    // Value used for every lane whose mask bit is clear.
    let cleared = _mm512_setzero_si512().as_i32x16();
    let shifted = _mm512_alignr_epi32::<IMM8>(a, b).as_i32x16();
    transmute(simd_select_bitmask(k, shifted, cleared))
}

/// Concatenates `a` (upper half) and `b` (lower half) into a 64-byte intermediate result,
/// shifts it right by 32-bit elements, and returns the low 32 bytes (8 elements).
///
/// Only the low three bits of `IMM8` contribute to the shift count (`IMM8 % 8`), which is how
/// the 256-bit form of `valignd` interprets its immediate. A count of 8 therefore behaves like
/// a count of 0 and yields `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_alignr_epi32&expand=242)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_alignr_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_imm8!(IMM8);
    let a = a.as_i32x8();
    let b = b.as_i32x8();
    // The vector has 8 lanes, so the shift count wraps modulo 8 (imm8[2:0]). Reducing modulo 16
    // would return `a`-based data for residues 8..=15 instead of wrapping around.
    let imm8: i32 = IMM8 % 8;
    // Shuffle indices 0..=7 address `a`, 8..=15 address `b`.
    let r: i32x8 = match imm8 {
        0 => simd_shuffle8!(a, b, [8, 9, 10, 11, 12, 13, 14, 15]),
        1 => simd_shuffle8!(a, b, [9, 10, 11, 12, 13, 14, 15, 0]),
        2 => simd_shuffle8!(a, b, [10, 11, 12, 13, 14, 15, 0, 1]),
        3 => simd_shuffle8!(a, b, [11, 12, 13, 14, 15, 0, 1, 2]),
        4 => simd_shuffle8!(a, b, [12, 13, 14, 15, 0, 1, 2, 3]),
        5 => simd_shuffle8!(a, b, [13, 14, 15, 0, 1, 2, 3, 4]),
        6 => simd_shuffle8!(a, b, [14, 15, 0, 1, 2, 3, 4, 5]),
        // Remaining residue is 7.
        _ => simd_shuffle8!(a, b, [15, 0, 1, 2, 3, 4, 5, 6]),
    };
    transmute(r)
}

/// Computes `_mm256_alignr_epi32::<IMM8>(a, b)` (concatenate `a` and `b`, shift right by 32-bit
/// elements, keep the low 8 elements) and applies writemask `k`: an element whose mask bit is
/// not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_alignr_epi32&expand=243)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_mask_alignr_epi32<const IMM8: i32>(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    static_assert_imm8!(IMM8);
    let shifted = _mm256_alignr_epi32::<IMM8>(a, b).as_i32x8();
    // Value used for every lane whose mask bit is clear.
    let passthrough = src.as_i32x8();
    transmute(simd_select_bitmask(k, shifted, passthrough))
}

/// Computes `_mm256_alignr_epi32::<IMM8>(a, b)` (concatenate `a` and `b`, shift right by 32-bit
/// elements, keep the low 8 elements) and applies zeromask `k`: an element whose mask bit is
/// not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_alignr_epi32&expand=244)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_maskz_alignr_epi32<const IMM8: i32>(
    k: __mmask8,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    static_assert_imm8!(IMM8);
    // Value used for every lane whose mask bit is clear.
    let cleared = _mm256_setzero_si256().as_i32x8();
    let shifted = _mm256_alignr_epi32::<IMM8>(a, b).as_i32x8();
    transmute(simd_select_bitmask(k, shifted, cleared))
}

/// Concatenates `a` (upper half) and `b` (lower half) into a 32-byte intermediate result,
/// shifts it right by 32-bit elements, and returns the low 16 bytes (4 elements).
///
/// Only the low two bits of `IMM8` contribute to the shift count (`IMM8 % 4`), which is how
/// the 128-bit form of `valignd` interprets its immediate. A count of 4 therefore behaves like
/// a count of 0 and yields `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi32&expand=239)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))] //should be valignd
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_alignr_epi32<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_imm8!(IMM8);
    let a = a.as_i32x4();
    let b = b.as_i32x4();
    // The vector has 4 lanes, so the shift count wraps modulo 4 (imm8[1:0]). Reducing modulo 8
    // would return rotations of `a` for residues 4..=7 instead of wrapping around.
    let imm8: i32 = IMM8 % 4;
    // Shuffle indices 0..=3 address `a`, 4..=7 address `b`.
    let r: i32x4 = match imm8 {
        0 => simd_shuffle4!(a, b, [4, 5, 6, 7]),
        1 => simd_shuffle4!(a, b, [5, 6, 7, 0]),
        2 => simd_shuffle4!(a, b, [6, 7, 0, 1]),
        // Remaining residue is 3.
        _ => simd_shuffle4!(a, b, [7, 0, 1, 2]),
    };
    transmute(r)
}

/// Computes `_mm_alignr_epi32::<IMM8>(a, b)` (concatenate `a` and `b`, shift right by 32-bit
/// elements, keep the low 4 elements) and applies writemask `k`: an element whose mask bit is
/// not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_alignr_epi32&expand=240)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_alignr_epi32<const IMM8: i32>(
    src: __m128i,
    k: __mmask8,
    a: __m128i,
    b: __m128i,
) -> __m128i {
    static_assert_imm8!(IMM8);
    let shifted = _mm_alignr_epi32::<IMM8>(a, b).as_i32x4();
    // Value used for every lane whose mask bit is clear.
    let passthrough = src.as_i32x4();
    transmute(simd_select_bitmask(k, shifted, passthrough))
}

/// Computes `_mm_alignr_epi32::<IMM8>(a, b)` (concatenate `a` and `b`, shift right by 32-bit
/// elements, keep the low 4 elements) and applies zeromask `k`: an element whose mask bit is
/// not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_alignr_epi32&expand=241)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(valignd, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_alignr_epi32<const IMM8: i32>(
    k: __mmask8,
    a: __m128i,
    b: __m128i,
) -> __m128i {
    static_assert_imm8!(IMM8);
    // Value used for every lane whose mask bit is clear.
    let cleared = _mm_setzero_si128().as_i32x4();
    let shifted = _mm_alignr_epi32::<IMM8>(a, b).as_i32x4();
    transmute(simd_select_bitmask(k, shifted, cleared))
}

/// Concatenates `a` (upper half) and `b` (lower half) into a 128-byte intermediate result,
/// shifts it right by `IMM8 % 8` 64-bit elements, and returns the low 64 bytes (8 elements).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_alignr_epi64&expand=254)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_alignr_epi64<const IMM8: i32>(a: __m512i, b: __m512i) -> __m512i {
    static_assert_imm8!(IMM8);
    // Shuffle indices 0..=7 address `hi` (from `a`), 8..=15 address `lo` (from `b`).
    let (hi, lo) = (a, b);
    let shifted: i64x8 = match IMM8 % 8 {
        0 => simd_shuffle8!(hi, lo, [8, 9, 10, 11, 12, 13, 14, 15]),
        1 => simd_shuffle8!(hi, lo, [9, 10, 11, 12, 13, 14, 15, 0]),
        2 => simd_shuffle8!(hi, lo, [10, 11, 12, 13, 14, 15, 0, 1]),
        3 => simd_shuffle8!(hi, lo, [11, 12, 13, 14, 15, 0, 1, 2]),
        4 => simd_shuffle8!(hi, lo, [12, 13, 14, 15, 0, 1, 2, 3]),
        5 => simd_shuffle8!(hi, lo, [13, 14, 15, 0, 1, 2, 3, 4]),
        6 => simd_shuffle8!(hi, lo, [14, 15, 0, 1, 2, 3, 4, 5]),
        // Remaining residue is 7.
        _ => simd_shuffle8!(hi, lo, [15, 0, 1, 2, 3, 4, 5, 6]),
    };
    transmute(shifted)
}

/// Computes `_mm512_alignr_epi64::<IMM8>(a, b)` (concatenate `a` and `b`, shift right by 64-bit
/// elements, keep the low 8 elements) and applies writemask `k`: an element whose mask bit is
/// not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask_alignr_epi64&expand=255)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_alignr_epi64<const IMM8: i32>(
    src: __m512i,
    k: __mmask8,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    static_assert_imm8!(IMM8);
    let shifted = _mm512_alignr_epi64::<IMM8>(a, b).as_i64x8();
    // Value used for every lane whose mask bit is clear.
    let passthrough = src.as_i64x8();
    transmute(simd_select_bitmask(k, shifted, passthrough))
}

/// Computes `_mm512_alignr_epi64::<IMM8>(a, b)` (concatenate `a` and `b`, shift right by 64-bit
/// elements, keep the low 8 elements) and applies zeromask `k`: an element whose mask bit is
/// not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_maskz_alignr_epi64&expand=256)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_alignr_epi64<const IMM8: i32>(
    k: __mmask8,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    static_assert_imm8!(IMM8);
    // Value used for every lane whose mask bit is clear.
    let cleared = _mm512_setzero_si512().as_i64x8();
    let shifted = _mm512_alignr_epi64::<IMM8>(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, shifted, cleared))
}

/// Concatenates `a` (upper half) and `b` (lower half) into a 64-byte intermediate result,
/// shifts it right by 64-bit elements, and returns the low 32 bytes (4 elements).
///
/// Only the low two bits of `IMM8` contribute to the shift count (`IMM8 % 4`), which is how
/// the 256-bit form of `valignq` interprets its immediate. A count of 4 therefore behaves like
/// a count of 0 and yields `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_alignr_epi64&expand=251)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_alignr_epi64<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
    static_assert_imm8!(IMM8);
    // The vector has 4 lanes, so the shift count wraps modulo 4 (imm8[1:0]). Reducing modulo 8
    // would return `a`-based data for residues 4..=7 instead of wrapping around.
    let imm8: i32 = IMM8 % 4;
    // Shuffle indices 0..=3 address `a`, 4..=7 address `b`.
    let r: i64x4 = match imm8 {
        0 => simd_shuffle4!(a, b, [4, 5, 6, 7]),
        1 => simd_shuffle4!(a, b, [5, 6, 7, 0]),
        2 => simd_shuffle4!(a, b, [6, 7, 0, 1]),
        // Remaining residue is 3.
        _ => simd_shuffle4!(a, b, [7, 0, 1, 2]),
    };
    transmute(r)
}

/// Computes `_mm256_alignr_epi64::<IMM8>(a, b)` (concatenate `a` and `b`, shift right by 64-bit
/// elements, keep the low 4 elements) and applies writemask `k`: an element whose mask bit is
/// not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_alignr_epi64&expand=252)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_mask_alignr_epi64<const IMM8: i32>(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    static_assert_imm8!(IMM8);
    let shifted = _mm256_alignr_epi64::<IMM8>(a, b).as_i64x4();
    // Value used for every lane whose mask bit is clear.
    let passthrough = src.as_i64x4();
    transmute(simd_select_bitmask(k, shifted, passthrough))
}

/// Computes `_mm256_alignr_epi64::<IMM8>(a, b)` (concatenate `a` and `b`, shift right by 64-bit
/// elements, keep the low 4 elements) and applies zeromask `k`: an element whose mask bit is
/// not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_alignr_epi64&expand=253)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_maskz_alignr_epi64<const IMM8: i32>(
    k: __mmask8,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    static_assert_imm8!(IMM8);
    // Value used for every lane whose mask bit is clear.
    let cleared = _mm256_setzero_si256().as_i64x4();
    let shifted = _mm256_alignr_epi64::<IMM8>(a, b).as_i64x4();
    transmute(simd_select_bitmask(k, shifted, cleared))
}

/// Concatenates `a` (upper half) and `b` (lower half) into a 32-byte intermediate result,
/// shifts it right by 64-bit elements, and returns the low 16 bytes (2 elements).
///
/// Only the lowest bit of `IMM8` contributes to the shift count (`IMM8 % 2`), which is how the
/// 128-bit form of `valignq` interprets its immediate. A count of 2 therefore behaves like a
/// count of 0 and yields `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi64&expand=248)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 1))] //should be valignq
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_alignr_epi64<const IMM8: i32>(a: __m128i, b: __m128i) -> __m128i {
    static_assert_imm8!(IMM8);
    // The vector has 2 lanes, so the shift count wraps modulo 2 (imm8[0]). Reducing modulo 4
    // would return `a`-based data for residues 2 and 3 instead of wrapping around.
    let imm8: i32 = IMM8 % 2;
    // Shuffle indices 0..=1 address `a`, 2..=3 address `b`.
    let r: i64x2 = match imm8 {
        0 => simd_shuffle2!(a, b, [2, 3]),
        // Remaining residue is 1.
        _ => simd_shuffle2!(a, b, [3, 0]),
    };
    transmute(r)
}

/// Computes `_mm_alignr_epi64::<IMM8>(a, b)` (concatenate `a` and `b`, shift right by 64-bit
/// elements, keep the low 2 elements) and applies writemask `k`: an element whose mask bit is
/// not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_alignr_epi64&expand=249)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_alignr_epi64<const IMM8: i32>(
    src: __m128i,
    k: __mmask8,
    a: __m128i,
    b: __m128i,
) -> __m128i {
    static_assert_imm8!(IMM8);
    let shifted = _mm_alignr_epi64::<IMM8>(a, b).as_i64x2();
    // Value used for every lane whose mask bit is clear.
    let passthrough = src.as_i64x2();
    transmute(simd_select_bitmask(k, shifted, passthrough))
}

/// Computes `_mm_alignr_epi64::<IMM8>(a, b)` (concatenate `a` and `b`, shift right by 64-bit
/// elements, keep the low 2 elements) and applies zeromask `k`: an element whose mask bit is
/// not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_alignr_epi64&expand=250)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(valignq, IMM8 = 1))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_alignr_epi64<const IMM8: i32>(
    k: __mmask8,
    a: __m128i,
    b: __m128i,
) -> __m128i {
    static_assert_imm8!(IMM8);
    // Value used for every lane whose mask bit is clear.
    let cleared = _mm_setzero_si128().as_i64x2();
    let shifted = _mm_alignr_epi64::<IMM8>(a, b).as_i64x2();
    transmute(simd_select_bitmask(k, shifted, cleared))
}

/// Computes the bitwise AND of the packed 32-bit integers in `a` and `b`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_and_epi32&expand=272)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpandq))] //should be vpandd, but generate vpandq
pub unsafe fn _mm512_and_epi32(a: __m512i, b: __m512i) -> __m512i {
    let lhs = a.as_i32x16();
    let rhs = b.as_i32x16();
    transmute(simd_and(lhs, rhs))
}

/// Computes the bitwise AND of the packed 32-bit integers in `a` and `b` and applies writemask
/// `k`: an element whose mask bit is not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_and_epi32&expand=273)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpandd))]
pub unsafe fn _mm512_mask_and_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    // Value used for every lane whose mask bit is clear.
    let passthrough = src.as_i32x16();
    let conjunction = _mm512_and_epi32(a, b).as_i32x16();
    transmute(simd_select_bitmask(k, conjunction, passthrough))
}

/// Computes the bitwise AND of the packed 32-bit integers in `a` and `b` and applies zeromask
/// `k`: an element whose mask bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_and_epi32&expand=274)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpandd))]
pub unsafe fn _mm512_maskz_and_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    // Value used for every lane whose mask bit is clear.
    let cleared = _mm512_setzero_si512().as_i32x16();
    let conjunction = _mm512_and_epi32(a, b).as_i32x16();
    transmute(simd_select_bitmask(k, conjunction, cleared))
}

/// Computes the bitwise AND of the packed 32-bit integers in `a` and `b` and applies writemask
/// `k`: an element whose mask bit is not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_and_epi32&expand=270)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpandd))]
pub unsafe fn _mm256_mask_and_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Value used for every lane whose mask bit is clear.
    let passthrough = src.as_i32x8();
    let conjunction = simd_and(a.as_i32x8(), b.as_i32x8());
    transmute(simd_select_bitmask(k, conjunction, passthrough))
}

/// Computes the bitwise AND of the packed 32-bit integers in `a` and `b` and applies zeromask
/// `k`: an element whose mask bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_and_epi32&expand=271)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpandd))]
pub unsafe fn _mm256_maskz_and_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Value used for every lane whose mask bit is clear.
    let cleared = _mm256_setzero_si256().as_i32x8();
    let conjunction = simd_and(a.as_i32x8(), b.as_i32x8());
    transmute(simd_select_bitmask(k, conjunction, cleared))
}

/// Computes the bitwise AND of the packed 32-bit integers in `a` and `b` and applies writemask
/// `k`: an element whose mask bit is not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_and_epi32&expand=268)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpandd))]
pub unsafe fn _mm_mask_and_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Value used for every lane whose mask bit is clear.
    let passthrough = src.as_i32x4();
    let conjunction = simd_and(a.as_i32x4(), b.as_i32x4());
    transmute(simd_select_bitmask(k, conjunction, passthrough))
}

/// Computes the bitwise AND of the packed 32-bit integers in `a` and `b` and applies zeromask
/// `k`: an element whose mask bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_and_epi32&expand=269)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpandd))]
pub unsafe fn _mm_maskz_and_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Value used for every lane whose mask bit is clear.
    let cleared = _mm_setzero_si128().as_i32x4();
    let conjunction = simd_and(a.as_i32x4(), b.as_i32x4());
    transmute(simd_select_bitmask(k, conjunction, cleared))
}

/// Computes the bitwise AND of the 512 bits in `a` and `b`, treated as packed 64-bit integers.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_and_epi64&expand=279)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpandq))]
pub unsafe fn _mm512_and_epi64(a: __m512i, b: __m512i) -> __m512i {
    let lhs = a.as_i64x8();
    let rhs = b.as_i64x8();
    transmute(simd_and(lhs, rhs))
}

/// Computes the bitwise AND of the packed 64-bit integers in `a` and `b` and applies writemask
/// `k`: an element whose mask bit is not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_and_epi64&expand=280)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpandq))]
pub unsafe fn _mm512_mask_and_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    // Value used for every lane whose mask bit is clear.
    let passthrough = src.as_i64x8();
    let conjunction = _mm512_and_epi64(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, conjunction, passthrough))
}

/// Computes the bitwise AND of the packed 64-bit integers in `a` and `b` and applies zeromask
/// `k`: an element whose mask bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_and_epi64&expand=281)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpandq))]
pub unsafe fn _mm512_maskz_and_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    // Value used for every lane whose mask bit is clear.
    let cleared = _mm512_setzero_si512().as_i64x8();
    let conjunction = _mm512_and_epi64(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, conjunction, cleared))
}

/// Computes the bitwise AND of the packed 64-bit integers in `a` and `b` and applies writemask
/// `k`: an element whose mask bit is not set is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_and_epi64&expand=277)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpandq))]
pub unsafe fn _mm256_mask_and_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Value used for every lane whose mask bit is clear.
    let passthrough = src.as_i64x4();
    let conjunction = simd_and(a.as_i64x4(), b.as_i64x4());
    transmute(simd_select_bitmask(k, conjunction, passthrough))
}

/// Computes the bitwise AND of the packed 64-bit integers in `a` and `b` and applies zeromask
/// `k`: an element whose mask bit is not set is zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_and_epi64&expand=278)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpandq))]
pub unsafe fn _mm256_maskz_and_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Value used for every lane whose mask bit is clear.
    let cleared = _mm256_setzero_si256().as_i64x4();
    let conjunction = simd_and(a.as_i64x4(), b.as_i64x4());
    transmute(simd_select_bitmask(k, conjunction, cleared))
}

/// Bitwise AND of the packed 64-bit integers in `a` and `b` under writemask `k`: a lane whose
/// mask bit is set receives the AND result, a lane whose mask bit is clear is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_and_epi64&expand=275)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpandq))]
pub unsafe fn _mm_mask_and_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let (lhs, rhs) = (a.as_i64x2(), b.as_i64x2());
    // Lanes not selected by `k` fall back to the corresponding lane of `src`.
    let blended = simd_select_bitmask(k, simd_and(lhs, rhs), src.as_i64x2());
    transmute(blended)
}

/// Bitwise AND of the packed 64-bit integers in `a` and `b` under zeromask `k`: a lane whose
/// mask bit is set receives the AND result, a lane whose mask bit is clear is zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_and_epi64&expand=276)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpandq))]
pub unsafe fn _mm_maskz_and_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    let (lhs, rhs) = (a.as_i64x2(), b.as_i64x2());
    // Lanes not selected by `k` come from an all-zero vector.
    let cleared = _mm_setzero_si128().as_i64x2();
    let blended = simd_select_bitmask(k, simd_and(lhs, rhs), cleared);
    transmute(blended)
}

/// Bitwise AND of the full 512 bits of integer data in `a` and `b`; returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_and_si512&expand=302)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpandq))]
pub unsafe fn _mm512_and_si512(a: __m512i, b: __m512i) -> __m512i {
    // The operands are viewed as 32-bit lanes for the generic AND.
    let lhs = a.as_i32x16();
    let rhs = b.as_i32x16();
    transmute(simd_and(lhs, rhs))
}

/// Bitwise OR of the packed 32-bit integers in `a` and `b`; returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_or_epi32&expand=4042)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vporq))] // should be vpord
pub unsafe fn _mm512_or_epi32(a: __m512i, b: __m512i) -> __m512i {
    let lhs = a.as_i32x16();
    let rhs = b.as_i32x16();
    transmute(simd_or(lhs, rhs))
}

/// Bitwise OR of the packed 32-bit integers in `a` and `b` under writemask `k`: a lane whose
/// mask bit is set receives the OR result, a lane whose mask bit is clear is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_or_epi32&expand=4040)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpord))]
pub unsafe fn _mm512_mask_or_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    // Lanes not selected by `k` fall back to the corresponding lane of `src`.
    let passthrough = src.as_i32x16();
    let ored = _mm512_or_epi32(a, b).as_i32x16();
    transmute(simd_select_bitmask(k, ored, passthrough))
}

/// Bitwise OR of the packed 32-bit integers in `a` and `b` under zeromask `k`: a lane whose
/// mask bit is set receives the OR result, a lane whose mask bit is clear is zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_or_epi32&expand=4041)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpord))]
pub unsafe fn _mm512_maskz_or_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    // Lanes not selected by `k` come from an all-zero vector.
    let cleared = _mm512_setzero_si512().as_i32x16();
    let ored = _mm512_or_epi32(a, b).as_i32x16();
    transmute(simd_select_bitmask(k, ored, cleared))
}

/// Bitwise OR of the packed 32-bit integers in `a` and `b`; returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_or_epi32&expand=4039)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vor))] // should be vpord
pub unsafe fn _mm256_or_epi32(a: __m256i, b: __m256i) -> __m256i {
    let lhs = a.as_i32x8();
    let rhs = b.as_i32x8();
    transmute(simd_or(lhs, rhs))
}

/// Bitwise OR of the packed 32-bit integers in `a` and `b` under writemask `k`: a lane whose
/// mask bit is set receives the OR result, a lane whose mask bit is clear is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_or_epi32&expand=4037)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpord))]
pub unsafe fn _mm256_mask_or_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Lanes not selected by `k` fall back to the corresponding lane of `src`.
    let passthrough = src.as_i32x8();
    let ored = _mm256_or_epi32(a, b).as_i32x8();
    transmute(simd_select_bitmask(k, ored, passthrough))
}

/// Bitwise OR of the packed 32-bit integers in `a` and `b` under zeromask `k`: a lane whose
/// mask bit is set receives the OR result, a lane whose mask bit is clear is zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_or_epi32&expand=4038)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpord))]
pub unsafe fn _mm256_maskz_or_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Lanes not selected by `k` come from an all-zero vector.
    let cleared = _mm256_setzero_si256().as_i32x8();
    let ored = _mm256_or_epi32(a, b).as_i32x8();
    transmute(simd_select_bitmask(k, ored, cleared))
}

/// Bitwise OR of the packed 32-bit integers in `a` and `b`; returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_or_epi32&expand=4036)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vor))] // should be vpord
pub unsafe fn _mm_or_epi32(a: __m128i, b: __m128i) -> __m128i {
    let lhs = a.as_i32x4();
    let rhs = b.as_i32x4();
    transmute(simd_or(lhs, rhs))
}

/// Bitwise OR of the packed 32-bit integers in `a` and `b` under writemask `k`: a lane whose
/// mask bit is set receives the OR result, a lane whose mask bit is clear is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_or_epi32&expand=4034)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpord))]
pub unsafe fn _mm_mask_or_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Lanes not selected by `k` fall back to the corresponding lane of `src`.
    let passthrough = src.as_i32x4();
    let ored = _mm_or_epi32(a, b).as_i32x4();
    transmute(simd_select_bitmask(k, ored, passthrough))
}

/// Bitwise OR of the packed 32-bit integers in `a` and `b` under zeromask `k`: a lane whose
/// mask bit is set receives the OR result, a lane whose mask bit is clear is zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_or_epi32&expand=4035)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpord))]
pub unsafe fn _mm_maskz_or_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Lanes not selected by `k` come from an all-zero vector.
    let cleared = _mm_setzero_si128().as_i32x4();
    let ored = _mm_or_epi32(a, b).as_i32x4();
    transmute(simd_select_bitmask(k, ored, cleared))
}

/// Bitwise OR of the packed 64-bit integers in `a` and `b`; returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_or_epi64&expand=4051)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vporq))]
pub unsafe fn _mm512_or_epi64(a: __m512i, b: __m512i) -> __m512i {
    let lhs = a.as_i64x8();
    let rhs = b.as_i64x8();
    transmute(simd_or(lhs, rhs))
}

/// Bitwise OR of the packed 64-bit integers in `a` and `b` under writemask `k`: a lane whose
/// mask bit is set receives the OR result, a lane whose mask bit is clear is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_or_epi64&expand=4049)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vporq))]
pub unsafe fn _mm512_mask_or_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    // Lanes not selected by `k` fall back to the corresponding lane of `src`.
    let passthrough = src.as_i64x8();
    let ored = _mm512_or_epi64(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, ored, passthrough))
}

/// Bitwise OR of the packed 64-bit integers in `a` and `b` under zeromask `k`: a lane whose
/// mask bit is set receives the OR result, a lane whose mask bit is clear is zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_or_epi64&expand=4050)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vporq))]
pub unsafe fn _mm512_maskz_or_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    // Lanes not selected by `k` come from an all-zero vector.
    let cleared = _mm512_setzero_si512().as_i64x8();
    let ored = _mm512_or_epi64(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, ored, cleared))
}

/// Bitwise OR of the packed 64-bit integers in `a` and `b`; returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_or_epi64&expand=4048)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vor))] // should be vporq
pub unsafe fn _mm256_or_epi64(a: __m256i, b: __m256i) -> __m256i {
    let lhs = a.as_i64x4();
    let rhs = b.as_i64x4();
    transmute(simd_or(lhs, rhs))
}

/// Bitwise OR of the packed 64-bit integers in `a` and `b` under writemask `k`: a lane whose
/// mask bit is set receives the OR result, a lane whose mask bit is clear is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_or_epi64&expand=4046)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vporq))]
pub unsafe fn _mm256_mask_or_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Lanes not selected by `k` fall back to the corresponding lane of `src`.
    let passthrough = src.as_i64x4();
    let ored = _mm256_or_epi64(a, b).as_i64x4();
    transmute(simd_select_bitmask(k, ored, passthrough))
}

/// Bitwise OR of the packed 64-bit integers in `a` and `b` under zeromask `k`: a lane whose
/// mask bit is set receives the OR result, a lane whose mask bit is clear is zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_or_epi64&expand=4047)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vporq))]
pub unsafe fn _mm256_maskz_or_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Lanes not selected by `k` come from an all-zero vector.
    let cleared = _mm256_setzero_si256().as_i64x4();
    let ored = _mm256_or_epi64(a, b).as_i64x4();
    transmute(simd_select_bitmask(k, ored, cleared))
}

/// Bitwise OR of the packed 64-bit integers in `a` and `b`; returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_or_epi64&expand=4045)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vor))] // should be vporq
pub unsafe fn _mm_or_epi64(a: __m128i, b: __m128i) -> __m128i {
    let lhs = a.as_i64x2();
    let rhs = b.as_i64x2();
    transmute(simd_or(lhs, rhs))
}

/// Bitwise OR of the packed 64-bit integers in `a` and `b` under writemask `k`: a lane whose
/// mask bit is set receives the OR result, a lane whose mask bit is clear is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_or_epi64&expand=4043)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vporq))]
pub unsafe fn _mm_mask_or_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Lanes not selected by `k` fall back to the corresponding lane of `src`.
    let passthrough = src.as_i64x2();
    let ored = _mm_or_epi64(a, b).as_i64x2();
    transmute(simd_select_bitmask(k, ored, passthrough))
}

/// Bitwise OR of the packed 64-bit integers in `a` and `b` under zeromask `k`: a lane whose
/// mask bit is set receives the OR result, a lane whose mask bit is clear is zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_or_epi64&expand=4044)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vporq))]
pub unsafe fn _mm_maskz_or_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Lanes not selected by `k` come from an all-zero vector.
    let cleared = _mm_setzero_si128().as_i64x2();
    let ored = _mm_or_epi64(a, b).as_i64x2();
    transmute(simd_select_bitmask(k, ored, cleared))
}

/// Bitwise OR of the full 512 bits of integer data in `a` and `b`; returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_or_si512&expand=4072)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vporq))]
pub unsafe fn _mm512_or_si512(a: __m512i, b: __m512i) -> __m512i {
    // The operands are viewed as 32-bit lanes for the generic OR.
    let lhs = a.as_i32x16();
    let rhs = b.as_i32x16();
    transmute(simd_or(lhs, rhs))
}

/// Bitwise XOR of the packed 32-bit integers in `a` and `b`; returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_xor_epi32&expand=6142)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpxorq))] // should be vpxord
pub unsafe fn _mm512_xor_epi32(a: __m512i, b: __m512i) -> __m512i {
    let lhs = a.as_i32x16();
    let rhs = b.as_i32x16();
    transmute(simd_xor(lhs, rhs))
}

/// Bitwise XOR of the packed 32-bit integers in `a` and `b` under writemask `k`: a lane whose
/// mask bit is set receives the XOR result, a lane whose mask bit is clear is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_xor_epi32&expand=6140)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpxord))]
pub unsafe fn _mm512_mask_xor_epi32(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    // Lanes not selected by `k` fall back to the corresponding lane of `src`.
    let passthrough = src.as_i32x16();
    let xored = _mm512_xor_epi32(a, b).as_i32x16();
    transmute(simd_select_bitmask(k, xored, passthrough))
}

/// Bitwise XOR of the packed 32-bit integers in `a` and `b` under zeromask `k`: a lane whose
/// mask bit is set receives the XOR result, a lane whose mask bit is clear is zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_xor_epi32&expand=6141)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpxord))]
pub unsafe fn _mm512_maskz_xor_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    // Lanes not selected by `k` come from an all-zero vector.
    let cleared = _mm512_setzero_si512().as_i32x16();
    let xored = _mm512_xor_epi32(a, b).as_i32x16();
    transmute(simd_select_bitmask(k, xored, cleared))
}

/// Bitwise XOR of the packed 32-bit integers in `a` and `b`; returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_xor_epi32&expand=6139)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vxor))] // should be vpxord
pub unsafe fn _mm256_xor_epi32(a: __m256i, b: __m256i) -> __m256i {
    let lhs = a.as_i32x8();
    let rhs = b.as_i32x8();
    transmute(simd_xor(lhs, rhs))
}

/// Bitwise XOR of the packed 32-bit integers in `a` and `b` under writemask `k`: a lane whose
/// mask bit is set receives the XOR result, a lane whose mask bit is clear is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_xor_epi32&expand=6137)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpxord))]
pub unsafe fn _mm256_mask_xor_epi32(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Lanes not selected by `k` fall back to the corresponding lane of `src`.
    let passthrough = src.as_i32x8();
    let xored = _mm256_xor_epi32(a, b).as_i32x8();
    transmute(simd_select_bitmask(k, xored, passthrough))
}

/// Bitwise XOR of the packed 32-bit integers in `a` and `b` under zeromask `k`: a lane whose
/// mask bit is set receives the XOR result, a lane whose mask bit is clear is zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_xor_epi32&expand=6138)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpxord))]
pub unsafe fn _mm256_maskz_xor_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Lanes not selected by `k` come from an all-zero vector.
    let cleared = _mm256_setzero_si256().as_i32x8();
    let xored = _mm256_xor_epi32(a, b).as_i32x8();
    transmute(simd_select_bitmask(k, xored, cleared))
}

/// Bitwise XOR of the packed 32-bit integers in `a` and `b`; returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_epi32&expand=6136)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vxor))] // should be vpxord
pub unsafe fn _mm_xor_epi32(a: __m128i, b: __m128i) -> __m128i {
    let lhs = a.as_i32x4();
    let rhs = b.as_i32x4();
    transmute(simd_xor(lhs, rhs))
}

/// Bitwise XOR of the packed 32-bit integers in `a` and `b` under writemask `k`: a lane whose
/// mask bit is set receives the XOR result, a lane whose mask bit is clear is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_xor_epi32&expand=6134)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpxord))]
pub unsafe fn _mm_mask_xor_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Lanes not selected by `k` fall back to the corresponding lane of `src`.
    let passthrough = src.as_i32x4();
    let xored = _mm_xor_epi32(a, b).as_i32x4();
    transmute(simd_select_bitmask(k, xored, passthrough))
}

/// Bitwise XOR of the packed 32-bit integers in `a` and `b` under zeromask `k`: a lane whose
/// mask bit is set receives the XOR result, a lane whose mask bit is clear is zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_xor_epi32&expand=6135)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpxord))]
pub unsafe fn _mm_maskz_xor_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Lanes not selected by `k` come from an all-zero vector.
    let cleared = _mm_setzero_si128().as_i32x4();
    let xored = _mm_xor_epi32(a, b).as_i32x4();
    transmute(simd_select_bitmask(k, xored, cleared))
}

/// Bitwise XOR of the packed 64-bit integers in `a` and `b`; returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_xor_epi64&expand=6151)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpxorq))]
pub unsafe fn _mm512_xor_epi64(a: __m512i, b: __m512i) -> __m512i {
    let lhs = a.as_i64x8();
    let rhs = b.as_i64x8();
    transmute(simd_xor(lhs, rhs))
}

/// Bitwise XOR of the packed 64-bit integers in `a` and `b` under writemask `k`: a lane whose
/// mask bit is set receives the XOR result, a lane whose mask bit is clear is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_xor_epi64&expand=6149)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpxorq))]
pub unsafe fn _mm512_mask_xor_epi64(src: __m512i, k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    // Lanes not selected by `k` fall back to the corresponding lane of `src`.
    let passthrough = src.as_i64x8();
    let xored = _mm512_xor_epi64(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, xored, passthrough))
}

/// Bitwise XOR of the packed 64-bit integers in `a` and `b` under zeromask `k`: a lane whose
/// mask bit is set receives the XOR result, a lane whose mask bit is clear is zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_xor_epi64&expand=6150)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpxorq))]
pub unsafe fn _mm512_maskz_xor_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    // Lanes not selected by `k` come from an all-zero vector.
    let cleared = _mm512_setzero_si512().as_i64x8();
    let xored = _mm512_xor_epi64(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, xored, cleared))
}

/// Bitwise XOR of the packed 64-bit integers in `a` and `b`; returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_xor_epi64&expand=6148)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vxor))] // should be vpxorq
pub unsafe fn _mm256_xor_epi64(a: __m256i, b: __m256i) -> __m256i {
    let lhs = a.as_i64x4();
    let rhs = b.as_i64x4();
    transmute(simd_xor(lhs, rhs))
}

/// Bitwise XOR of the packed 64-bit integers in `a` and `b` under writemask `k`: a lane whose
/// mask bit is set receives the XOR result, a lane whose mask bit is clear is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_xor_epi64&expand=6146)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpxorq))]
pub unsafe fn _mm256_mask_xor_epi64(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Lanes not selected by `k` fall back to the corresponding lane of `src`.
    let passthrough = src.as_i64x4();
    let xored = _mm256_xor_epi64(a, b).as_i64x4();
    transmute(simd_select_bitmask(k, xored, passthrough))
}

/// Bitwise XOR of the packed 64-bit integers in `a` and `b` under zeromask `k`: a lane whose
/// mask bit is set receives the XOR result, a lane whose mask bit is clear is zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_xor_epi64&expand=6147)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpxorq))]
pub unsafe fn _mm256_maskz_xor_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // Lanes not selected by `k` come from an all-zero vector.
    let cleared = _mm256_setzero_si256().as_i64x4();
    let xored = _mm256_xor_epi64(a, b).as_i64x4();
    transmute(simd_select_bitmask(k, xored, cleared))
}

/// Bitwise XOR of the packed 64-bit integers in `a` and `b`; returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_epi64&expand=6145)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vxor))] // should be vpxorq
pub unsafe fn _mm_xor_epi64(a: __m128i, b: __m128i) -> __m128i {
    let lhs = a.as_i64x2();
    let rhs = b.as_i64x2();
    transmute(simd_xor(lhs, rhs))
}

/// Bitwise XOR of the packed 64-bit integers in `a` and `b` under writemask `k`: a lane whose
/// mask bit is set receives the XOR result, a lane whose mask bit is clear is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_xor_epi64&expand=6143)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpxorq))]
pub unsafe fn _mm_mask_xor_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Lanes not selected by `k` fall back to the corresponding lane of `src`.
    let passthrough = src.as_i64x2();
    let xored = _mm_xor_epi64(a, b).as_i64x2();
    transmute(simd_select_bitmask(k, xored, passthrough))
}

/// Bitwise XOR of the packed 64-bit integers in `a` and `b` under zeromask `k`: a lane whose
/// mask bit is set receives the XOR result, a lane whose mask bit is clear is zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_xor_epi64&expand=6144)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpxorq))]
pub unsafe fn _mm_maskz_xor_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // Lanes not selected by `k` come from an all-zero vector.
    let cleared = _mm_setzero_si128().as_i64x2();
    let xored = _mm_xor_epi64(a, b).as_i64x2();
    transmute(simd_select_bitmask(k, xored, cleared))
}

/// Bitwise XOR of the full 512 bits of integer data in `a` and `b`; returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_xor_si512&expand=6172)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpxorq))]
pub unsafe fn _mm512_xor_si512(a: __m512i, b: __m512i) -> __m512i {
    // The operands are viewed as 32-bit lanes for the generic XOR.
    let lhs = a.as_i32x16();
    let rhs = b.as_i32x16();
    transmute(simd_xor(lhs, rhs))
}

/// Inverts every bit of the packed 32-bit integers in `a`, ANDs the inverted value with `b`,
/// and returns the result (`!a & b`).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_andnot_epi32&expand=310)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpandnq))] // should be vpandnd
pub unsafe fn _mm512_andnot_epi32(a: __m512i, b: __m512i) -> __m512i {
    // XOR against an all-ones vector flips every bit of `a`.
    let all_ones = _mm512_set1_epi32(u32::MAX as i32);
    let not_a = _mm512_xor_epi32(a, all_ones);
    _mm512_and_epi32(not_a, b)
}

/// Computes `!a & b` on packed 32-bit integers under writemask `k`: a lane whose mask bit is
/// set receives the ANDNOT result, a lane whose mask bit is clear is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_andnot_epi32&expand=311)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpandnd))]
pub unsafe fn _mm512_mask_andnot_epi32(
    src: __m512i,
    k: __mmask16,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    // Lanes not selected by `k` fall back to the corresponding lane of `src`.
    let passthrough = src.as_i32x16();
    let computed = _mm512_andnot_epi32(a, b).as_i32x16();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Computes `!a & b` on packed 32-bit integers under zeromask `k`: a lane whose mask bit is
/// set receives the ANDNOT result, a lane whose mask bit is clear is zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_andnot_epi32&expand=312)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpandnd))]
pub unsafe fn _mm512_maskz_andnot_epi32(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
    // Lanes not selected by `k` come from an all-zero vector.
    let cleared = _mm512_setzero_si512().as_i32x16();
    let computed = _mm512_andnot_epi32(a, b).as_i32x16();
    transmute(simd_select_bitmask(k, computed, cleared))
}

/// Computes `!a & b` on packed 32-bit integers under writemask `k`: a lane whose mask bit is
/// set receives the ANDNOT result, a lane whose mask bit is clear is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_andnot_epi32&expand=308)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpandnd))]
pub unsafe fn _mm256_mask_andnot_epi32(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    // XOR against an all-ones vector flips every bit of `a`.
    let all_ones = _mm256_set1_epi32(u32::MAX as i32);
    let inverted = _mm256_xor_epi32(a, all_ones).as_i32x8();
    let computed = simd_and(inverted, b.as_i32x8());
    // Lanes not selected by `k` fall back to the corresponding lane of `src`.
    transmute(simd_select_bitmask(k, computed, src.as_i32x8()))
}

/// Computes `!a & b` on packed 32-bit integers under zeromask `k`: a lane whose mask bit is
/// set receives the ANDNOT result, a lane whose mask bit is clear is zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_andnot_epi32&expand=309)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpandnd))]
pub unsafe fn _mm256_maskz_andnot_epi32(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // XOR against an all-ones vector flips every bit of `a`.
    let all_ones = _mm256_set1_epi32(u32::MAX as i32);
    let inverted = _mm256_xor_epi32(a, all_ones).as_i32x8();
    let computed = simd_and(inverted, b.as_i32x8());
    // Lanes not selected by `k` come from an all-zero vector.
    let cleared = _mm256_setzero_si256().as_i32x8();
    transmute(simd_select_bitmask(k, computed, cleared))
}

/// Computes `!a & b` on packed 32-bit integers under writemask `k`: a lane whose mask bit is
/// set receives the ANDNOT result, a lane whose mask bit is clear is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_andnot_epi32&expand=306)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpandnd))]
pub unsafe fn _mm_mask_andnot_epi32(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // XOR against an all-ones vector flips every bit of `a`.
    let all_ones = _mm_set1_epi32(u32::MAX as i32);
    let inverted = _mm_xor_epi32(a, all_ones).as_i32x4();
    let computed = simd_and(inverted, b.as_i32x4());
    // Lanes not selected by `k` fall back to the corresponding lane of `src`.
    transmute(simd_select_bitmask(k, computed, src.as_i32x4()))
}

/// Computes `!a & b` on packed 32-bit integers under zeromask `k`: a lane whose mask bit is
/// set receives the ANDNOT result, a lane whose mask bit is clear is zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_andnot_epi32&expand=307)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpandnd))]
pub unsafe fn _mm_maskz_andnot_epi32(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // XOR against an all-ones vector flips every bit of `a`.
    let all_ones = _mm_set1_epi32(u32::MAX as i32);
    let inverted = _mm_xor_epi32(a, all_ones).as_i32x4();
    let computed = simd_and(inverted, b.as_i32x4());
    // Lanes not selected by `k` come from an all-zero vector.
    let cleared = _mm_setzero_si128().as_i32x4();
    transmute(simd_select_bitmask(k, computed, cleared))
}

/// Inverts every bit of the 512 bits in `a` (treated as packed 64-bit integers), ANDs the
/// inverted value with `b`, and returns the result (`!a & b`).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_andnot_epi64&expand=317)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpandnq))]
pub unsafe fn _mm512_andnot_epi64(a: __m512i, b: __m512i) -> __m512i {
    // XOR against an all-ones vector flips every bit of `a`.
    let all_ones = _mm512_set1_epi64(u64::MAX as i64);
    let not_a = _mm512_xor_epi64(a, all_ones);
    _mm512_and_epi64(not_a, b)
}

/// Computes `!a & b` on packed 64-bit integers under writemask `k`: a lane whose mask bit is
/// set receives the ANDNOT result, a lane whose mask bit is clear is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_andnot_epi64&expand=318)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpandnq))]
pub unsafe fn _mm512_mask_andnot_epi64(
    src: __m512i,
    k: __mmask8,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    // Lanes not selected by `k` fall back to the corresponding lane of `src`.
    let passthrough = src.as_i64x8();
    let computed = _mm512_andnot_epi64(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, computed, passthrough))
}

/// Computes `!a & b` on packed 64-bit integers under zeromask `k`: a lane whose mask bit is
/// set receives the ANDNOT result, a lane whose mask bit is clear is zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_andnot_epi64&expand=319)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpandnq))]
pub unsafe fn _mm512_maskz_andnot_epi64(k: __mmask8, a: __m512i, b: __m512i) -> __m512i {
    // Lanes not selected by `k` come from an all-zero vector.
    let cleared = _mm512_setzero_si512().as_i64x8();
    let computed = _mm512_andnot_epi64(a, b).as_i64x8();
    transmute(simd_select_bitmask(k, computed, cleared))
}

/// Computes `!a & b` on packed 64-bit integers under writemask `k`: a lane whose mask bit is
/// set receives the ANDNOT result, a lane whose mask bit is clear is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_andnot_epi64&expand=315)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpandnq))]
pub unsafe fn _mm256_mask_andnot_epi64(
    src: __m256i,
    k: __mmask8,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    // XOR against an all-ones vector flips every bit of `a`.
    let all_ones = _mm256_set1_epi64x(u64::MAX as i64);
    let inverted = _mm256_xor_epi64(a, all_ones).as_i64x4();
    let computed = simd_and(inverted, b.as_i64x4());
    // Lanes not selected by `k` fall back to the corresponding lane of `src`.
    transmute(simd_select_bitmask(k, computed, src.as_i64x4()))
}

/// Computes `!a & b` on packed 64-bit integers under zeromask `k`: a lane whose mask bit is
/// set receives the ANDNOT result, a lane whose mask bit is clear is zeroed.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_andnot_epi64&expand=316)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpandnq))]
pub unsafe fn _mm256_maskz_andnot_epi64(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
    // XOR against an all-ones vector flips every bit of `a`.
    let all_ones = _mm256_set1_epi64x(u64::MAX as i64);
    let inverted = _mm256_xor_epi64(a, all_ones).as_i64x4();
    let computed = simd_and(inverted, b.as_i64x4());
    // Lanes not selected by `k` come from an all-zero vector.
    let cleared = _mm256_setzero_si256().as_i64x4();
    transmute(simd_select_bitmask(k, computed, cleared))
}

/// Computes `!a & b` on packed 64-bit integers under writemask `k`: a lane whose mask bit is
/// set receives the ANDNOT result, a lane whose mask bit is clear is copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_andnot_epi64&expand=313)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpandnq))]
pub unsafe fn _mm_mask_andnot_epi64(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // XOR against an all-ones vector flips every bit of `a`.
    let all_ones = _mm_set1_epi64x(u64::MAX as i64);
    let inverted = _mm_xor_epi64(a, all_ones).as_i64x2();
    let computed = simd_and(inverted, b.as_i64x2());
    // Lanes not selected by `k` fall back to the corresponding lane of `src`.
    transmute(simd_select_bitmask(k, computed, src.as_i64x2()))
}

/// Computes `(NOT a) AND b` on packed 64-bit integers and stores the result in dst under zeromask k: elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_andnot_epi64&expand=314)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpandnq))]
pub unsafe fn _mm_maskz_andnot_epi64(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
    // XOR against an all-ones vector flips every bit of `a`.
    let all_ones = _mm_set1_epi64x(u64::MAX as i64);
    let inverted = _mm_xor_epi64(a, all_ones).as_i64x2();
    let combined = simd_and(inverted, b.as_i64x2());
    let zeroes = _mm_setzero_si128().as_i64x2();
    transmute(simd_select_bitmask(k, combined, zeroes))
}

/// Computes `(NOT a) AND b` over all 512 bits (treated as integer data) and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_andnot_si512&expand=340)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpandnq))]
pub unsafe fn _mm512_andnot_si512(a: __m512i, b: __m512i) -> __m512i {
    // XOR against an all-ones vector flips every bit of `a`.
    let all_ones = _mm512_set1_epi64(u64::MAX as i64);
    let inverted = _mm512_xor_epi64(a, all_ones);
    _mm512_and_epi64(inverted, b)
}

/// Returns the bitwise AND of the 16-bit masks a and b.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=kand_mask16&expand=3212)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw
pub unsafe fn _kand_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
    // The mask type is a plain 16-bit integer, so the integer operator applies directly.
    a & b
}

/// Returns the bitwise AND of the 16-bit masks a and b.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kand&expand=3210)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(and))] // generate normal and code instead of kandw
pub unsafe fn _mm512_kand(a: __mmask16, b: __mmask16) -> __mmask16 {
    // The mask type is a plain 16-bit integer, so the integer operator applies directly.
    a & b
}

/// Returns the bitwise OR of the 16-bit masks a and b.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=kor_mask16&expand=3239)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw
pub unsafe fn _kor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
    // The mask type is a plain 16-bit integer, so the integer operator applies directly.
    a | b
}

/// Returns the bitwise OR of the 16-bit masks a and b.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kor&expand=3237)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(or))] // generate normal or code instead of korw
pub unsafe fn _mm512_kor(a: __mmask16, b: __mmask16) -> __mmask16 {
    // The mask type is a plain 16-bit integer, so the integer operator applies directly.
    a | b
}

/// Returns the bitwise XOR of the 16-bit masks a and b.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=kxor_mask16&expand=3291)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw
pub unsafe fn _kxor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
    // The mask type is a plain 16-bit integer, so the integer operator applies directly.
    a ^ b
}

/// Returns the bitwise XOR of the 16-bit masks a and b.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kxor&expand=3289)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(xor))] // generate normal xor code instead of kxorw
pub unsafe fn _mm512_kxor(a: __mmask16, b: __mmask16) -> __mmask16 {
    // The mask type is a plain 16-bit integer, so the integer operator applies directly.
    a ^ b
}

/// Returns the bitwise NOT of the 16-bit mask a (all sixteen bits are flipped).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=knot_mask16&expand=3233)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _knot_mask16(a: __mmask16) -> __mmask16 {
    // On a 16-bit integer, `!a` is the same as XOR with 0xFFFF.
    !a
}

/// Returns the bitwise NOT of the 16-bit mask a (all sixteen bits are flipped).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_knot&expand=3231)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_knot(a: __mmask16) -> __mmask16 {
    // On a 16-bit integer, `!a` is the same as XOR with 0xFFFF.
    !a
}

/// Returns `(NOT a) AND b` for the 16-bit masks a and b.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=kandn_mask16&expand=3218)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(not))] // generate normal and, not code instead of kandnw
pub unsafe fn _kandn_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
    let inverted = _mm512_knot(a);
    _mm512_kand(inverted, b)
}

/// Returns `(NOT a) AND b` for the 16-bit masks a and b.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kandn&expand=3216)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(not))] // generate normal and, not code instead of kandnw
pub unsafe fn _mm512_kandn(a: __mmask16, b: __mmask16) -> __mmask16 {
    let inverted = _mm512_knot(a);
    _mm512_kand(inverted, b)
}

/// Returns the bitwise XNOR (`NOT (a XOR b)`) of the 16-bit masks a and b.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=kxnor_mask16&expand=3285)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(xor))] // generate normal xor, not code instead of kxnorw
pub unsafe fn _kxnor_mask16(a: __mmask16, b: __mmask16) -> __mmask16 {
    let differing = _mm512_kxor(a, b);
    _mm512_knot(differing)
}

/// Returns the bitwise XNOR (`NOT (a XOR b)`) of the 16-bit masks a and b.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kxnor&expand=3283)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(xor))] // generate normal xor, not code instead of kxnorw
pub unsafe fn _mm512_kxnor(a: __mmask16, b: __mmask16) -> __mmask16 {
    let differing = _mm512_kxor(a, b);
    _mm512_knot(differing)
}

/// Returns a copy of the 16-bit mask a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm512_kmov&expand=3228)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(mov))] // generate normal mov code instead of kmovw
pub unsafe fn _mm512_kmov(a: __mmask16) -> __mmask16 {
    // The mask is a plain 16-bit integer; copying it is the identity.
    a
}

/// Converts the integer `mask` into a 16-bit bitmask by keeping its low 16 bits.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_int2mask&expand=3189)
#[inline]
#[target_feature(enable = "avx512f")] // generate normal and code instead of kmovw
pub unsafe fn _mm512_int2mask(mask: i32) -> __mmask16 {
    // Truncating cast: the upper 16 bits of `mask` are discarded.
    mask as u16
}

/// Converts the 16-bit mask k1 into a 32-bit integer; the mask occupies the low 16 bits and the upper bits are zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_mask2int&expand=3544)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kmovw
pub unsafe fn _mm512_mask2int(k1: __mmask16) -> i32 {
    // Widening an unsigned 16-bit value zero-extends it.
    k1 as i32
}

/// Unpack and interleave 8 bits from masks a and b, and store the 16-bit result in k.
///
/// The low 8 bits of `b` become bits 7:0 of the result and the low 8 bits of
/// `a` become bits 15:8 (Intel pseudo-code: `k[7:0] := b[7:0]`,
/// `k[15:8] := a[7:0]`). The upper 8 bits of both inputs are ignored.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kunpackb&expand=3280)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(mov))] // generate normal and code instead of kunpckbw
pub unsafe fn _mm512_kunpackb(a: __mmask16, b: __mmask16) -> __mmask16 {
    // Low byte of `a` moves up to form the high byte of the result.
    let high = (a & 0b00000000_11111111) << 8;
    // Low byte of `b` stays in place as the low byte of the result.
    let low = b & 0b00000000_11111111;
    high | low
}

/// Performs a bitwise OR between masks a and b and returns 1 if the result consists of all 1's, otherwise 0.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_kortestc&expand=3247)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(cmp))] // generate normal and code instead of kortestw
pub unsafe fn _mm512_kortestc(a: __mmask16, b: __mmask16) -> i32 {
    let combined = a | b;
    // `true as i32` is 1 and `false as i32` is 0.
    (combined == u16::MAX) as i32
}

/// ANDs the packed 32-bit integers in a and b element-wise and sets the corresponding bit of the result mask for every element whose AND is non-zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi32_mask&expand=5890)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vptestmd))]
pub unsafe fn _mm512_test_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
    let overlap = _mm512_and_epi32(a, b);
    // "Non-zero" is expressed as "not equal to the zero vector".
    _mm512_cmpneq_epi32_mask(overlap, _mm512_setzero_si512())
}

/// ANDs the packed 32-bit integers in a and b element-wise and, subject to writemask k, sets the corresponding bit of the result mask for every element whose AND is non-zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi32_mask&expand=5889)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vptestmd))]
pub unsafe fn _mm512_mask_test_epi32_mask(k: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
    let overlap = _mm512_and_epi32(a, b);
    // "Non-zero" is expressed as "not equal to the zero vector".
    _mm512_mask_cmpneq_epi32_mask(k, overlap, _mm512_setzero_si512())
}

/// ANDs the packed 32-bit integers in a and b element-wise and sets the corresponding bit of the result mask for every element whose AND is non-zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_test_epi32_mask&expand=5888)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vptestmd))]
pub unsafe fn _mm256_test_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 {
    let overlap = _mm256_and_si256(a, b);
    // "Non-zero" is expressed as "not equal to the zero vector".
    _mm256_cmpneq_epi32_mask(overlap, _mm256_setzero_si256())
}

/// ANDs the packed 32-bit integers in a and b element-wise and, subject to writemask k, sets the corresponding bit of the result mask for every element whose AND is non-zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_test_epi32_mask&expand=5887)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vptestmd))]
pub unsafe fn _mm256_mask_test_epi32_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let overlap = _mm256_and_si256(a, b);
    // "Non-zero" is expressed as "not equal to the zero vector".
    _mm256_mask_cmpneq_epi32_mask(k, overlap, _mm256_setzero_si256())
}

/// ANDs the packed 32-bit integers in a and b element-wise and sets the corresponding bit of the result mask for every element whose AND is non-zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_epi32_mask&expand=5886)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vptestmd))]
pub unsafe fn _mm_test_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 {
    let overlap = _mm_and_si128(a, b);
    // "Non-zero" is expressed as "not equal to the zero vector".
    _mm_cmpneq_epi32_mask(overlap, _mm_setzero_si128())
}

/// ANDs the packed 32-bit integers in a and b element-wise and, subject to writemask k, sets the corresponding bit of the result mask for every element whose AND is non-zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_test_epi32_mask&expand=5885)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vptestmd))]
pub unsafe fn _mm_mask_test_epi32_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let overlap = _mm_and_si128(a, b);
    // "Non-zero" is expressed as "not equal to the zero vector".
    _mm_mask_cmpneq_epi32_mask(k, overlap, _mm_setzero_si128())
}

/// ANDs the packed 64-bit integers in a and b element-wise and sets the corresponding bit of the result mask for every element whose AND is non-zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_test_epi64_mask&expand=5896)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vptestmq))]
pub unsafe fn _mm512_test_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
    let overlap = _mm512_and_epi64(a, b);
    // "Non-zero" is expressed as "not equal to the zero vector".
    _mm512_cmpneq_epi64_mask(overlap, _mm512_setzero_si512())
}

/// ANDs the packed 64-bit integers in a and b element-wise and, subject to writemask k, sets the corresponding bit of the result mask for every element whose AND is non-zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_test_epi64_mask&expand=5895)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vptestmq))]
pub unsafe fn _mm512_mask_test_epi64_mask(k: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
    let overlap = _mm512_and_epi64(a, b);
    // "Non-zero" is expressed as "not equal to the zero vector".
    _mm512_mask_cmpneq_epi64_mask(k, overlap, _mm512_setzero_si512())
}

/// ANDs the packed 64-bit integers in a and b element-wise and sets the corresponding bit of the result mask for every element whose AND is non-zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_test_epi64_mask&expand=5894)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vptestmq))]
pub unsafe fn _mm256_test_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 {
    let overlap = _mm256_and_si256(a, b);
    // "Non-zero" is expressed as "not equal to the zero vector".
    _mm256_cmpneq_epi64_mask(overlap, _mm256_setzero_si256())
}

/// ANDs the packed 64-bit integers in a and b element-wise and, subject to writemask k, sets the corresponding bit of the result mask for every element whose AND is non-zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_test_epi64_mask&expand=5893)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vptestmq))]
pub unsafe fn _mm256_mask_test_epi64_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let overlap = _mm256_and_si256(a, b);
    // "Non-zero" is expressed as "not equal to the zero vector".
    _mm256_mask_cmpneq_epi64_mask(k, overlap, _mm256_setzero_si256())
}

/// ANDs the packed 64-bit integers in a and b element-wise and sets the corresponding bit of the result mask for every element whose AND is non-zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_epi64_mask&expand=5892)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vptestmq))]
pub unsafe fn _mm_test_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 {
    let overlap = _mm_and_si128(a, b);
    // "Non-zero" is expressed as "not equal to the zero vector".
    _mm_cmpneq_epi64_mask(overlap, _mm_setzero_si128())
}

/// ANDs the packed 64-bit integers in a and b element-wise and, subject to writemask k, sets the corresponding bit of the result mask for every element whose AND is non-zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_test_epi64_mask&expand=5891)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vptestmq))]
pub unsafe fn _mm_mask_test_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let overlap = _mm_and_si128(a, b);
    // "Non-zero" is expressed as "not equal to the zero vector".
    _mm_mask_cmpneq_epi64_mask(k, overlap, _mm_setzero_si128())
}

/// ANDs the packed 32-bit integers in a and b element-wise and sets the corresponding bit of the result mask for every element whose AND is zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_testn_epi32_mask&expand=5921)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vptestnmd))]
pub unsafe fn _mm512_testn_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
    let overlap = _mm512_and_epi32(a, b);
    // "Zero" is expressed as "equal to the zero vector".
    _mm512_cmpeq_epi32_mask(overlap, _mm512_setzero_si512())
}

/// ANDs the packed 32-bit integers in a and b element-wise and, subject to writemask k, sets the corresponding bit of the result mask for every element whose AND is zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi32_mask&expand=5920)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vptestnmd))]
pub unsafe fn _mm512_mask_testn_epi32_mask(k: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
    let overlap = _mm512_and_epi32(a, b);
    // "Zero" is expressed as "equal to the zero vector".
    _mm512_mask_cmpeq_epi32_mask(k, overlap, _mm512_setzero_si512())
}

/// ANDs the packed 32-bit integers in a and b element-wise and sets the corresponding bit of the result mask for every element whose AND is zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testn_epi32_mask&expand=5919)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vptestnmd))]
pub unsafe fn _mm256_testn_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 {
    let overlap = _mm256_and_si256(a, b);
    // "Zero" is expressed as "equal to the zero vector".
    _mm256_cmpeq_epi32_mask(overlap, _mm256_setzero_si256())
}

/// ANDs the packed 32-bit integers in a and b element-wise and, subject to writemask k, sets the corresponding bit of the result mask for every element whose AND is zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_testn_epi32_mask&expand=5918)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vptestnmd))]
pub unsafe fn _mm256_mask_testn_epi32_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let overlap = _mm256_and_si256(a, b);
    // "Zero" is expressed as "equal to the zero vector".
    _mm256_mask_cmpeq_epi32_mask(k, overlap, _mm256_setzero_si256())
}

/// ANDs the packed 32-bit integers in a and b element-wise and sets the corresponding bit of the result mask for every element whose AND is zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testn_epi32_mask&expand=5917)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vptestnmd))]
pub unsafe fn _mm_testn_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 {
    let overlap = _mm_and_si128(a, b);
    // "Zero" is expressed as "equal to the zero vector".
    _mm_cmpeq_epi32_mask(overlap, _mm_setzero_si128())
}

/// ANDs the packed 32-bit integers in a and b element-wise and, subject to writemask k, sets the corresponding bit of the result mask for every element whose AND is zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_testn_epi32_mask&expand=5916)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vptestnmd))]
pub unsafe fn _mm_mask_testn_epi32_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let overlap = _mm_and_si128(a, b);
    // "Zero" is expressed as "equal to the zero vector".
    _mm_mask_cmpeq_epi32_mask(k, overlap, _mm_setzero_si128())
}

/// ANDs the packed 64-bit integers in a and b element-wise and sets the corresponding bit of the result mask for every element whose AND is zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_testn_epi64_mask&expand=5927)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vptestnmq))]
pub unsafe fn _mm512_testn_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
    let overlap = _mm512_and_epi64(a, b);
    // "Zero" is expressed as "equal to the zero vector".
    _mm512_cmpeq_epi64_mask(overlap, _mm512_setzero_si512())
}

/// ANDs the packed 64-bit integers in a and b element-wise and, subject to writemask k, sets the corresponding bit of the result mask for every element whose AND is zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_testn_epi64_mask&expand=5926)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vptestnmq))]
pub unsafe fn _mm512_mask_testn_epi64_mask(k: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
    let overlap = _mm512_and_epi64(a, b);
    // "Zero" is expressed as "equal to the zero vector".
    _mm512_mask_cmpeq_epi64_mask(k, overlap, _mm512_setzero_si512())
}

/// ANDs the packed 64-bit integers in a and b element-wise and sets the corresponding bit of the result mask for every element whose AND is zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_testn_epi64_mask&expand=5925)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vptestnmq))]
pub unsafe fn _mm256_testn_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 {
    let overlap = _mm256_and_si256(a, b);
    // "Zero" is expressed as "equal to the zero vector".
    _mm256_cmpeq_epi64_mask(overlap, _mm256_setzero_si256())
}

/// ANDs the packed 64-bit integers in a and b element-wise and, subject to writemask k, sets the corresponding bit of the result mask for every element whose AND is zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_testn_epi64_mask&expand=5924)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vptestnmq))]
pub unsafe fn _mm256_mask_testn_epi64_mask(k: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let overlap = _mm256_and_si256(a, b);
    // "Zero" is expressed as "equal to the zero vector".
    _mm256_mask_cmpeq_epi64_mask(k, overlap, _mm256_setzero_si256())
}

/// ANDs the packed 64-bit integers in a and b element-wise and sets the corresponding bit of the result mask for every element whose AND is zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testn_epi64_mask&expand=5923)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vptestnmq))]
pub unsafe fn _mm_testn_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 {
    let overlap = _mm_and_si128(a, b);
    // "Zero" is expressed as "equal to the zero vector".
    _mm_cmpeq_epi64_mask(overlap, _mm_setzero_si128())
}

/// ANDs the packed 64-bit integers in a and b element-wise and, subject to writemask k, sets the corresponding bit of the result mask for every element whose AND is zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_testn_epi64_mask&expand=5922)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vptestnmq))]
pub unsafe fn _mm_mask_testn_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let overlap = _mm_and_si128(a, b);
    // "Zero" is expressed as "equal to the zero vector".
    _mm_mask_cmpeq_epi64_mask(k, overlap, _mm_setzero_si128())
}

/// Stores the 512 bits of a (16 packed single-precision (32-bit) floating-point elements) to memory with a non-temporal hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_ps&expand=5671)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovntps))]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
    // Reinterpret the element pointer as a pointer to a whole 512-bit vector.
    let dst = mem_addr as *mut __m512;
    intrinsics::nontemporal_store(dst, a);
}

/// Stores the 512 bits of a (8 packed double-precision (64-bit) floating-point elements) to memory with a non-temporal hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_pd&expand=5667)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovntps))] //should be vmovntpd
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
    // Reinterpret the element pointer as a pointer to a whole 512-bit vector.
    let dst = mem_addr as *mut __m512d;
    intrinsics::nontemporal_store(dst, a);
}

/// Stores the 512 bits of integer data in a to memory with a non-temporal hint. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_stream_si512&expand=5675)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovntps))] //should be vmovntdq
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm512_stream_si512(mem_addr: *mut i64, a: __m512i) {
    // Reinterpret the element pointer as a pointer to a whole 512-bit vector.
    let dst = mem_addr as *mut __m512i;
    intrinsics::nontemporal_store(dst, a);
}

/// Sets packed 32-bit integers in `dst` with the supplied values.
///
/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_ps&expand=4931)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_set_ps(
    e0: f32,
    e1: f32,
    e2: f32,
    e3: f32,
    e4: f32,
    e5: f32,
    e6: f32,
    e7: f32,
    e8: f32,
    e9: f32,
    e10: f32,
    e11: f32,
    e12: f32,
    e13: f32,
    e14: f32,
    e15: f32,
) -> __m512 {
    _mm512_setr_ps(
        e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0,
    )
}

/// Sets packed single-precision (32-bit) floating-point elements in `dst`
/// with the supplied values in reverse order.
///
/// The arguments are passed to the vector constructor in the order given,
/// starting with `e0`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_ps&expand=5008)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_setr_ps(
    e0: f32,
    e1: f32,
    e2: f32,
    e3: f32,
    e4: f32,
    e5: f32,
    e6: f32,
    e7: f32,
    e8: f32,
    e9: f32,
    e10: f32,
    e11: f32,
    e12: f32,
    e13: f32,
    e14: f32,
    e15: f32,
) -> __m512 {
    transmute(f32x16::new(
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    ))
}

/// Broadcasts the double-precision (64-bit) float `a` to all 8 elements of `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_pd&expand=4975)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_set1_pd(a: f64) -> __m512d {
    let lanes = f64x8::splat(a);
    transmute(lanes)
}

/// Broadcasts the single-precision (32-bit) float `a` to all 16 elements of `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_ps&expand=4981)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_set1_ps(a: f32) -> __m512 {
    let lanes = f32x16::splat(a);
    transmute(lanes)
}

/// Sets packed 32-bit integers in `dst` with the supplied values.
///
/// The parameters are declared from `e15` down to `e0`; they are forwarded
/// to `_mm512_setr_epi32` in the opposite order (`e0` first, `e15` last).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_epi32&expand=4908)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_set_epi32(
    e15: i32,
    e14: i32,
    e13: i32,
    e12: i32,
    e11: i32,
    e10: i32,
    e9: i32,
    e8: i32,
    e7: i32,
    e6: i32,
    e5: i32,
    e4: i32,
    e3: i32,
    e2: i32,
    e1: i32,
    e0: i32,
) -> __m512i {
    // Hand the values over lowest-numbered first, as the `setr` variant expects.
    _mm512_setr_epi32(
        e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15,
    )
}

/// Broadcasts the 8-bit integer a to all 64 elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_epi8&expand=4972)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_set1_epi8(a: i8) -> __m512i {
    let lanes = i8x64::splat(a);
    transmute(lanes)
}

/// Broadcasts the 16-bit integer a to all 32 elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_epi16&expand=4944)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_set1_epi16(a: i16) -> __m512i {
    let lanes = i16x32::splat(a);
    transmute(lanes)
}

/// Broadcasts the 32-bit integer `a` to all 16 elements of `dst`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_epi32)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_set1_epi32(a: i32) -> __m512i {
    let lanes = i32x16::splat(a);
    transmute(lanes)
}

/// Broadcasts the 32-bit integer a to the elements of dst selected by writemask k; elements whose mask bit is not set are copied from src.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi32&expand=4951)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpbroadcastd))]
pub unsafe fn _mm512_mask_set1_epi32(src: __m512i, k: __mmask16, a: i32) -> __m512i {
    let broadcast = _mm512_set1_epi32(a).as_i32x16();
    let fallback = src.as_i32x16();
    transmute(simd_select_bitmask(k, broadcast, fallback))
}

/// Broadcasts the 32-bit integer a to the elements of dst selected by zeromask k; elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi32&expand=4952)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpbroadcastd))]
pub unsafe fn _mm512_maskz_set1_epi32(k: __mmask16, a: i32) -> __m512i {
    let broadcast = _mm512_set1_epi32(a).as_i32x16();
    let zeroes = _mm512_setzero_si512().as_i32x16();
    transmute(simd_select_bitmask(k, broadcast, zeroes))
}

/// Broadcasts the 32-bit integer a to the elements of dst selected by writemask k; elements whose mask bit is not set are copied from src.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_set1_epi32&expand=4948)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpbroadcastd))]
pub unsafe fn _mm256_mask_set1_epi32(src: __m256i, k: __mmask8, a: i32) -> __m256i {
    let broadcast = _mm256_set1_epi32(a).as_i32x8();
    let fallback = src.as_i32x8();
    transmute(simd_select_bitmask(k, broadcast, fallback))
}

/// Broadcasts the 32-bit integer a to the elements of dst selected by zeromask k; elements whose mask bit is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_set1_epi32&expand=4949)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpbroadcastd))]
pub unsafe fn _mm256_maskz_set1_epi32(k: __mmask8, a: i32) -> __m256i {
    let broadcast = _mm256_set1_epi32(a).as_i32x8();
    let zeroes = _mm256_setzero_si256().as_i32x8();
    transmute(simd_select_bitmask(k, broadcast, zeroes))
}

/// Broadcasts the 32-bit integer a to the elements of dst selected by writemask k; elements whose mask bit is not set are copied from src.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_set1_epi32&expand=4945)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpbroadcastd))]
pub unsafe fn _mm_mask_set1_epi32(src: __m128i, k: __mmask8, a: i32) -> __m128i {
    let broadcast = _mm_set1_epi32(a).as_i32x4();
    let fallback = src.as_i32x4();
    transmute(simd_select_bitmask(k, broadcast, fallback))
}

/// Broadcasts the 32-bit integer `a` to every element of the result under zeromask `k`.
///
/// Elements whose bit in `k` is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_set1_epi32&expand=4946)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpbroadcastd))]
pub unsafe fn _mm_maskz_set1_epi32(k: __mmask8, a: i32) -> __m128i {
    let broadcast = _mm_set1_epi32(a).as_i32x4();
    let zeroed = _mm_setzero_si128().as_i32x4();
    // Set mask bits pick the broadcast value, clear bits pick zero.
    let blended = simd_select_bitmask(k, broadcast, zeroed);
    transmute(blended)
}

/// Broadcasts the 64-bit integer `a` to all eight elements of the returned vector.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set1_epi64&expand=4961)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_set1_epi64(a: i64) -> __m512i {
    let lanes = i64x8::splat(a);
    transmute(lanes)
}

/// Broadcasts the 64-bit integer `a` to every element of the result under writemask `k`.
///
/// Elements whose bit in `k` is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_set1_epi64&expand=4959)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpbroadcastq))]
pub unsafe fn _mm512_mask_set1_epi64(src: __m512i, k: __mmask8, a: i64) -> __m512i {
    let broadcast = _mm512_set1_epi64(a).as_i64x8();
    let fallback = src.as_i64x8();
    // Set mask bits pick the broadcast value, clear bits keep the `src` element.
    let blended = simd_select_bitmask(k, broadcast, fallback);
    transmute(blended)
}

/// Broadcasts the 64-bit integer `a` to every element of the result under zeromask `k`.
///
/// Elements whose bit in `k` is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_set1_epi64&expand=4960)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpbroadcastq))]
pub unsafe fn _mm512_maskz_set1_epi64(k: __mmask8, a: i64) -> __m512i {
    let broadcast = _mm512_set1_epi64(a).as_i64x8();
    let zeroed = _mm512_setzero_si512().as_i64x8();
    // Set mask bits pick the broadcast value, clear bits pick zero.
    let blended = simd_select_bitmask(k, broadcast, zeroed);
    transmute(blended)
}

/// Broadcasts the 64-bit integer `a` to every element of the result under writemask `k`.
///
/// Elements whose bit in `k` is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_set1_epi64&expand=4957)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpbroadcastq))]
pub unsafe fn _mm256_mask_set1_epi64(src: __m256i, k: __mmask8, a: i64) -> __m256i {
    let broadcast = _mm256_set1_epi64x(a).as_i64x4();
    let fallback = src.as_i64x4();
    // Set mask bits pick the broadcast value, clear bits keep the `src` element.
    let blended = simd_select_bitmask(k, broadcast, fallback);
    transmute(blended)
}

/// Broadcasts the 64-bit integer `a` to every element of the result under zeromask `k`.
///
/// Elements whose bit in `k` is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_set1_epi64&expand=4958)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpbroadcastq))]
pub unsafe fn _mm256_maskz_set1_epi64(k: __mmask8, a: i64) -> __m256i {
    let broadcast = _mm256_set1_epi64x(a).as_i64x4();
    let zeroed = _mm256_setzero_si256().as_i64x4();
    // Set mask bits pick the broadcast value, clear bits pick zero.
    let blended = simd_select_bitmask(k, broadcast, zeroed);
    transmute(blended)
}

/// Broadcasts the 64-bit integer `a` to every element of the result under writemask `k`.
///
/// Elements whose bit in `k` is not set are copied from `src`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_set1_epi64&expand=4954)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpbroadcastq))]
pub unsafe fn _mm_mask_set1_epi64(src: __m128i, k: __mmask8, a: i64) -> __m128i {
    let broadcast = _mm_set1_epi64x(a).as_i64x2();
    let fallback = src.as_i64x2();
    // Set mask bits pick the broadcast value, clear bits keep the `src` element.
    let blended = simd_select_bitmask(k, broadcast, fallback);
    transmute(blended)
}

/// Broadcasts the 64-bit integer `a` to every element of the result under zeromask `k`.
///
/// Elements whose bit in `k` is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_set1_epi64&expand=4955)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpbroadcastq))]
pub unsafe fn _mm_maskz_set1_epi64(k: __mmask8, a: i64) -> __m128i {
    let broadcast = _mm_set1_epi64x(a).as_i64x2();
    let zeroed = _mm_setzero_si128().as_i64x2();
    // Set mask bits pick the broadcast value, clear bits pick zero.
    let blended = simd_select_bitmask(k, broadcast, zeroed);
    transmute(blended)
}

/// Set packed 64-bit integers in dst with the repeated 4 element sequence.
///
/// The arguments are listed from the highest element of the sequence to the
/// lowest: `a` is stored in elements 0 and 4, `b` in elements 1 and 5, `c` in
/// elements 2 and 6, and `d` in elements 3 and 7.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set4_epi64&expand=4983)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_set4_epi64(d: i64, c: i64, b: i64, a: i64) -> __m512i {
    // `i64x8::new` takes its arguments starting at element 0, so the sequence
    // must be passed lowest-first for `a` to land in the least significant lane.
    let r = i64x8::new(a, b, c, d, a, b, c, d);
    transmute(r)
}

/// Set packed 64-bit integers in dst with the repeated 4 element sequence in reverse order.
///
/// The arguments are listed from the lowest element of the sequence to the
/// highest: `d` is stored in elements 0 and 4, `c` in elements 1 and 5, `b` in
/// elements 2 and 6, and `a` in elements 3 and 7.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr4_epi64&expand=5010)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_setr4_epi64(d: i64, c: i64, b: i64, a: i64) -> __m512i {
    // `i64x8::new` takes its arguments starting at element 0, which is already
    // the order of the reversed form: the first argument `d` goes to lane 0.
    let r = i64x8::new(d, c, b, a, d, c, b, a);
    transmute(r)
}

/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmplt_ps_mask&expand=1074)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
pub unsafe fn _mm512_cmplt_ps_mask(a: __m512, b: __m512) -> __mmask16 {
    _mm512_cmp_ps_mask::<_CMP_LT_OS>(a, b)
}

/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmplt_ps_mask&expand=1075)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
pub unsafe fn _mm512_mask_cmplt_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
    _mm512_mask_cmp_ps_mask::<_CMP_LT_OS>(k1, a, b)
}

/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpnlt_ps_mask&expand=1154)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
pub unsafe fn _mm512_cmpnlt_ps_mask(a: __m512, b: __m512) -> __mmask16 {
    _mm512_cmp_ps_mask::<_CMP_NLT_US>(a, b)
}

/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpnlt_ps_mask&expand=1155)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
pub unsafe fn _mm512_mask_cmpnlt_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
    _mm512_mask_cmp_ps_mask::<_CMP_NLT_US>(k1, a, b)
}

/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmple_ps_mask&expand=1013)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
pub unsafe fn _mm512_cmple_ps_mask(a: __m512, b: __m512) -> __mmask16 {
    _mm512_cmp_ps_mask::<_CMP_LE_OS>(a, b)
}

/// Compare packed single-precision (32-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmple_ps_mask&expand=1014)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
pub unsafe fn _mm512_mask_cmple_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
    _mm512_mask_cmp_ps_mask::<_CMP_LE_OS>(k1, a, b)
}

/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpnle_ps_mask&expand=1146)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
pub unsafe fn _mm512_cmpnle_ps_mask(a: __m512, b: __m512) -> __mmask16 {
    _mm512_cmp_ps_mask::<_CMP_NLE_US>(a, b)
}

/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpnle_ps_mask&expand=1147)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
pub unsafe fn _mm512_mask_cmpnle_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
    _mm512_mask_cmp_ps_mask::<_CMP_NLE_US>(k1, a, b)
}

/// Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpeq_ps_mask&expand=828)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
pub unsafe fn _mm512_cmpeq_ps_mask(a: __m512, b: __m512) -> __mmask16 {
    _mm512_cmp_ps_mask::<_CMP_EQ_OQ>(a, b)
}

/// Compare packed single-precision (32-bit) floating-point elements in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpeq_ps_mask&expand=829)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
pub unsafe fn _mm512_mask_cmpeq_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
    _mm512_mask_cmp_ps_mask::<_CMP_EQ_OQ>(k1, a, b)
}

/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpneq_ps_mask&expand=1130)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
pub unsafe fn _mm512_cmpneq_ps_mask(a: __m512, b: __m512) -> __mmask16 {
    _mm512_cmp_ps_mask::<_CMP_NEQ_UQ>(a, b)
}

/// Compare packed single-precision (32-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpneq_ps_mask&expand=1131)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
pub unsafe fn _mm512_mask_cmpneq_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
    _mm512_mask_cmp_ps_mask::<_CMP_NEQ_UQ>(k1, a, b)
}

/// Compares packed single-precision (32-bit) floating-point elements in `a` and `b` using
/// the comparison predicate `IMM8` and returns the per-element results as a mask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_ps_mask&expand=749)
#[inline]
#[target_feature(enable = "avx512f")]
#[rustc_legacy_const_generics(2)]
#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
pub unsafe fn _mm512_cmp_ps_mask<const IMM8: i32>(a: __m512, b: __m512) -> __mmask16 {
    static_assert_imm5!(IMM8);
    let lhs = a.as_f32x16();
    let rhs = b.as_f32x16();
    // All-ones in the mask operand position: no element is zeroed out.
    let all_lanes = -1;
    let bits = vcmpps(lhs, rhs, IMM8, all_lanes, _MM_FROUND_CUR_DIRECTION);
    transmute(bits)
}

/// Compares packed single-precision (32-bit) floating-point elements in `a` and `b` using
/// the comparison predicate `IMM8` under zeromask `k1`, and returns the per-element results
/// as a mask.
///
/// Result bits whose corresponding bit in `k1` is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_ps_mask&expand=750)
#[inline]
#[target_feature(enable = "avx512f")]
#[rustc_legacy_const_generics(3)]
#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
pub unsafe fn _mm512_mask_cmp_ps_mask<const IMM8: i32>(
    k1: __mmask16,
    a: __m512,
    b: __m512,
) -> __mmask16 {
    static_assert_imm5!(IMM8);
    let (lhs, rhs) = (a.as_f32x16(), b.as_f32x16());
    // The intrinsic takes the mask as a signed integer of the same width.
    let zeromask = k1 as i16;
    transmute(vcmpps(lhs, rhs, IMM8, zeromask, _MM_FROUND_CUR_DIRECTION))
}

/// Compares packed single-precision (32-bit) floating-point elements in `a` and `b` using
/// the comparison predicate `IMM8` and returns the per-element results as a mask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_ps_mask&expand=747)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
pub unsafe fn _mm256_cmp_ps_mask<const IMM8: i32>(a: __m256, b: __m256) -> __mmask8 {
    static_assert_imm5!(IMM8);
    let lhs = a.as_f32x8();
    let rhs = b.as_f32x8();
    // All-ones in the mask operand position: no element is zeroed out.
    let all_lanes = -1;
    let bits = vcmpps256(lhs, rhs, IMM8, all_lanes);
    transmute(bits)
}

/// Compares packed single-precision (32-bit) floating-point elements in `a` and `b` using
/// the comparison predicate `IMM8` under zeromask `k1`, and returns the per-element results
/// as a mask.
///
/// Result bits whose corresponding bit in `k1` is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_ps_mask&expand=748)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
pub unsafe fn _mm256_mask_cmp_ps_mask<const IMM8: i32>(
    k1: __mmask8,
    a: __m256,
    b: __m256,
) -> __mmask8 {
    static_assert_imm5!(IMM8);
    let (lhs, rhs) = (a.as_f32x8(), b.as_f32x8());
    // The intrinsic takes the mask as a signed integer of the same width.
    let zeromask = k1 as i8;
    transmute(vcmpps256(lhs, rhs, IMM8, zeromask))
}

/// Compares packed single-precision (32-bit) floating-point elements in `a` and `b` using
/// the comparison predicate `IMM8` and returns the per-element results as a mask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ps_mask&expand=745)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
pub unsafe fn _mm_cmp_ps_mask<const IMM8: i32>(a: __m128, b: __m128) -> __mmask8 {
    static_assert_imm5!(IMM8);
    let lhs = a.as_f32x4();
    let rhs = b.as_f32x4();
    // All-ones in the mask operand position: no element is zeroed out.
    let all_lanes = -1;
    let bits = vcmpps128(lhs, rhs, IMM8, all_lanes);
    transmute(bits)
}

/// Compares packed single-precision (32-bit) floating-point elements in `a` and `b` using
/// the comparison predicate `IMM8` under zeromask `k1`, and returns the per-element results
/// as a mask.
///
/// Result bits whose corresponding bit in `k1` is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_ps_mask&expand=746)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
pub unsafe fn _mm_mask_cmp_ps_mask<const IMM8: i32>(
    k1: __mmask8,
    a: __m128,
    b: __m128,
) -> __mmask8 {
    static_assert_imm5!(IMM8);
    let (lhs, rhs) = (a.as_f32x4(), b.as_f32x4());
    // The intrinsic takes the mask as a signed integer of the same width.
    let zeromask = k1 as i8;
    transmute(vcmpps128(lhs, rhs, IMM8, zeromask))
}

/// Compares packed single-precision (32-bit) floating-point elements in `a` and `b` using
/// the comparison predicate `IMM5` and returns the per-element results as a mask.
///
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` as `SAE`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_round_ps_mask&expand=753)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
#[rustc_legacy_const_generics(2, 3)]
pub unsafe fn _mm512_cmp_round_ps_mask<const IMM5: i32, const SAE: i32>(
    a: __m512,
    b: __m512,
) -> __mmask16 {
    static_assert_imm5!(IMM5);
    static_assert_mantissas_sae!(SAE);
    let lhs = a.as_f32x16();
    let rhs = b.as_f32x16();
    // All-ones in the mask operand position: no element is zeroed out.
    let all_lanes = -1;
    let bits = vcmpps(lhs, rhs, IMM5, all_lanes, SAE);
    transmute(bits)
}

/// Compares packed single-precision (32-bit) floating-point elements in `a` and `b` using
/// the comparison predicate `IMM5` under zeromask `m`, and returns the per-element results
/// as a mask.
///
/// Result bits whose corresponding bit in `m` is not set are zeroed out.
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` as `SAE`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_round_ps_mask&expand=754)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
#[rustc_legacy_const_generics(3, 4)]
pub unsafe fn _mm512_mask_cmp_round_ps_mask<const IMM5: i32, const SAE: i32>(
    m: __mmask16,
    a: __m512,
    b: __m512,
) -> __mmask16 {
    static_assert_imm5!(IMM5);
    static_assert_mantissas_sae!(SAE);
    let (lhs, rhs) = (a.as_f32x16(), b.as_f32x16());
    // The intrinsic takes the mask as a signed integer of the same width.
    let zeromask = m as i16;
    transmute(vcmpps(lhs, rhs, IMM5, zeromask, SAE))
}

/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpord_ps_mask&expand=1162)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmps
pub unsafe fn _mm512_cmpord_ps_mask(a: __m512, b: __m512) -> __mmask16 {
    _mm512_cmp_ps_mask::<_CMP_ORD_Q>(a, b)
}

/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpord_ps_mask&expand=1163)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
pub unsafe fn _mm512_mask_cmpord_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
    _mm512_mask_cmp_ps_mask::<_CMP_ORD_Q>(k1, a, b)
}

/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpunord_ps_mask&expand=1170)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
pub unsafe fn _mm512_cmpunord_ps_mask(a: __m512, b: __m512) -> __mmask16 {
    _mm512_cmp_ps_mask::<_CMP_UNORD_Q>(a, b)
}

/// Compare packed single-precision (32-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpunord_ps_mask&expand=1171)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmpps
pub unsafe fn _mm512_mask_cmpunord_ps_mask(k1: __mmask16, a: __m512, b: __m512) -> __mmask16 {
    _mm512_mask_cmp_ps_mask::<_CMP_UNORD_Q>(k1, a, b)
}

/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmplt_pd_mask&expand=1071)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
pub unsafe fn _mm512_cmplt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
    _mm512_cmp_pd_mask::<_CMP_LT_OS>(a, b)
}

/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmplt_pd_mask&expand=1072)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
pub unsafe fn _mm512_mask_cmplt_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
    _mm512_mask_cmp_pd_mask::<_CMP_LT_OS>(k1, a, b)
}

/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpnlt_pd_mask&expand=1151)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
pub unsafe fn _mm512_cmpnlt_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
    _mm512_cmp_pd_mask::<_CMP_NLT_US>(a, b)
}

/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpnlt_pd_mask&expand=1152)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
pub unsafe fn _mm512_mask_cmpnlt_pd_mask(m: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
    _mm512_mask_cmp_pd_mask::<_CMP_NLT_US>(m, a, b)
}

/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmple_pd_mask&expand=1010)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
pub unsafe fn _mm512_cmple_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
    _mm512_cmp_pd_mask::<_CMP_LE_OS>(a, b)
}

/// Compare packed double-precision (64-bit) floating-point elements in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmple_pd_mask&expand=1011)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
pub unsafe fn _mm512_mask_cmple_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
    _mm512_mask_cmp_pd_mask::<_CMP_LE_OS>(k1, a, b)
}

/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpnle_pd_mask&expand=1143)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
pub unsafe fn _mm512_cmpnle_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
    _mm512_cmp_pd_mask::<_CMP_NLE_US>(a, b)
}

/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpnle_pd_mask&expand=1144)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
pub unsafe fn _mm512_mask_cmpnle_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
    _mm512_mask_cmp_pd_mask::<_CMP_NLE_US>(k1, a, b)
}

/// Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpeq_pd_mask&expand=822)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
pub unsafe fn _mm512_cmpeq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
    _mm512_cmp_pd_mask::<_CMP_EQ_OQ>(a, b)
}

/// Compare packed double-precision (64-bit) floating-point elements in a and b for equality, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpeq_pd_mask&expand=823)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
pub unsafe fn _mm512_mask_cmpeq_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
    _mm512_mask_cmp_pd_mask::<_CMP_EQ_OQ>(k1, a, b)
}

/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpneq_pd_mask&expand=1127)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
pub unsafe fn _mm512_cmpneq_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
    _mm512_cmp_pd_mask::<_CMP_NEQ_UQ>(a, b)
}

/// Compare packed double-precision (64-bit) floating-point elements in a and b for not-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpneq_pd_mask&expand=1128)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
pub unsafe fn _mm512_mask_cmpneq_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
    _mm512_mask_cmp_pd_mask::<_CMP_NEQ_UQ>(k1, a, b)
}

/// Compares packed double-precision (64-bit) floating-point elements in `a` and `b` using
/// the comparison predicate `IMM8` and returns the per-element results as a mask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_pd_mask&expand=741)
#[inline]
#[target_feature(enable = "avx512f")]
#[rustc_legacy_const_generics(2)]
#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
pub unsafe fn _mm512_cmp_pd_mask<const IMM8: i32>(a: __m512d, b: __m512d) -> __mmask8 {
    static_assert_imm5!(IMM8);
    let lhs = a.as_f64x8();
    let rhs = b.as_f64x8();
    // All-ones in the mask operand position: no element is zeroed out.
    let all_lanes = -1;
    let bits = vcmppd(lhs, rhs, IMM8, all_lanes, _MM_FROUND_CUR_DIRECTION);
    transmute(bits)
}

/// Compares packed double-precision (64-bit) floating-point elements in `a` and `b` using
/// the comparison predicate `IMM8` under zeromask `k1`, and returns the per-element results
/// as a mask.
///
/// Result bits whose corresponding bit in `k1` is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_pd_mask&expand=742)
#[inline]
#[target_feature(enable = "avx512f")]
#[rustc_legacy_const_generics(3)]
#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
pub unsafe fn _mm512_mask_cmp_pd_mask<const IMM8: i32>(
    k1: __mmask8,
    a: __m512d,
    b: __m512d,
) -> __mmask8 {
    static_assert_imm5!(IMM8);
    let (lhs, rhs) = (a.as_f64x8(), b.as_f64x8());
    // The intrinsic takes the mask as a signed integer of the same width.
    let zeromask = k1 as i8;
    transmute(vcmppd(lhs, rhs, IMM8, zeromask, _MM_FROUND_CUR_DIRECTION))
}

/// Compares packed double-precision (64-bit) floating-point elements in `a` and `b` using
/// the comparison predicate `IMM8` and returns the per-element results as a mask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_pd_mask&expand=739)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
pub unsafe fn _mm256_cmp_pd_mask<const IMM8: i32>(a: __m256d, b: __m256d) -> __mmask8 {
    static_assert_imm5!(IMM8);
    let lhs = a.as_f64x4();
    let rhs = b.as_f64x4();
    // All-ones in the mask operand position: no element is zeroed out.
    let all_lanes = -1;
    let bits = vcmppd256(lhs, rhs, IMM8, all_lanes);
    transmute(bits)
}

/// Compares packed double-precision (64-bit) floating-point elements in `a` and `b` using
/// the comparison predicate `IMM8` under zeromask `k1`, and returns the per-element results
/// as a mask.
///
/// Result bits whose corresponding bit in `k1` is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_pd_mask&expand=740)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
pub unsafe fn _mm256_mask_cmp_pd_mask<const IMM8: i32>(
    k1: __mmask8,
    a: __m256d,
    b: __m256d,
) -> __mmask8 {
    static_assert_imm5!(IMM8);
    let (lhs, rhs) = (a.as_f64x4(), b.as_f64x4());
    // The intrinsic takes the mask as a signed integer of the same width.
    let zeromask = k1 as i8;
    transmute(vcmppd256(lhs, rhs, IMM8, zeromask))
}

/// Compares packed double-precision (64-bit) floating-point elements in `a` and `b` using
/// the comparison predicate `IMM8` and returns the per-element results as a mask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_pd_mask&expand=737)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
pub unsafe fn _mm_cmp_pd_mask<const IMM8: i32>(a: __m128d, b: __m128d) -> __mmask8 {
    static_assert_imm5!(IMM8);
    let lhs = a.as_f64x2();
    let rhs = b.as_f64x2();
    // All-ones in the mask operand position: no element is zeroed out.
    let all_lanes = -1;
    let bits = vcmppd128(lhs, rhs, IMM8, all_lanes);
    transmute(bits)
}

/// Compares packed double-precision (64-bit) floating-point elements in `a` and `b` using
/// the comparison predicate `IMM8` under zeromask `k1`, and returns the per-element results
/// as a mask.
///
/// Result bits whose corresponding bit in `k1` is not set are zeroed out.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_pd_mask&expand=738)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
pub unsafe fn _mm_mask_cmp_pd_mask<const IMM8: i32>(
    k1: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __mmask8 {
    static_assert_imm5!(IMM8);
    let (lhs, rhs) = (a.as_f64x2(), b.as_f64x2());
    // The intrinsic takes the mask as a signed integer of the same width.
    let zeromask = k1 as i8;
    transmute(vcmppd128(lhs, rhs, IMM8, zeromask))
}

/// Compares the packed double-precision (64-bit) floating-point elements of `a` and `b` using the comparison predicate selected by `IMM5`, and returns the results as mask vector k.\
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_round_pd_mask&expand=751)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
#[rustc_legacy_const_generics(2, 3)]
pub unsafe fn _mm512_cmp_round_pd_mask<const IMM5: i32, const SAE: i32>(
    a: __m512d,
    b: __m512d,
) -> __mmask8 {
    static_assert_imm5!(IMM5);
    static_assert_mantissas_sae!(SAE);
    // -1 sets every bit of the mask operand, so no lane is masked off.
    let raw = vcmppd(a.as_f64x8(), b.as_f64x8(), IMM5, -1, SAE);
    transmute(raw)
}

/// Compares the packed double-precision (64-bit) floating-point elements of `a` and `b` using the comparison predicate selected by `IMM5`, and returns the results as mask vector k under zeromask `k1` (a result bit is zeroed out when the corresponding bit of `k1` is not set).\
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_round_pd_mask&expand=752)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
#[rustc_legacy_const_generics(3, 4)]
pub unsafe fn _mm512_mask_cmp_round_pd_mask<const IMM5: i32, const SAE: i32>(
    k1: __mmask8,
    a: __m512d,
    b: __m512d,
) -> __mmask8 {
    static_assert_imm5!(IMM5);
    static_assert_mantissas_sae!(SAE);
    // `k1` is handed to the compare as its mask operand, reinterpreted as `i8`.
    let raw = vcmppd(a.as_f64x8(), b.as_f64x8(), IMM5, k1 as i8, SAE);
    transmute(raw)
}

/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpord_pd_mask&expand=1159)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
pub unsafe fn _mm512_cmpord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
    _mm512_cmp_pd_mask::<_CMP_ORD_Q>(a, b)
}

/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if neither is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpord_pd_mask&expand=1160)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
pub unsafe fn _mm512_mask_cmpord_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
    _mm512_mask_cmp_pd_mask::<_CMP_ORD_Q>(k1, a, b)
}

/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpunord_pd_mask&expand=1167)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
pub unsafe fn _mm512_cmpunord_pd_mask(a: __m512d, b: __m512d) -> __mmask8 {
    _mm512_cmp_pd_mask::<_CMP_UNORD_Q>(a, b)
}

/// Compare packed double-precision (64-bit) floating-point elements in a and b to see if either is NaN, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpunord_pd_mask&expand=1168)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp))] //should be vcmppd
pub unsafe fn _mm512_mask_cmpunord_pd_mask(k1: __mmask8, a: __m512d, b: __m512d) -> __mmask8 {
    _mm512_mask_cmp_pd_mask::<_CMP_UNORD_Q>(k1, a, b)
}

/// Compares the lower single-precision (32-bit) floating-point element of `a` and `b` using the comparison predicate selected by `IMM8`, and returns the result in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_ss_mask&expand=763)
#[inline]
#[target_feature(enable = "avx512f")]
#[rustc_legacy_const_generics(2)]
#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
pub unsafe fn _mm_cmp_ss_mask<const IMM8: i32>(a: __m128, b: __m128) -> __mmask8 {
    static_assert_imm5!(IMM8);
    // -1 sets every bit of the mask operand; `_MM_FROUND_CUR_DIRECTION` fills the
    // intrinsic's trailing SAE operand.
    let raw = vcmpss(a, b, IMM8, -1, _MM_FROUND_CUR_DIRECTION);
    transmute(raw)
}

/// Compares the lower single-precision (32-bit) floating-point element of `a` and `b` using the comparison predicate selected by `IMM8`, and returns the result in mask vector k under zeromask `k1` (the result is zeroed out when mask bit 0 is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_ss_mask&expand=764)
#[inline]
#[target_feature(enable = "avx512f")]
#[rustc_legacy_const_generics(3)]
#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
pub unsafe fn _mm_mask_cmp_ss_mask<const IMM8: i32>(
    k1: __mmask8,
    a: __m128,
    b: __m128,
) -> __mmask8 {
    static_assert_imm5!(IMM8);
    // `k1` becomes the mask operand; `_MM_FROUND_CUR_DIRECTION` fills the
    // intrinsic's trailing SAE operand.
    let mask_operand = k1 as i8;
    transmute(vcmpss(a, b, IMM8, mask_operand, _MM_FROUND_CUR_DIRECTION))
}

/// Compares the lower single-precision (32-bit) floating-point element of `a` and `b` using the comparison predicate selected by `IMM5`, and returns the result in mask vector k.\
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_ss_mask&expand=757)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
#[rustc_legacy_const_generics(2, 3)]
pub unsafe fn _mm_cmp_round_ss_mask<const IMM5: i32, const SAE: i32>(
    a: __m128,
    b: __m128,
) -> __mmask8 {
    static_assert_imm5!(IMM5);
    static_assert_mantissas_sae!(SAE);
    // -1 sets every bit of the mask operand, so the result is not masked off.
    let raw = vcmpss(a, b, IMM5, -1, SAE);
    transmute(raw)
}

/// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and store the result in mask vector k using zeromask k1 (the element is zeroed out when mask bit 0 is not set).\
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_ss_mask&expand=758)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
#[rustc_legacy_const_generics(3, 4)]
pub unsafe fn _mm_mask_cmp_round_ss_mask<const IMM5: i32, const SAE: i32>(
    k1: __mmask8,
    a: __m128,
    b: __m128,
) -> __mmask8 {
    static_assert_imm5!(IMM5);
    static_assert_mantissas_sae!(SAE);
    // `k1` is handed to the compare as its mask operand, reinterpreted as `i8`.
    let r = vcmpss(a, b, IMM5, k1 as i8, SAE);
    transmute(r)
}

/// Compares the lower double-precision (64-bit) floating-point element of `a` and `b` using the comparison predicate selected by `IMM8`, and returns the result in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_sd_mask&expand=760)
#[inline]
#[target_feature(enable = "avx512f")]
#[rustc_legacy_const_generics(2)]
#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
pub unsafe fn _mm_cmp_sd_mask<const IMM8: i32>(a: __m128d, b: __m128d) -> __mmask8 {
    static_assert_imm5!(IMM8);
    // -1 sets every bit of the mask operand; `_MM_FROUND_CUR_DIRECTION` fills the
    // intrinsic's trailing SAE operand.
    let raw = vcmpsd(a, b, IMM8, -1, _MM_FROUND_CUR_DIRECTION);
    transmute(raw)
}

/// Compares the lower double-precision (64-bit) floating-point element of `a` and `b` using the comparison predicate selected by `IMM8`, and returns the result in mask vector k under zeromask `k1` (the result is zeroed out when mask bit 0 is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_sd_mask&expand=761)
#[inline]
#[target_feature(enable = "avx512f")]
#[rustc_legacy_const_generics(3)]
#[cfg_attr(test, assert_instr(vcmp, IMM8 = 0))]
pub unsafe fn _mm_mask_cmp_sd_mask<const IMM8: i32>(
    k1: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __mmask8 {
    static_assert_imm5!(IMM8);
    // `k1` becomes the mask operand; `_MM_FROUND_CUR_DIRECTION` fills the
    // intrinsic's trailing SAE operand.
    let mask_operand = k1 as i8;
    transmute(vcmpsd(a, b, IMM8, mask_operand, _MM_FROUND_CUR_DIRECTION))
}

/// Compares the lower double-precision (64-bit) floating-point element of `a` and `b` using the comparison predicate selected by `IMM5`, and returns the result in mask vector k.\
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_round_sd_mask&expand=755)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
#[rustc_legacy_const_generics(2, 3)]
pub unsafe fn _mm_cmp_round_sd_mask<const IMM5: i32, const SAE: i32>(
    a: __m128d,
    b: __m128d,
) -> __mmask8 {
    static_assert_imm5!(IMM5);
    static_assert_mantissas_sae!(SAE);
    // -1 sets every bit of the mask operand, so the result is not masked off.
    let raw = vcmpsd(a, b, IMM5, -1, SAE);
    transmute(raw)
}

/// Compares the lower double-precision (64-bit) floating-point element of `a` and `b` using the comparison predicate selected by `IMM5`, and returns the result in mask vector k under zeromask `k1` (the result is zeroed out when mask bit 0 is not set).\
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_round_sd_mask&expand=756)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp, IMM5 = 0, SAE = 4))]
#[rustc_legacy_const_generics(3, 4)]
pub unsafe fn _mm_mask_cmp_round_sd_mask<const IMM5: i32, const SAE: i32>(
    k1: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __mmask8 {
    static_assert_imm5!(IMM5);
    static_assert_mantissas_sae!(SAE);
    // `k1` is handed to the compare as its mask operand, reinterpreted as `i8`.
    let mask_operand = k1 as i8;
    transmute(vcmpsd(a, b, IMM5, mask_operand, SAE))
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for less-than, and returns the results as mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmplt_epu32_mask&expand=1056)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm512_cmplt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
    let lhs = a.as_u32x16();
    let rhs = b.as_u32x16();
    // Per-lane comparison result, collapsed into a bitmask below.
    let lanes: u32x16 = simd_lt(lhs, rhs);
    simd_bitmask::<u32x16, _>(lanes)
}

/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmplt_epu32_mask&expand=1057)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm512_mask_cmplt_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
    // Zeromask: AND-ing with `k1` clears every result bit whose `k1` bit is unset.
    _mm512_cmplt_epu32_mask(a, b) & k1
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for less-than, and returns the results as mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmplt_epu32_mask&expand=1054)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm256_cmplt_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 {
    let lhs = a.as_u32x8();
    let rhs = b.as_u32x8();
    // Per-lane comparison result, collapsed into a bitmask below.
    let lanes: u32x8 = simd_lt(lhs, rhs);
    simd_bitmask::<u32x8, _>(lanes)
}

/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmplt_epu32_mask&expand=1055)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm256_mask_cmplt_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    // Zeromask: AND-ing with `k1` clears every result bit whose `k1` bit is unset.
    _mm256_cmplt_epu32_mask(a, b) & k1
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for less-than, and returns the results as mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_epu32_mask&expand=1052)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm_cmplt_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 {
    let lhs = a.as_u32x4();
    let rhs = b.as_u32x4();
    // Per-lane comparison result, collapsed into a bitmask below.
    let lanes: u32x4 = simd_lt(lhs, rhs);
    simd_bitmask::<u32x4, _>(lanes)
}

/// Compare packed unsigned 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmplt_epu32_mask&expand=1053)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm_mask_cmplt_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    // Zeromask: AND-ing with `k1` clears every result bit whose `k1` bit is unset.
    _mm_cmplt_epu32_mask(a, b) & k1
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for greater-than, and returns the results as mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpgt_epu32_mask&expand=933)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm512_cmpgt_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
    let lhs = a.as_u32x16();
    let rhs = b.as_u32x16();
    // Per-lane comparison result, collapsed into a bitmask below.
    let lanes: u32x16 = simd_gt(lhs, rhs);
    simd_bitmask::<u32x16, _>(lanes)
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for greater-than, and returns the results as mask vector k under zeromask `k1` (a result bit is zeroed out when the corresponding bit of `k1` is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpgt_epu32_mask&expand=934)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm512_mask_cmpgt_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
    let unmasked = _mm512_cmpgt_epu32_mask(a, b);
    // Keep only the result bits that are also set in `k1`.
    k1 & unmasked
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for greater-than, and returns the results as mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epu32_mask&expand=931)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm256_cmpgt_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 {
    let lhs = a.as_u32x8();
    let rhs = b.as_u32x8();
    // Per-lane comparison result, collapsed into a bitmask below.
    let lanes: u32x8 = simd_gt(lhs, rhs);
    simd_bitmask::<u32x8, _>(lanes)
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for greater-than, and returns the results as mask vector k under zeromask `k1` (a result bit is zeroed out when the corresponding bit of `k1` is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpgt_epu32_mask&expand=932)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm256_mask_cmpgt_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let unmasked = _mm256_cmpgt_epu32_mask(a, b);
    // Keep only the result bits that are also set in `k1`.
    k1 & unmasked
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for greater-than, and returns the results as mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epu32_mask&expand=929)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm_cmpgt_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 {
    let lhs = a.as_u32x4();
    let rhs = b.as_u32x4();
    // Per-lane comparison result, collapsed into a bitmask below.
    let lanes: u32x4 = simd_gt(lhs, rhs);
    simd_bitmask::<u32x4, _>(lanes)
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for greater-than, and returns the results as mask vector k under zeromask `k1` (a result bit is zeroed out when the corresponding bit of `k1` is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpgt_epu32_mask&expand=930)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm_mask_cmpgt_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let unmasked = _mm_cmpgt_epu32_mask(a, b);
    // Keep only the result bits that are also set in `k1`.
    k1 & unmasked
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for less-than-or-equal, and returns the results as mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmple_epu32_mask&expand=995)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm512_cmple_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
    let lhs = a.as_u32x16();
    let rhs = b.as_u32x16();
    // Per-lane comparison result, collapsed into a bitmask below.
    let lanes: u32x16 = simd_le(lhs, rhs);
    simd_bitmask::<u32x16, _>(lanes)
}

/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmple_epu32_mask&expand=996)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm512_mask_cmple_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
    // Zeromask: AND-ing with `k1` clears every result bit whose `k1` bit is unset.
    _mm512_cmple_epu32_mask(a, b) & k1
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for less-than-or-equal, and returns the results as mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmple_epu32_mask&expand=993)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm256_cmple_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 {
    let lhs = a.as_u32x8();
    let rhs = b.as_u32x8();
    // Per-lane comparison result, collapsed into a bitmask below.
    let lanes: u32x8 = simd_le(lhs, rhs);
    simd_bitmask::<u32x8, _>(lanes)
}

/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmple_epu32_mask&expand=994)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm256_mask_cmple_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    // Zeromask: AND-ing with `k1` clears every result bit whose `k1` bit is unset.
    _mm256_cmple_epu32_mask(a, b) & k1
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for less-than-or-equal, and returns the results as mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_epu32_mask&expand=991)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm_cmple_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 {
    let lhs = a.as_u32x4();
    let rhs = b.as_u32x4();
    // Per-lane comparison result, collapsed into a bitmask below.
    let lanes: u32x4 = simd_le(lhs, rhs);
    simd_bitmask::<u32x4, _>(lanes)
}

/// Compare packed unsigned 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmple_epu32_mask&expand=992)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm_mask_cmple_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    // Zeromask: AND-ing with `k1` clears every result bit whose `k1` bit is unset.
    _mm_cmple_epu32_mask(a, b) & k1
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for greater-than-or-equal, and returns the results as mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpge_epu32_mask&expand=873)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm512_cmpge_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
    let lhs = a.as_u32x16();
    let rhs = b.as_u32x16();
    // Per-lane comparison result, collapsed into a bitmask below.
    let lanes: u32x16 = simd_ge(lhs, rhs);
    simd_bitmask::<u32x16, _>(lanes)
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for greater-than-or-equal, and returns the results as mask vector k under zeromask `k1` (a result bit is zeroed out when the corresponding bit of `k1` is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpge_epu32_mask&expand=874)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm512_mask_cmpge_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
    let unmasked = _mm512_cmpge_epu32_mask(a, b);
    // Keep only the result bits that are also set in `k1`.
    k1 & unmasked
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for greater-than-or-equal, and returns the results as mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpge_epu32_mask&expand=871)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm256_cmpge_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 {
    let lhs = a.as_u32x8();
    let rhs = b.as_u32x8();
    // Per-lane comparison result, collapsed into a bitmask below.
    let lanes: u32x8 = simd_ge(lhs, rhs);
    simd_bitmask::<u32x8, _>(lanes)
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for greater-than-or-equal, and returns the results as mask vector k under zeromask `k1` (a result bit is zeroed out when the corresponding bit of `k1` is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpge_epu32_mask&expand=872)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm256_mask_cmpge_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let unmasked = _mm256_cmpge_epu32_mask(a, b);
    // Keep only the result bits that are also set in `k1`.
    k1 & unmasked
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for greater-than-or-equal, and returns the results as mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_epu32_mask&expand=869)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm_cmpge_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 {
    let lhs = a.as_u32x4();
    let rhs = b.as_u32x4();
    // Per-lane comparison result, collapsed into a bitmask below.
    let lanes: u32x4 = simd_ge(lhs, rhs);
    simd_bitmask::<u32x4, _>(lanes)
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for greater-than-or-equal, and returns the results as mask vector k under zeromask `k1` (a result bit is zeroed out when the corresponding bit of `k1` is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpge_epu32_mask&expand=870)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm_mask_cmpge_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let unmasked = _mm_cmpge_epu32_mask(a, b);
    // Keep only the result bits that are also set in `k1`.
    k1 & unmasked
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for equality, and returns the results as mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpeq_epu32_mask&expand=807)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm512_cmpeq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
    let lhs = a.as_u32x16();
    let rhs = b.as_u32x16();
    // Per-lane comparison result, collapsed into a bitmask below.
    let lanes: u32x16 = simd_eq(lhs, rhs);
    simd_bitmask::<u32x16, _>(lanes)
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for equality, and returns the results as mask vector k under zeromask `k1` (a result bit is zeroed out when the corresponding bit of `k1` is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpeq_epu32_mask&expand=808)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm512_mask_cmpeq_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
    let unmasked = _mm512_cmpeq_epu32_mask(a, b);
    // Keep only the result bits that are also set in `k1`.
    k1 & unmasked
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for equality, and returns the results as mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epu32_mask&expand=805)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm256_cmpeq_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 {
    let lhs = a.as_u32x8();
    let rhs = b.as_u32x8();
    // Per-lane comparison result, collapsed into a bitmask below.
    let lanes: u32x8 = simd_eq(lhs, rhs);
    simd_bitmask::<u32x8, _>(lanes)
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for equality, and returns the results as mask vector k under zeromask `k1` (a result bit is zeroed out when the corresponding bit of `k1` is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpeq_epu32_mask&expand=806)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm256_mask_cmpeq_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let unmasked = _mm256_cmpeq_epu32_mask(a, b);
    // Keep only the result bits that are also set in `k1`.
    k1 & unmasked
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for equality, and returns the results as mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epu32_mask&expand=803)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm_cmpeq_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 {
    let lhs = a.as_u32x4();
    let rhs = b.as_u32x4();
    // Per-lane comparison result, collapsed into a bitmask below.
    let lanes: u32x4 = simd_eq(lhs, rhs);
    simd_bitmask::<u32x4, _>(lanes)
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for equality, and returns the results as mask vector k under zeromask `k1` (a result bit is zeroed out when the corresponding bit of `k1` is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpeq_epu32_mask&expand=804)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm_mask_cmpeq_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let unmasked = _mm_cmpeq_epu32_mask(a, b);
    // Keep only the result bits that are also set in `k1`.
    k1 & unmasked
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for not-equal, and returns the results as mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpneq_epu32_mask&expand=1112)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm512_cmpneq_epu32_mask(a: __m512i, b: __m512i) -> __mmask16 {
    let lhs = a.as_u32x16();
    let rhs = b.as_u32x16();
    // Per-lane comparison result, collapsed into a bitmask below.
    let lanes: u32x16 = simd_ne(lhs, rhs);
    simd_bitmask::<u32x16, _>(lanes)
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for not-equal, and returns the results as mask vector k under zeromask `k1` (a result bit is zeroed out when the corresponding bit of `k1` is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpneq_epu32_mask&expand=1113)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm512_mask_cmpneq_epu32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
    let unmasked = _mm512_cmpneq_epu32_mask(a, b);
    // Keep only the result bits that are also set in `k1`.
    k1 & unmasked
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for not-equal, and returns the results as mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpneq_epu32_mask&expand=1110)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm256_cmpneq_epu32_mask(a: __m256i, b: __m256i) -> __mmask8 {
    let lhs = a.as_u32x8();
    let rhs = b.as_u32x8();
    // Per-lane comparison result, collapsed into a bitmask below.
    let lanes: u32x8 = simd_ne(lhs, rhs);
    simd_bitmask::<u32x8, _>(lanes)
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for not-equal, and returns the results as mask vector k under zeromask `k1` (a result bit is zeroed out when the corresponding bit of `k1` is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpneq_epu32_mask&expand=1111)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm256_mask_cmpneq_epu32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let unmasked = _mm256_cmpneq_epu32_mask(a, b);
    // Keep only the result bits that are also set in `k1`.
    k1 & unmasked
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for not-equal, and returns the results as mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_epu32_mask&expand=1108)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm_cmpneq_epu32_mask(a: __m128i, b: __m128i) -> __mmask8 {
    let lhs = a.as_u32x4();
    let rhs = b.as_u32x4();
    // Per-lane comparison result, collapsed into a bitmask below.
    let lanes: u32x4 = simd_ne(lhs, rhs);
    simd_bitmask::<u32x4, _>(lanes)
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` lane by lane for not-equal, and returns the results as mask vector k under zeromask `k1` (a result bit is zeroed out when the corresponding bit of `k1` is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpneq_epu32_mask&expand=1109)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpud
pub unsafe fn _mm_mask_cmpneq_epu32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let unmasked = _mm_cmpneq_epu32_mask(a, b);
    // Keep only the result bits that are also set in `k1`.
    k1 & unmasked
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` using the comparison predicate selected by `IMM3`, and returns the results as mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epu32_mask&expand=721)
#[inline]
#[target_feature(enable = "avx512f")]
#[rustc_legacy_const_generics(2)]
#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
pub unsafe fn _mm512_cmp_epu32_mask<const IMM3: _MM_CMPINT_ENUM>(
    a: __m512i,
    b: __m512i,
) -> __mmask16 {
    static_assert_imm3!(IMM3);
    // -1 sets every bit of the mask operand, so no lane is masked off.
    let raw = vpcmpud(a.as_i32x16(), b.as_i32x16(), IMM3, -1);
    transmute(raw)
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` using the comparison predicate selected by `IMM3`, and returns the results as mask vector k under zeromask `k1` (a result bit is zeroed out when the corresponding bit of `k1` is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epu32_mask&expand=722)
#[inline]
#[target_feature(enable = "avx512f")]
#[rustc_legacy_const_generics(3)]
#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
pub unsafe fn _mm512_mask_cmp_epu32_mask<const IMM3: _MM_CMPINT_ENUM>(
    k1: __mmask16,
    a: __m512i,
    b: __m512i,
) -> __mmask16 {
    static_assert_imm3!(IMM3);
    // `k1` is handed to the compare as its mask operand, reinterpreted as `i16`.
    let raw = vpcmpud(a.as_i32x16(), b.as_i32x16(), IMM3, k1 as i16);
    transmute(raw)
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` using the comparison predicate selected by `IMM3`, and returns the results as mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_epu32_mask&expand=719)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
pub unsafe fn _mm256_cmp_epu32_mask<const IMM3: _MM_CMPINT_ENUM>(
    a: __m256i,
    b: __m256i,
) -> __mmask8 {
    static_assert_imm3!(IMM3);
    // -1 sets every bit of the mask operand, so no lane is masked off.
    let raw = vpcmpud256(a.as_i32x8(), b.as_i32x8(), IMM3, -1);
    transmute(raw)
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` using the predicate selected by
/// `IMM3`, returning the per-lane outcomes as a `__mmask8` filtered by zeromask `k1` (result
/// bits are zeroed when the corresponding bit of `k1` is not set).
///
/// `IMM3` is checked at compile time by `static_assert_imm3!`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_epu32_mask&expand=720)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
pub unsafe fn _mm256_mask_cmp_epu32_mask<const IMM3: _MM_CMPINT_ENUM>(
    k1: __mmask8,
    a: __m256i,
    b: __m256i,
) -> __mmask8 {
    static_assert_imm3!(IMM3);
    // `vpcmpud256` takes its mask operand as i8; the cast only reinterprets the bits of `k1`.
    let zeromask = k1 as i8;
    let bits = vpcmpud256(a.as_i32x8(), b.as_i32x8(), IMM3, zeromask);
    transmute(bits)
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` using the predicate selected by
/// `IMM3`, returning the per-lane outcomes as a `__mmask8`.
///
/// `IMM3` is checked at compile time by `static_assert_imm3!`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_epu32_mask&expand=717)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
pub unsafe fn _mm_cmp_epu32_mask<const IMM3: _MM_CMPINT_ENUM>(a: __m128i, b: __m128i) -> __mmask8 {
    static_assert_imm3!(IMM3);
    // -1 has every bit set; it sits in the slot where the masked variant passes `k1`,
    // so no lane is filtered out here.
    let every_lane = -1;
    let bits = vpcmpud128(a.as_i32x4(), b.as_i32x4(), IMM3, every_lane);
    transmute(bits)
}

/// Compares the packed unsigned 32-bit integers of `a` and `b` using the predicate selected by
/// `IMM3`, returning the per-lane outcomes as a `__mmask8` filtered by zeromask `k1` (result
/// bits are zeroed when the corresponding bit of `k1` is not set).
///
/// `IMM3` is checked at compile time by `static_assert_imm3!`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_epu32_mask&expand=718)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
pub unsafe fn _mm_mask_cmp_epu32_mask<const IMM3: _MM_CMPINT_ENUM>(
    k1: __mmask8,
    a: __m128i,
    b: __m128i,
) -> __mmask8 {
    static_assert_imm3!(IMM3);
    // `vpcmpud128` takes its mask operand as i8; the cast only reinterprets the bits of `k1`.
    let zeromask = k1 as i8;
    let bits = vpcmpud128(a.as_i32x4(), b.as_i32x4(), IMM3, zeromask);
    transmute(bits)
}

/// Tests every signed 32-bit lane of `a` for being less than the matching lane of `b`, and
/// returns the per-lane outcomes packed into a `__mmask16`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmplt_epi32_mask&expand=1029)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm512_cmplt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
    let lhs = a.as_i32x16();
    let rhs = b.as_i32x16();
    // Lane-wise comparison result, then collapsed into a bitmask.
    let lanes: i32x16 = simd_lt(lhs, rhs);
    simd_bitmask(lanes)
}

/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// Implemented as `_mm512_cmplt_epi32_mask(a, b)` ANDed with `k1`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmplt_epi32_mask&expand=1031)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm512_mask_cmplt_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
    let unmasked = _mm512_cmplt_epi32_mask(a, b);
    // Bits that are clear in `k1` are cleared in the result.
    unmasked & k1
}

/// Tests every signed 32-bit lane of `a` for being less than the matching lane of `b`, and
/// returns the per-lane outcomes packed into a `__mmask8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmplt_epi32_mask&expand=1027)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm256_cmplt_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 {
    let lhs = a.as_i32x8();
    let rhs = b.as_i32x8();
    // Lane-wise comparison result, then collapsed into a bitmask.
    let lanes: i32x8 = simd_lt(lhs, rhs);
    simd_bitmask(lanes)
}

/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// Implemented as `_mm256_cmplt_epi32_mask(a, b)` ANDed with `k1`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmplt_epi32_mask&expand=1028)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm256_mask_cmplt_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let unmasked = _mm256_cmplt_epi32_mask(a, b);
    // Bits that are clear in `k1` are cleared in the result.
    unmasked & k1
}

/// Tests every signed 32-bit lane of `a` for being less than the matching lane of `b`, and
/// returns the per-lane outcomes packed into a `__mmask8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_epi32_mask&expand=1025)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm_cmplt_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 {
    let lhs = a.as_i32x4();
    let rhs = b.as_i32x4();
    // Lane-wise comparison result, then collapsed into a bitmask.
    let lanes: i32x4 = simd_lt(lhs, rhs);
    simd_bitmask(lanes)
}

/// Compare packed signed 32-bit integers in a and b for less-than, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// Implemented as `_mm_cmplt_epi32_mask(a, b)` ANDed with `k1`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmplt_epi32_mask&expand=1026)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm_mask_cmplt_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let unmasked = _mm_cmplt_epi32_mask(a, b);
    // Bits that are clear in `k1` are cleared in the result.
    unmasked & k1
}

/// Tests every signed 32-bit lane of `a` for being greater than the matching lane of `b`, and
/// returns the per-lane outcomes packed into a `__mmask16`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpgt_epi32_mask&expand=905)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm512_cmpgt_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
    let (x, y) = (a.as_i32x16(), b.as_i32x16());
    let greater: i32x16 = simd_gt(x, y);
    simd_bitmask(greater)
}

/// Signed 32-bit greater-than comparison of `a` against `b` under zeromask `k1`: the mask from
/// `_mm512_cmpgt_epi32_mask(a, b)` is ANDed with `k1`, so result bits are zeroed wherever the
/// corresponding bit of `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpgt_epi32_mask&expand=906)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm512_mask_cmpgt_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
    let unmasked = _mm512_cmpgt_epi32_mask(a, b);
    k1 & unmasked
}

/// Tests every signed 32-bit lane of `a` for being greater than the matching lane of `b`, and
/// returns the per-lane outcomes packed into a `__mmask8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi32_mask&expand=903)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm256_cmpgt_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 {
    let (x, y) = (a.as_i32x8(), b.as_i32x8());
    let greater: i32x8 = simd_gt(x, y);
    simd_bitmask(greater)
}

/// Signed 32-bit greater-than comparison of `a` against `b` under zeromask `k1`: the mask from
/// `_mm256_cmpgt_epi32_mask(a, b)` is ANDed with `k1`, so result bits are zeroed wherever the
/// corresponding bit of `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpgt_epi32_mask&expand=904)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm256_mask_cmpgt_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let unmasked = _mm256_cmpgt_epi32_mask(a, b);
    k1 & unmasked
}

/// Tests every signed 32-bit lane of `a` for being greater than the matching lane of `b`, and
/// returns the per-lane outcomes packed into a `__mmask8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epi32_mask&expand=901)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm_cmpgt_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 {
    let (x, y) = (a.as_i32x4(), b.as_i32x4());
    let greater: i32x4 = simd_gt(x, y);
    simd_bitmask(greater)
}

/// Signed 32-bit greater-than comparison of `a` against `b` under zeromask `k1`: the mask from
/// `_mm_cmpgt_epi32_mask(a, b)` is ANDed with `k1`, so result bits are zeroed wherever the
/// corresponding bit of `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpgt_epi32_mask&expand=902)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm_mask_cmpgt_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let unmasked = _mm_cmpgt_epi32_mask(a, b);
    k1 & unmasked
}

/// Tests every signed 32-bit lane of `a` for being less than or equal to the matching lane of
/// `b`, and returns the per-lane outcomes packed into a `__mmask16`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmple_epi32_mask&expand=971)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm512_cmple_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
    let left = a.as_i32x16();
    let right = b.as_i32x16();
    let not_above: i32x16 = simd_le(left, right);
    simd_bitmask(not_above)
}

/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// Implemented as `_mm512_cmple_epi32_mask(a, b)` ANDed with `k1`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmple_epi32_mask&expand=972)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm512_mask_cmple_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
    let unmasked = _mm512_cmple_epi32_mask(a, b);
    // Bits that are clear in `k1` are cleared in the result.
    unmasked & k1
}

/// Tests every signed 32-bit lane of `a` for being less than or equal to the matching lane of
/// `b`, and returns the per-lane outcomes packed into a `__mmask8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmple_epi32_mask&expand=969)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm256_cmple_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 {
    let left = a.as_i32x8();
    let right = b.as_i32x8();
    let not_above: i32x8 = simd_le(left, right);
    simd_bitmask(not_above)
}

/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// Implemented as `_mm256_cmple_epi32_mask(a, b)` ANDed with `k1`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmple_epi32_mask&expand=970)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm256_mask_cmple_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let unmasked = _mm256_cmple_epi32_mask(a, b);
    // Bits that are clear in `k1` are cleared in the result.
    unmasked & k1
}

/// Tests every signed 32-bit lane of `a` for being less than or equal to the matching lane of
/// `b`, and returns the per-lane outcomes packed into a `__mmask8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_epi32_mask&expand=967)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm_cmple_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 {
    let left = a.as_i32x4();
    let right = b.as_i32x4();
    let not_above: i32x4 = simd_le(left, right);
    simd_bitmask(not_above)
}

/// Compare packed signed 32-bit integers in a and b for less-than-or-equal, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// Implemented as `_mm_cmple_epi32_mask(a, b)` ANDed with `k1`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmple_epi32_mask&expand=968)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm_mask_cmple_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let unmasked = _mm_cmple_epi32_mask(a, b);
    // Bits that are clear in `k1` are cleared in the result.
    unmasked & k1
}

/// Tests every signed 32-bit lane of `a` for being greater than or equal to the matching lane
/// of `b`, and returns the per-lane outcomes packed into a `__mmask16`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpge_epi32_mask&expand=849)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm512_cmpge_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
    let (x, y) = (a.as_i32x16(), b.as_i32x16());
    let not_below: i32x16 = simd_ge(x, y);
    simd_bitmask(not_below)
}

/// Signed 32-bit greater-than-or-equal comparison of `a` against `b` under zeromask `k1`: the
/// mask from `_mm512_cmpge_epi32_mask(a, b)` is ANDed with `k1`, so result bits are zeroed
/// wherever the corresponding bit of `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpge_epi32_mask&expand=850)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm512_mask_cmpge_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
    let full = _mm512_cmpge_epi32_mask(a, b);
    k1 & full
}

/// Tests every signed 32-bit lane of `a` for being greater than or equal to the matching lane
/// of `b`, and returns the per-lane outcomes packed into a `__mmask8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpge_epi32_mask&expand=847)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm256_cmpge_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 {
    let (x, y) = (a.as_i32x8(), b.as_i32x8());
    let not_below: i32x8 = simd_ge(x, y);
    simd_bitmask(not_below)
}

/// Signed 32-bit greater-than-or-equal comparison of `a` against `b` under zeromask `k1`: the
/// mask from `_mm256_cmpge_epi32_mask(a, b)` is ANDed with `k1`, so result bits are zeroed
/// wherever the corresponding bit of `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpge_epi32_mask&expand=848)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm256_mask_cmpge_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let full = _mm256_cmpge_epi32_mask(a, b);
    k1 & full
}

/// Tests every signed 32-bit lane of `a` for being greater than or equal to the matching lane
/// of `b`, and returns the per-lane outcomes packed into a `__mmask8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_epi32_mask&expand=845)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm_cmpge_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 {
    let (x, y) = (a.as_i32x4(), b.as_i32x4());
    let not_below: i32x4 = simd_ge(x, y);
    simd_bitmask(not_below)
}

/// Signed 32-bit greater-than-or-equal comparison of `a` against `b` under zeromask `k1`: the
/// mask from `_mm_cmpge_epi32_mask(a, b)` is ANDed with `k1`, so result bits are zeroed
/// wherever the corresponding bit of `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpge_epi32_mask&expand=846)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm_mask_cmpge_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let full = _mm_cmpge_epi32_mask(a, b);
    k1 & full
}

/// Tests every 32-bit lane of `a` for equality with the matching lane of `b`, and returns the
/// per-lane outcomes packed into a `__mmask16`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpeq_epi32_mask&expand=779)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm512_cmpeq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
    let lhs = a.as_i32x16();
    let rhs = b.as_i32x16();
    let same: i32x16 = simd_eq(lhs, rhs);
    simd_bitmask(same)
}

/// 32-bit equality comparison of `a` against `b` under zeromask `k1`: the mask from
/// `_mm512_cmpeq_epi32_mask(a, b)` is ANDed with `k1`, so result bits are zeroed wherever the
/// corresponding bit of `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpeq_epi32_mask&expand=780)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm512_mask_cmpeq_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
    let equal_bits = _mm512_cmpeq_epi32_mask(a, b);
    k1 & equal_bits
}

/// Tests every 32-bit lane of `a` for equality with the matching lane of `b`, and returns the
/// per-lane outcomes packed into a `__mmask8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi32_mask&expand=777)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm256_cmpeq_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 {
    let lhs = a.as_i32x8();
    let rhs = b.as_i32x8();
    let same: i32x8 = simd_eq(lhs, rhs);
    simd_bitmask(same)
}

/// 32-bit equality comparison of `a` against `b` under zeromask `k1`: the mask from
/// `_mm256_cmpeq_epi32_mask(a, b)` is ANDed with `k1`, so result bits are zeroed wherever the
/// corresponding bit of `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpeq_epi32_mask&expand=778)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm256_mask_cmpeq_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let equal_bits = _mm256_cmpeq_epi32_mask(a, b);
    k1 & equal_bits
}

/// Tests every 32-bit lane of `a` for equality with the matching lane of `b`, and returns the
/// per-lane outcomes packed into a `__mmask8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi32_mask&expand=775)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm_cmpeq_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 {
    let lhs = a.as_i32x4();
    let rhs = b.as_i32x4();
    let same: i32x4 = simd_eq(lhs, rhs);
    simd_bitmask(same)
}

/// 32-bit equality comparison of `a` against `b` under zeromask `k1`: the mask from
/// `_mm_cmpeq_epi32_mask(a, b)` is ANDed with `k1`, so result bits are zeroed wherever the
/// corresponding bit of `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpeq_epi32_mask&expand=776)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm_mask_cmpeq_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let equal_bits = _mm_cmpeq_epi32_mask(a, b);
    k1 & equal_bits
}

/// Tests every 32-bit lane of `a` for inequality with the matching lane of `b`, and returns the
/// per-lane outcomes packed into a `__mmask16`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpneq_epi32_mask&expand=1088)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm512_cmpneq_epi32_mask(a: __m512i, b: __m512i) -> __mmask16 {
    let (x, y) = (a.as_i32x16(), b.as_i32x16());
    let differs: i32x16 = simd_ne(x, y);
    simd_bitmask(differs)
}

/// 32-bit not-equal comparison of `a` against `b` under zeromask `k1`: the mask from
/// `_mm512_cmpneq_epi32_mask(a, b)` is ANDed with `k1`, so result bits are zeroed wherever the
/// corresponding bit of `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpneq_epi32_mask&expand=1089)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm512_mask_cmpneq_epi32_mask(k1: __mmask16, a: __m512i, b: __m512i) -> __mmask16 {
    let differing_bits = _mm512_cmpneq_epi32_mask(a, b);
    k1 & differing_bits
}

/// Tests every 32-bit lane of `a` for inequality with the matching lane of `b`, and returns the
/// per-lane outcomes packed into a `__mmask8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpneq_epi32_mask&expand=1086)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm256_cmpneq_epi32_mask(a: __m256i, b: __m256i) -> __mmask8 {
    let (x, y) = (a.as_i32x8(), b.as_i32x8());
    let differs: i32x8 = simd_ne(x, y);
    simd_bitmask(differs)
}

/// 32-bit not-equal comparison of `a` against `b` under zeromask `k1`: the mask from
/// `_mm256_cmpneq_epi32_mask(a, b)` is ANDed with `k1`, so result bits are zeroed wherever the
/// corresponding bit of `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpneq_epi32_mask&expand=1087)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm256_mask_cmpneq_epi32_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let differing_bits = _mm256_cmpneq_epi32_mask(a, b);
    k1 & differing_bits
}

/// Tests every 32-bit lane of `a` for inequality with the matching lane of `b`, and returns the
/// per-lane outcomes packed into a `__mmask8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_epi32_mask&expand=1084)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm_cmpneq_epi32_mask(a: __m128i, b: __m128i) -> __mmask8 {
    let (x, y) = (a.as_i32x4(), b.as_i32x4());
    let differs: i32x4 = simd_ne(x, y);
    simd_bitmask(differs)
}

/// 32-bit not-equal comparison of `a` against `b` under zeromask `k1`: the mask from
/// `_mm_cmpneq_epi32_mask(a, b)` is ANDed with `k1`, so result bits are zeroed wherever the
/// corresponding bit of `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpneq_epi32_mask&expand=1085)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpd
pub unsafe fn _mm_mask_cmpneq_epi32_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let differing_bits = _mm_cmpneq_epi32_mask(a, b);
    k1 & differing_bits
}

/// Compares the packed signed 32-bit integers of `a` and `b` using the predicate selected by
/// `IMM3`, returning the per-lane outcomes as a `__mmask16`.
///
/// `IMM3` is checked at compile time by `static_assert_imm3!`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epi32_mask&expand=697)
#[inline]
#[target_feature(enable = "avx512f")]
#[rustc_legacy_const_generics(2)]
#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
pub unsafe fn _mm512_cmp_epi32_mask<const IMM3: _MM_CMPINT_ENUM>(
    a: __m512i,
    b: __m512i,
) -> __mmask16 {
    static_assert_imm3!(IMM3);
    // -1 has every bit set; it sits in the slot where the masked variant passes `k1`,
    // so no lane is filtered out here.
    let every_lane = -1;
    let bits = vpcmpd(a.as_i32x16(), b.as_i32x16(), IMM3, every_lane);
    transmute(bits)
}

/// Compares the packed signed 32-bit integers of `a` and `b` using the predicate selected by
/// `IMM3`, returning the per-lane outcomes as a `__mmask16` filtered by zeromask `k1` (result
/// bits are zeroed when the corresponding bit of `k1` is not set).
///
/// `IMM3` is checked at compile time by `static_assert_imm3!`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epi32_mask&expand=698)
#[inline]
#[target_feature(enable = "avx512f")]
#[rustc_legacy_const_generics(3)]
#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
pub unsafe fn _mm512_mask_cmp_epi32_mask<const IMM3: _MM_CMPINT_ENUM>(
    k1: __mmask16,
    a: __m512i,
    b: __m512i,
) -> __mmask16 {
    static_assert_imm3!(IMM3);
    // `vpcmpd` takes its mask operand as i16; the cast only reinterprets the bits of `k1`.
    let zeromask = k1 as i16;
    let bits = vpcmpd(a.as_i32x16(), b.as_i32x16(), IMM3, zeromask);
    transmute(bits)
}

/// Compare packed signed 32-bit integers in a and b based on the comparison operand specified by imm8, and store the results in mask vector k.
///
/// `IMM3` is checked at compile time by `static_assert_imm3!`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_epi32_mask&expand=695)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
pub unsafe fn _mm256_cmp_epi32_mask<const IMM3: _MM_CMPINT_ENUM>(
    a: __m256i,
    b: __m256i,
) -> __mmask8 {
    static_assert_imm3!(IMM3);
    // -1 has every bit set; it sits in the slot where the masked variant passes `k1`,
    // so no lane is filtered out here.
    let neg_one = -1;
    let a = a.as_i32x8();
    let b = b.as_i32x8();
    let r = vpcmpd256(a, b, IMM3, neg_one);
    transmute(r)
}

/// Compares the packed signed 32-bit integers of `a` and `b` using the predicate selected by
/// `IMM3`, returning the per-lane outcomes as a `__mmask8` filtered by zeromask `k1` (result
/// bits are zeroed when the corresponding bit of `k1` is not set).
///
/// `IMM3` is checked at compile time by `static_assert_imm3!`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_epi32_mask&expand=696)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
pub unsafe fn _mm256_mask_cmp_epi32_mask<const IMM3: _MM_CMPINT_ENUM>(
    k1: __mmask8,
    a: __m256i,
    b: __m256i,
) -> __mmask8 {
    static_assert_imm3!(IMM3);
    // `vpcmpd256` takes its mask operand as i8; the cast only reinterprets the bits of `k1`.
    let zeromask = k1 as i8;
    let bits = vpcmpd256(a.as_i32x8(), b.as_i32x8(), IMM3, zeromask);
    transmute(bits)
}

/// Compares the packed signed 32-bit integers of `a` and `b` using the predicate selected by
/// `IMM3`, returning the per-lane outcomes as a `__mmask8`.
///
/// `IMM3` is checked at compile time by `static_assert_imm3!`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_epi32_mask&expand=693)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
pub unsafe fn _mm_cmp_epi32_mask<const IMM3: _MM_CMPINT_ENUM>(a: __m128i, b: __m128i) -> __mmask8 {
    static_assert_imm3!(IMM3);
    // -1 has every bit set; it sits in the slot where the masked variant passes `k1`,
    // so no lane is filtered out here.
    let every_lane = -1;
    let bits = vpcmpd128(a.as_i32x4(), b.as_i32x4(), IMM3, every_lane);
    transmute(bits)
}

/// Compares the packed signed 32-bit integers of `a` and `b` using the predicate selected by
/// `IMM3`, returning the per-lane outcomes as a `__mmask8` filtered by zeromask `k1` (result
/// bits are zeroed when the corresponding bit of `k1` is not set).
///
/// `IMM3` is checked at compile time by `static_assert_imm3!`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_epi32_mask&expand=694)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
pub unsafe fn _mm_mask_cmp_epi32_mask<const IMM3: _MM_CMPINT_ENUM>(
    k1: __mmask8,
    a: __m128i,
    b: __m128i,
) -> __mmask8 {
    static_assert_imm3!(IMM3);
    // `vpcmpd128` takes its mask operand as i8; the cast only reinterprets the bits of `k1`.
    let zeromask = k1 as i8;
    let bits = vpcmpd128(a.as_i32x4(), b.as_i32x4(), IMM3, zeromask);
    transmute(bits)
}

/// Tests every unsigned 64-bit lane of `a` for being less than the matching lane of `b`, and
/// returns the per-lane outcomes packed into a `__mmask8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmplt_epu64_mask&expand=1062)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
pub unsafe fn _mm512_cmplt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
    let lhs = a.as_u64x8();
    let rhs = b.as_u64x8();
    // The lane-wise result is materialised as a `__m512i` before being collapsed to bits.
    let lanes: __m512i = simd_lt(lhs, rhs);
    simd_bitmask(lanes)
}

/// Unsigned 64-bit less-than comparison of `a` against `b` under zeromask `k1`: the mask from
/// `_mm512_cmplt_epu64_mask(a, b)` is ANDed with `k1`, so result bits are zeroed wherever the
/// corresponding bit of `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmplt_epu64_mask&expand=1063)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
pub unsafe fn _mm512_mask_cmplt_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
    let unmasked = _mm512_cmplt_epu64_mask(a, b);
    k1 & unmasked
}

/// Tests every unsigned 64-bit lane of `a` for being less than the matching lane of `b`, and
/// returns the per-lane outcomes packed into a `__mmask8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmplt_epu64_mask&expand=1060)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
pub unsafe fn _mm256_cmplt_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 {
    let lhs = a.as_u64x4();
    let rhs = b.as_u64x4();
    // The lane-wise result is materialised as a `__m256i` before being collapsed to bits.
    let lanes: __m256i = simd_lt(lhs, rhs);
    simd_bitmask(lanes)
}

/// Unsigned 64-bit less-than comparison of `a` against `b` under zeromask `k1`: the mask from
/// `_mm256_cmplt_epu64_mask(a, b)` is ANDed with `k1`, so result bits are zeroed wherever the
/// corresponding bit of `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmplt_epu64_mask&expand=1061)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
pub unsafe fn _mm256_mask_cmplt_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let unmasked = _mm256_cmplt_epu64_mask(a, b);
    k1 & unmasked
}

/// Tests every unsigned 64-bit lane of `a` for being less than the matching lane of `b`, and
/// returns the per-lane outcomes packed into a `__mmask8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_epu64_mask&expand=1058)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
pub unsafe fn _mm_cmplt_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 {
    let lhs = a.as_u64x2();
    let rhs = b.as_u64x2();
    // The lane-wise result is materialised as a `__m128i` before being collapsed to bits.
    let lanes: __m128i = simd_lt(lhs, rhs);
    simd_bitmask(lanes)
}

/// Unsigned 64-bit less-than comparison of `a` against `b` under zeromask `k1`: the mask from
/// `_mm_cmplt_epu64_mask(a, b)` is ANDed with `k1`, so result bits are zeroed wherever the
/// corresponding bit of `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmplt_epu64_mask&expand=1059)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
pub unsafe fn _mm_mask_cmplt_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let unmasked = _mm_cmplt_epu64_mask(a, b);
    k1 & unmasked
}

/// Tests every unsigned 64-bit lane of `a` for being greater than the matching lane of `b`,
/// and returns the per-lane outcomes packed into a `__mmask8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpgt_epu64_mask&expand=939)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
pub unsafe fn _mm512_cmpgt_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
    let (x, y) = (a.as_u64x8(), b.as_u64x8());
    // The lane-wise result is materialised as a `__m512i` before being collapsed to bits.
    let greater: __m512i = simd_gt(x, y);
    simd_bitmask(greater)
}

/// Unsigned 64-bit greater-than comparison of `a` against `b` under zeromask `k1`: the mask
/// from `_mm512_cmpgt_epu64_mask(a, b)` is ANDed with `k1`, so result bits are zeroed wherever
/// the corresponding bit of `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpgt_epu64_mask&expand=940)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
pub unsafe fn _mm512_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
    let full = _mm512_cmpgt_epu64_mask(a, b);
    k1 & full
}

/// Tests every unsigned 64-bit lane of `a` for being greater than the matching lane of `b`,
/// and returns the per-lane outcomes packed into a `__mmask8`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epu64_mask&expand=937)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
pub unsafe fn _mm256_cmpgt_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 {
    let (x, y) = (a.as_u64x4(), b.as_u64x4());
    // The lane-wise result is materialised as a `__m256i` before being collapsed to bits.
    let greater: __m256i = simd_gt(x, y);
    simd_bitmask(greater)
}

/// Unsigned 64-bit greater-than comparison of `a` against `b` under zeromask `k1`: the mask
/// from `_mm256_cmpgt_epu64_mask(a, b)` is ANDed with `k1`, so result bits are zeroed wherever
/// the corresponding bit of `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpgt_epu64_mask&expand=938)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpuq
pub unsafe fn _mm256_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let full = _mm256_cmpgt_epu64_mask(a, b);
    k1 & full
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a > b` and returns the per-lane outcome as a bitmask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epu64_mask&expand=935)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm_cmpgt_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 {
    // View both operands as u64 lanes so the comparison is unsigned.
    let (lhs, rhs) = (a.as_u64x2(), b.as_u64x2());
    let lanes: __m128i = simd_gt(lhs, rhs);
    simd_bitmask(lanes)
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a > b`, then clears every result bit whose corresponding bit in zeromask `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpgt_epu64_mask&expand=936)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm_mask_cmpgt_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let unmasked = _mm_cmpgt_epu64_mask(a, b);
    // Keep only the lanes selected by the zeromask.
    k1 & unmasked
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a <= b` and returns the per-lane outcome as a bitmask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmple_epu64_mask&expand=1001)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm512_cmple_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
    // View both operands as u64 lanes so the comparison is unsigned.
    let (lhs, rhs) = (a.as_u64x8(), b.as_u64x8());
    let lanes: __m512i = simd_le(lhs, rhs);
    simd_bitmask(lanes)
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a <= b`, then clears every result bit whose corresponding bit in zeromask `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmple_epu64_mask&expand=1002)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm512_mask_cmple_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
    let unmasked = _mm512_cmple_epu64_mask(a, b);
    // Keep only the lanes selected by the zeromask.
    k1 & unmasked
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a <= b` and returns the per-lane outcome as a bitmask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmple_epu64_mask&expand=999)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm256_cmple_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 {
    // View both operands as u64 lanes so the comparison is unsigned.
    let (lhs, rhs) = (a.as_u64x4(), b.as_u64x4());
    let lanes: __m256i = simd_le(lhs, rhs);
    simd_bitmask(lanes)
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a <= b`, then clears every result bit whose corresponding bit in zeromask `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmple_epu64_mask&expand=1000)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm256_mask_cmple_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let unmasked = _mm256_cmple_epu64_mask(a, b);
    // Keep only the lanes selected by the zeromask.
    k1 & unmasked
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a <= b` and returns the per-lane outcome as a bitmask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_epu64_mask&expand=997)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm_cmple_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 {
    // View both operands as u64 lanes so the comparison is unsigned.
    let (lhs, rhs) = (a.as_u64x2(), b.as_u64x2());
    let lanes: __m128i = simd_le(lhs, rhs);
    simd_bitmask(lanes)
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a <= b`, then clears every result bit whose corresponding bit in zeromask `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmple_epu64_mask&expand=998)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm_mask_cmple_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let unmasked = _mm_cmple_epu64_mask(a, b);
    // Keep only the lanes selected by the zeromask.
    k1 & unmasked
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a >= b` and returns the per-lane outcome as a bitmask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpge_epu64_mask&expand=879)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm512_cmpge_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
    // View both operands as u64 lanes so the comparison is unsigned.
    let (lhs, rhs) = (a.as_u64x8(), b.as_u64x8());
    let lanes: __m512i = simd_ge(lhs, rhs);
    simd_bitmask(lanes)
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a >= b`, then clears every result bit whose corresponding bit in zeromask `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpge_epu64_mask&expand=880)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm512_mask_cmpge_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
    let unmasked = _mm512_cmpge_epu64_mask(a, b);
    // Keep only the lanes selected by the zeromask.
    k1 & unmasked
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a >= b` and returns the per-lane outcome as a bitmask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpge_epu64_mask&expand=877)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm256_cmpge_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 {
    // View both operands as u64 lanes so the comparison is unsigned.
    let (lhs, rhs) = (a.as_u64x4(), b.as_u64x4());
    let lanes: __m256i = simd_ge(lhs, rhs);
    simd_bitmask(lanes)
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a >= b`, then clears every result bit whose corresponding bit in zeromask `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpge_epu64_mask&expand=878)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm256_mask_cmpge_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let unmasked = _mm256_cmpge_epu64_mask(a, b);
    // Keep only the lanes selected by the zeromask.
    k1 & unmasked
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a >= b` and returns the per-lane outcome as a bitmask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_epu64_mask&expand=875)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm_cmpge_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 {
    // View both operands as u64 lanes so the comparison is unsigned.
    let (lhs, rhs) = (a.as_u64x2(), b.as_u64x2());
    let lanes: __m128i = simd_ge(lhs, rhs);
    simd_bitmask(lanes)
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a >= b`, then clears every result bit whose corresponding bit in zeromask `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpge_epu64_mask&expand=876)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm_mask_cmpge_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let unmasked = _mm_cmpge_epu64_mask(a, b);
    // Keep only the lanes selected by the zeromask.
    k1 & unmasked
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a == b` and returns the per-lane outcome as a bitmask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpeq_epu64_mask&expand=813)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm512_cmpeq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
    // Operands are viewed as u64 lanes before the lane-wise comparison.
    let (lhs, rhs) = (a.as_u64x8(), b.as_u64x8());
    let lanes: __m512i = simd_eq(lhs, rhs);
    simd_bitmask(lanes)
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a == b`, then clears every result bit whose corresponding bit in zeromask `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpeq_epu64_mask&expand=814)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm512_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
    let unmasked = _mm512_cmpeq_epu64_mask(a, b);
    // Keep only the lanes selected by the zeromask.
    k1 & unmasked
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a == b` and returns the per-lane outcome as a bitmask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epu64_mask&expand=811)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm256_cmpeq_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 {
    // Operands are viewed as u64 lanes before the lane-wise comparison.
    let (lhs, rhs) = (a.as_u64x4(), b.as_u64x4());
    let lanes: __m256i = simd_eq(lhs, rhs);
    simd_bitmask(lanes)
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a == b`, then clears every result bit whose corresponding bit in zeromask `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpeq_epu64_mask&expand=812)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm256_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let unmasked = _mm256_cmpeq_epu64_mask(a, b);
    // Keep only the lanes selected by the zeromask.
    k1 & unmasked
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a == b` and returns the per-lane outcome as a bitmask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epu64_mask&expand=809)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm_cmpeq_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 {
    // Operands are viewed as u64 lanes before the lane-wise comparison.
    let (lhs, rhs) = (a.as_u64x2(), b.as_u64x2());
    let lanes: __m128i = simd_eq(lhs, rhs);
    simd_bitmask(lanes)
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a == b`, then clears every result bit whose corresponding bit in zeromask `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpeq_epu64_mask&expand=810)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm_mask_cmpeq_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let unmasked = _mm_cmpeq_epu64_mask(a, b);
    // Keep only the lanes selected by the zeromask.
    k1 & unmasked
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a != b` and returns the per-lane outcome as a bitmask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpneq_epu64_mask&expand=1118)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm512_cmpneq_epu64_mask(a: __m512i, b: __m512i) -> __mmask8 {
    // Operands are viewed as u64 lanes before the lane-wise comparison.
    let (lhs, rhs) = (a.as_u64x8(), b.as_u64x8());
    let lanes: __m512i = simd_ne(lhs, rhs);
    simd_bitmask(lanes)
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a != b`, then clears every result bit whose corresponding bit in zeromask `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpneq_epu64_mask&expand=1119)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm512_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
    let unmasked = _mm512_cmpneq_epu64_mask(a, b);
    // Keep only the lanes selected by the zeromask.
    k1 & unmasked
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a != b` and returns the per-lane outcome as a bitmask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpneq_epu64_mask&expand=1116)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm256_cmpneq_epu64_mask(a: __m256i, b: __m256i) -> __mmask8 {
    // Operands are viewed as u64 lanes before the lane-wise comparison.
    let (lhs, rhs) = (a.as_u64x4(), b.as_u64x4());
    let lanes: __m256i = simd_ne(lhs, rhs);
    simd_bitmask(lanes)
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a != b`, then clears every result bit whose corresponding bit in zeromask `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpneq_epu64_mask&expand=1117)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm256_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let unmasked = _mm256_cmpneq_epu64_mask(a, b);
    // Keep only the lanes selected by the zeromask.
    k1 & unmasked
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a != b` and returns the per-lane outcome as a bitmask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_epu64_mask&expand=1114)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm_cmpneq_epu64_mask(a: __m128i, b: __m128i) -> __mmask8 {
    // Operands are viewed as u64 lanes before the lane-wise comparison.
    let (lhs, rhs) = (a.as_u64x2(), b.as_u64x2());
    let lanes: __m128i = simd_ne(lhs, rhs);
    simd_bitmask(lanes)
}

/// Tests each pair of packed unsigned 64-bit integers in `a` and `b` for `a != b`, then clears every result bit whose corresponding bit in zeromask `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpneq_epu64_mask&expand=1115)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpuq
pub unsafe fn _mm_mask_cmpneq_epu64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let unmasked = _mm_cmpneq_epu64_mask(a, b);
    // Keep only the lanes selected by the zeromask.
    k1 & unmasked
}

/// Compares the packed unsigned 64-bit integers of `a` and `b` using the comparison predicate selected by the const generic `IMM3` (named `imm8` in Intel's documentation) and returns the result as a mask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epu64_mask&expand=727)
#[inline]
#[target_feature(enable = "avx512f")]
#[rustc_legacy_const_generics(2)]
#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
pub unsafe fn _mm512_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>(
    a: __m512i,
    b: __m512i,
) -> __mmask8 {
    // Compile-time check on the predicate immediate.
    static_assert_imm3!(IMM3);
    // -1 has every bit set, so the mask operand does not filter out any lane.
    let bits = vpcmpuq(a.as_i64x8(), b.as_i64x8(), IMM3, -1);
    transmute(bits)
}

/// Compares the packed unsigned 64-bit integers of `a` and `b` using the comparison predicate selected by the const generic `IMM3` (named `imm8` in Intel's documentation) and returns the result as a mask, with zeromask `k1` applied (result bits are zeroed when the corresponding bit of `k1` is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epu64_mask&expand=728)
#[inline]
#[target_feature(enable = "avx512f")]
#[rustc_legacy_const_generics(3)]
#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
pub unsafe fn _mm512_mask_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>(
    k1: __mmask8,
    a: __m512i,
    b: __m512i,
) -> __mmask8 {
    // Compile-time check on the predicate immediate.
    static_assert_imm3!(IMM3);
    // `k1 as i8` only reinterprets the mask bits to fit `vpcmpuq`'s parameter type.
    let bits = vpcmpuq(a.as_i64x8(), b.as_i64x8(), IMM3, k1 as i8);
    transmute(bits)
}

/// Compares the packed unsigned 64-bit integers of `a` and `b` using the comparison predicate selected by the const generic `IMM3` (named `imm8` in Intel's documentation) and returns the result as a mask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_epu64_mask&expand=725)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
pub unsafe fn _mm256_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>(
    a: __m256i,
    b: __m256i,
) -> __mmask8 {
    // Compile-time check on the predicate immediate.
    static_assert_imm3!(IMM3);
    // -1 has every bit set, so the mask operand does not filter out any lane.
    let bits = vpcmpuq256(a.as_i64x4(), b.as_i64x4(), IMM3, -1);
    transmute(bits)
}

/// Compares the packed unsigned 64-bit integers of `a` and `b` using the comparison predicate selected by the const generic `IMM3` (named `imm8` in Intel's documentation) and returns the result as a mask, with zeromask `k1` applied (result bits are zeroed when the corresponding bit of `k1` is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_epu64_mask&expand=726)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
pub unsafe fn _mm256_mask_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>(
    k1: __mmask8,
    a: __m256i,
    b: __m256i,
) -> __mmask8 {
    // Compile-time check on the predicate immediate.
    static_assert_imm3!(IMM3);
    // `k1 as i8` only reinterprets the mask bits to fit `vpcmpuq256`'s parameter type.
    let bits = vpcmpuq256(a.as_i64x4(), b.as_i64x4(), IMM3, k1 as i8);
    transmute(bits)
}

/// Compares the packed unsigned 64-bit integers of `a` and `b` using the comparison predicate selected by the const generic `IMM3` (named `imm8` in Intel's documentation) and returns the result as a mask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_epu64_mask&expand=723)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
pub unsafe fn _mm_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>(a: __m128i, b: __m128i) -> __mmask8 {
    // Compile-time check on the predicate immediate.
    static_assert_imm3!(IMM3);
    // -1 has every bit set, so the mask operand does not filter out any lane.
    let bits = vpcmpuq128(a.as_i64x2(), b.as_i64x2(), IMM3, -1);
    transmute(bits)
}

/// Compares the packed unsigned 64-bit integers of `a` and `b` using the comparison predicate selected by the const generic `IMM3` (named `imm8` in Intel's documentation) and returns the result as a mask, with zeromask `k1` applied (result bits are zeroed when the corresponding bit of `k1` is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_epu64_mask&expand=724)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
pub unsafe fn _mm_mask_cmp_epu64_mask<const IMM3: _MM_CMPINT_ENUM>(
    k1: __mmask8,
    a: __m128i,
    b: __m128i,
) -> __mmask8 {
    // Compile-time check on the predicate immediate.
    static_assert_imm3!(IMM3);
    // `k1 as i8` only reinterprets the mask bits to fit `vpcmpuq128`'s parameter type.
    let bits = vpcmpuq128(a.as_i64x2(), b.as_i64x2(), IMM3, k1 as i8);
    transmute(bits)
}

/// Tests each pair of packed signed 64-bit integers in `a` and `b` for `a < b` and returns the per-lane outcome as a bitmask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmplt_epi64_mask&expand=1037)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpq
pub unsafe fn _mm512_cmplt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
    // View both operands as i64 lanes so the comparison is signed.
    let (lhs, rhs) = (a.as_i64x8(), b.as_i64x8());
    let lanes: __m512i = simd_lt(lhs, rhs);
    simd_bitmask(lanes)
}

/// Tests each pair of packed signed 64-bit integers in `a` and `b` for `a < b`, then clears every result bit whose corresponding bit in zeromask `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmplt_epi64_mask&expand=1038)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpq
pub unsafe fn _mm512_mask_cmplt_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
    let unmasked = _mm512_cmplt_epi64_mask(a, b);
    // Keep only the lanes selected by the zeromask.
    k1 & unmasked
}

/// Tests each pair of packed signed 64-bit integers in `a` and `b` for `a < b` and returns the per-lane outcome as a bitmask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmplt_epi64_mask&expand=1035)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpq
pub unsafe fn _mm256_cmplt_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 {
    // View both operands as i64 lanes so the comparison is signed.
    let (lhs, rhs) = (a.as_i64x4(), b.as_i64x4());
    let lanes: __m256i = simd_lt(lhs, rhs);
    simd_bitmask(lanes)
}

/// Tests each pair of packed signed 64-bit integers in `a` and `b` for `a < b`, then clears every result bit whose corresponding bit in zeromask `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmplt_epi64_mask&expand=1036)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpq
pub unsafe fn _mm256_mask_cmplt_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let unmasked = _mm256_cmplt_epi64_mask(a, b);
    // Keep only the lanes selected by the zeromask.
    k1 & unmasked
}

/// Tests each pair of packed signed 64-bit integers in `a` and `b` for `a < b` and returns the per-lane outcome as a bitmask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_epi64_mask&expand=1033)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpq
pub unsafe fn _mm_cmplt_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 {
    // View both operands as i64 lanes so the comparison is signed.
    let (lhs, rhs) = (a.as_i64x2(), b.as_i64x2());
    let lanes: __m128i = simd_lt(lhs, rhs);
    simd_bitmask(lanes)
}

/// Tests each pair of packed signed 64-bit integers in `a` and `b` for `a < b`, then clears every result bit whose corresponding bit in zeromask `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmplt_epi64_mask&expand=1034)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpq
pub unsafe fn _mm_mask_cmplt_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let unmasked = _mm_cmplt_epi64_mask(a, b);
    // Keep only the lanes selected by the zeromask.
    k1 & unmasked
}

/// Tests each pair of packed signed 64-bit integers in `a` and `b` for `a > b` and returns the per-lane outcome as a bitmask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpgt_epi64_mask&expand=913)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpq
pub unsafe fn _mm512_cmpgt_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
    // View both operands as i64 lanes so the comparison is signed.
    let (lhs, rhs) = (a.as_i64x8(), b.as_i64x8());
    let lanes: __m512i = simd_gt(lhs, rhs);
    simd_bitmask(lanes)
}

/// Tests each pair of packed signed 64-bit integers in `a` and `b` for `a > b`, then clears every result bit whose corresponding bit in zeromask `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpgt_epi64_mask&expand=914)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpq
pub unsafe fn _mm512_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
    let unmasked = _mm512_cmpgt_epi64_mask(a, b);
    // Keep only the lanes selected by the zeromask.
    k1 & unmasked
}

/// Tests each pair of packed signed 64-bit integers in `a` and `b` for `a > b` and returns the per-lane outcome as a bitmask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpgt_epi64_mask&expand=911)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpq
pub unsafe fn _mm256_cmpgt_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 {
    // View both operands as i64 lanes so the comparison is signed.
    let (lhs, rhs) = (a.as_i64x4(), b.as_i64x4());
    let lanes: __m256i = simd_gt(lhs, rhs);
    simd_bitmask(lanes)
}

/// Tests each pair of packed signed 64-bit integers in `a` and `b` for `a > b`, then clears every result bit whose corresponding bit in zeromask `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpgt_epi64_mask&expand=912)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpq
pub unsafe fn _mm256_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let unmasked = _mm256_cmpgt_epi64_mask(a, b);
    // Keep only the lanes selected by the zeromask.
    k1 & unmasked
}

/// Tests each pair of packed signed 64-bit integers in `a` and `b` for `a > b` and returns the per-lane outcome as a bitmask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_epi64_mask&expand=909)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpq
pub unsafe fn _mm_cmpgt_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 {
    // View both operands as i64 lanes so the comparison is signed.
    let (lhs, rhs) = (a.as_i64x2(), b.as_i64x2());
    let lanes: __m128i = simd_gt(lhs, rhs);
    simd_bitmask(lanes)
}

/// Tests each pair of packed signed 64-bit integers in `a` and `b` for `a > b`, then clears every result bit whose corresponding bit in zeromask `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpgt_epi64_mask&expand=910)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpq
pub unsafe fn _mm_mask_cmpgt_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let unmasked = _mm_cmpgt_epi64_mask(a, b);
    // Keep only the lanes selected by the zeromask.
    k1 & unmasked
}

/// Tests each pair of packed signed 64-bit integers in `a` and `b` for `a <= b` and returns the per-lane outcome as a bitmask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmple_epi64_mask&expand=977)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpq
pub unsafe fn _mm512_cmple_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
    // View both operands as i64 lanes so the comparison is signed.
    let (lhs, rhs) = (a.as_i64x8(), b.as_i64x8());
    let lanes: __m512i = simd_le(lhs, rhs);
    simd_bitmask(lanes)
}

/// Tests each pair of packed signed 64-bit integers in `a` and `b` for `a <= b`, then clears every result bit whose corresponding bit in zeromask `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmple_epi64_mask&expand=978)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpq
pub unsafe fn _mm512_mask_cmple_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
    let unmasked = _mm512_cmple_epi64_mask(a, b);
    // Keep only the lanes selected by the zeromask.
    k1 & unmasked
}

/// Tests each pair of packed signed 64-bit integers in `a` and `b` for `a <= b` and returns the per-lane outcome as a bitmask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmple_epi64_mask&expand=975)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpq
pub unsafe fn _mm256_cmple_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 {
    // View both operands as i64 lanes so the comparison is signed.
    let (lhs, rhs) = (a.as_i64x4(), b.as_i64x4());
    let lanes: __m256i = simd_le(lhs, rhs);
    simd_bitmask(lanes)
}

/// Tests each pair of packed signed 64-bit integers in `a` and `b` for `a <= b`, then clears every result bit whose corresponding bit in zeromask `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmple_epi64_mask&expand=976)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpq
pub unsafe fn _mm256_mask_cmple_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let unmasked = _mm256_cmple_epi64_mask(a, b);
    // Keep only the lanes selected by the zeromask.
    k1 & unmasked
}

/// Tests each pair of packed signed 64-bit integers in `a` and `b` for `a <= b` and returns the per-lane outcome as a bitmask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_epi64_mask&expand=973)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpq
pub unsafe fn _mm_cmple_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 {
    // View both operands as i64 lanes so the comparison is signed.
    let (lhs, rhs) = (a.as_i64x2(), b.as_i64x2());
    let lanes: __m128i = simd_le(lhs, rhs);
    simd_bitmask(lanes)
}

/// Tests each pair of packed signed 64-bit integers in `a` and `b` for `a <= b`, then clears every result bit whose corresponding bit in zeromask `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmple_epi64_mask&expand=974)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpq
pub unsafe fn _mm_mask_cmple_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let unmasked = _mm_cmple_epi64_mask(a, b);
    // Keep only the lanes selected by the zeromask.
    k1 & unmasked
}

/// Tests each pair of packed signed 64-bit integers in `a` and `b` for `a >= b` and returns the per-lane outcome as a bitmask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpge_epi64_mask&expand=855)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpq
pub unsafe fn _mm512_cmpge_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
    // View both operands as i64 lanes so the comparison is signed.
    let (lhs, rhs) = (a.as_i64x8(), b.as_i64x8());
    let lanes: __m512i = simd_ge(lhs, rhs);
    simd_bitmask(lanes)
}

/// Tests each pair of packed signed 64-bit integers in `a` and `b` for `a >= b`, then clears every result bit whose corresponding bit in zeromask `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpge_epi64_mask&expand=856)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpq
pub unsafe fn _mm512_mask_cmpge_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
    let unmasked = _mm512_cmpge_epi64_mask(a, b);
    // Keep only the lanes selected by the zeromask.
    k1 & unmasked
}

/// Tests each pair of packed signed 64-bit integers in `a` and `b` for `a >= b` and returns the per-lane outcome as a bitmask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpge_epi64_mask&expand=853)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpq
pub unsafe fn _mm256_cmpge_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 {
    // View both operands as i64 lanes so the comparison is signed.
    let (lhs, rhs) = (a.as_i64x4(), b.as_i64x4());
    let lanes: __m256i = simd_ge(lhs, rhs);
    simd_bitmask(lanes)
}

/// Tests each pair of packed signed 64-bit integers in `a` and `b` for `a >= b`, then clears every result bit whose corresponding bit in zeromask `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpge_epi64_mask&expand=854)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpq
pub unsafe fn _mm256_mask_cmpge_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let unmasked = _mm256_cmpge_epi64_mask(a, b);
    // Keep only the lanes selected by the zeromask.
    k1 & unmasked
}

/// Tests each pair of packed signed 64-bit integers in `a` and `b` for `a >= b` and returns the per-lane outcome as a bitmask.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_epi64_mask&expand=851)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpq
pub unsafe fn _mm_cmpge_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 {
    // View both operands as i64 lanes so the comparison is signed.
    let (lhs, rhs) = (a.as_i64x2(), b.as_i64x2());
    let lanes: __m128i = simd_ge(lhs, rhs);
    simd_bitmask(lanes)
}

/// Tests each pair of packed signed 64-bit integers in `a` and `b` for `a >= b`, then clears every result bit whose corresponding bit in zeromask `k1` is not set.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpge_epi64_mask&expand=852)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] // should be vpcmpq
pub unsafe fn _mm_mask_cmpge_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let unmasked = _mm_cmpge_epi64_mask(a, b);
    // Keep only the lanes selected by the zeromask.
    k1 & unmasked
}

/// Tests every 64-bit lane of `a` against the matching lane of `b` for equality and packs the per-lane outcomes into mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpeq_epi64_mask&expand=787)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
pub unsafe fn _mm512_cmpeq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
    let lhs = a.as_i64x8();
    let rhs = b.as_i64x8();
    // Lane-wise `==`; the result vector is then collapsed to one bit per lane.
    let lanes: __m512i = simd_eq(lhs, rhs);
    simd_bitmask(lanes)
}

/// Equality comparison of the packed 64-bit integers in a and b under zeromask k1: the unmasked comparison mask is ANDed with k1, so bits whose k1 bit is clear come out as zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpeq_epi64_mask&expand=788)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
pub unsafe fn _mm512_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
    let unmasked = _mm512_cmpeq_epi64_mask(a, b);
    unmasked & k1
}

/// Tests every 64-bit lane of `a` against the matching lane of `b` for equality and packs the per-lane outcomes into mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpeq_epi64_mask&expand=785)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
pub unsafe fn _mm256_cmpeq_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 {
    let lhs = a.as_i64x4();
    let rhs = b.as_i64x4();
    // Lane-wise `==`; the result vector is then collapsed to one bit per lane.
    let lanes: __m256i = simd_eq(lhs, rhs);
    simd_bitmask(lanes)
}

/// Equality comparison of the packed 64-bit integers in a and b under zeromask k1: the unmasked comparison mask is ANDed with k1, so bits whose k1 bit is clear come out as zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpeq_epi64_mask&expand=786)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
pub unsafe fn _mm256_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let unmasked = _mm256_cmpeq_epi64_mask(a, b);
    unmasked & k1
}

/// Tests both 64-bit lanes of `a` against the matching lanes of `b` for equality and packs the per-lane outcomes into mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_epi64_mask&expand=783)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
pub unsafe fn _mm_cmpeq_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 {
    let lhs = a.as_i64x2();
    let rhs = b.as_i64x2();
    // Lane-wise `==`; the result vector is then collapsed to one bit per lane.
    let lanes: __m128i = simd_eq(lhs, rhs);
    simd_bitmask(lanes)
}

/// Equality comparison of the packed 64-bit integers in a and b under zeromask k1: the unmasked comparison mask is ANDed with k1, so bits whose k1 bit is clear come out as zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpeq_epi64_mask&expand=784)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
pub unsafe fn _mm_mask_cmpeq_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let unmasked = _mm_cmpeq_epi64_mask(a, b);
    unmasked & k1
}

/// Tests every 64-bit lane of `a` against the matching lane of `b` for not-equal and packs the per-lane outcomes into mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmpneq_epi64_mask&expand=1094)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
pub unsafe fn _mm512_cmpneq_epi64_mask(a: __m512i, b: __m512i) -> __mmask8 {
    let lhs = a.as_i64x8();
    let rhs = b.as_i64x8();
    // Lane-wise `!=`; the result vector is then collapsed to one bit per lane.
    let lanes: __m512i = simd_ne(lhs, rhs);
    simd_bitmask(lanes)
}

/// Not-equal comparison of the packed 64-bit integers in a and b under zeromask k1: the unmasked comparison mask is ANDed with k1, so bits whose k1 bit is clear come out as zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmpneq_epi64_mask&expand=1095)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
pub unsafe fn _mm512_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m512i, b: __m512i) -> __mmask8 {
    let unmasked = _mm512_cmpneq_epi64_mask(a, b);
    unmasked & k1
}

/// Tests every 64-bit lane of `a` against the matching lane of `b` for not-equal and packs the per-lane outcomes into mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmpneq_epi64_mask&expand=1092)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
pub unsafe fn _mm256_cmpneq_epi64_mask(a: __m256i, b: __m256i) -> __mmask8 {
    let lhs = a.as_i64x4();
    let rhs = b.as_i64x4();
    // Lane-wise `!=`; the result vector is then collapsed to one bit per lane.
    let lanes: __m256i = simd_ne(lhs, rhs);
    simd_bitmask(lanes)
}

/// Not-equal comparison of the packed 64-bit integers in a and b under zeromask k1: the unmasked comparison mask is ANDed with k1, so bits whose k1 bit is clear come out as zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmpneq_epi64_mask&expand=1093)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
pub unsafe fn _mm256_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m256i, b: __m256i) -> __mmask8 {
    let unmasked = _mm256_cmpneq_epi64_mask(a, b);
    unmasked & k1
}

/// Tests both 64-bit lanes of `a` against the matching lanes of `b` for not-equal and packs the per-lane outcomes into mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_epi64_mask&expand=1090)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
pub unsafe fn _mm_cmpneq_epi64_mask(a: __m128i, b: __m128i) -> __mmask8 {
    let lhs = a.as_i64x2();
    let rhs = b.as_i64x2();
    // Lane-wise `!=`; the result vector is then collapsed to one bit per lane.
    let lanes: __m128i = simd_ne(lhs, rhs);
    simd_bitmask(lanes)
}

/// Not-equal comparison of the packed 64-bit integers in a and b under zeromask k1: the unmasked comparison mask is ANDed with k1, so bits whose k1 bit is clear come out as zero.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmpneq_epi64_mask&expand=1091)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpcmp))] //should be vpcmpq
pub unsafe fn _mm_mask_cmpneq_epi64_mask(k1: __mmask8, a: __m128i, b: __m128i) -> __mmask8 {
    let unmasked = _mm_cmpneq_epi64_mask(a, b);
    unmasked & k1
}

/// Compare packed signed 64-bit integers in a and b using the comparison predicate selected by the const parameter `IMM3`, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_cmp_epi64_mask&expand=703)
#[inline]
#[target_feature(enable = "avx512f")]
#[rustc_legacy_const_generics(2)]
#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
pub unsafe fn _mm512_cmp_epi64_mask<const IMM3: _MM_CMPINT_ENUM>(
    a: __m512i,
    b: __m512i,
) -> __mmask8 {
    static_assert_imm3!(IMM3);
    // -1 sets every bit of the mask operand, so no lane is masked off.
    let bits = vpcmpq(a.as_i64x8(), b.as_i64x8(), IMM3, -1);
    transmute(bits)
}

/// Compare packed signed 64-bit integers in a and b using the comparison predicate selected by the const parameter `IMM3`, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cmp_epi64_mask&expand=704)
#[inline]
#[target_feature(enable = "avx512f")]
#[rustc_legacy_const_generics(3)]
#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
pub unsafe fn _mm512_mask_cmp_epi64_mask<const IMM3: _MM_CMPINT_ENUM>(
    k1: __mmask8,
    a: __m512i,
    b: __m512i,
) -> __mmask8 {
    static_assert_imm3!(IMM3);
    // The mask is handed to the intrinsic as a same-width signed byte.
    let mask_bits = k1 as i8;
    transmute(vpcmpq(a.as_i64x8(), b.as_i64x8(), IMM3, mask_bits))
}

/// Compare packed signed 64-bit integers in a and b using the comparison predicate selected by the const parameter `IMM3`, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cmp_epi64_mask&expand=701)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
pub unsafe fn _mm256_cmp_epi64_mask<const IMM3: _MM_CMPINT_ENUM>(
    a: __m256i,
    b: __m256i,
) -> __mmask8 {
    static_assert_imm3!(IMM3);
    // -1 sets every bit of the mask operand, so no lane is masked off.
    let bits = vpcmpq256(a.as_i64x4(), b.as_i64x4(), IMM3, -1);
    transmute(bits)
}

/// Compare packed signed 64-bit integers in a and b using the comparison predicate selected by the const parameter `IMM3`, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cmp_epi64_mask&expand=702)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
pub unsafe fn _mm256_mask_cmp_epi64_mask<const IMM3: _MM_CMPINT_ENUM>(
    k1: __mmask8,
    a: __m256i,
    b: __m256i,
) -> __mmask8 {
    static_assert_imm3!(IMM3);
    // The mask is handed to the intrinsic as a same-width signed byte.
    let mask_bits = k1 as i8;
    transmute(vpcmpq256(a.as_i64x4(), b.as_i64x4(), IMM3, mask_bits))
}

/// Compare packed signed 64-bit integers in a and b using the comparison predicate selected by the const parameter `IMM3`, and store the results in mask vector k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmp_epi64_mask&expand=699)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[rustc_legacy_const_generics(2)]
#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
pub unsafe fn _mm_cmp_epi64_mask<const IMM3: _MM_CMPINT_ENUM>(a: __m128i, b: __m128i) -> __mmask8 {
    static_assert_imm3!(IMM3);
    // -1 sets every bit of the mask operand, so no lane is masked off.
    let bits = vpcmpq128(a.as_i64x2(), b.as_i64x2(), IMM3, -1);
    transmute(bits)
}

/// Compare packed signed 64-bit integers in a and b using the comparison predicate selected by the const parameter `IMM3`, and store the results in mask vector k using zeromask k1 (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cmp_epi64_mask&expand=700)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[rustc_legacy_const_generics(3)]
#[cfg_attr(test, assert_instr(vpcmp, IMM3 = 0))]
pub unsafe fn _mm_mask_cmp_epi64_mask<const IMM3: _MM_CMPINT_ENUM>(
    k1: __mmask8,
    a: __m128i,
    b: __m128i,
) -> __mmask8 {
    static_assert_imm3!(IMM3);
    // The mask is handed to the intrinsic as a same-width signed byte.
    let mask_bits = k1 as i8;
    transmute(vpcmpq128(a.as_i64x2(), b.as_i64x2(), IMM3, mask_bits))
}

/// Adds together the sixteen packed 32-bit integers in a and returns the resulting sum.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_epi32&expand=4556)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_reduce_add_epi32(a: __m512i) -> i32 {
    let lanes = a.as_i32x16();
    simd_reduce_add_unordered(lanes)
}

/// Adds together the packed 32-bit integers in a that are selected by mask k and returns the sum of those active elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_add_epi32&expand=4555)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_reduce_add_epi32(k: __mmask16, a: __m512i) -> i32 {
    let lanes = a.as_i32x16();
    // Inactive lanes become 0, which leaves a sum unchanged.
    let fill = _mm512_setzero_si512().as_i32x16();
    simd_reduce_add_unordered(simd_select_bitmask(k, lanes, fill))
}

/// Adds together the eight packed 64-bit integers in a and returns the resulting sum.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_epi64&expand=4558)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_reduce_add_epi64(a: __m512i) -> i64 {
    let lanes = a.as_i64x8();
    simd_reduce_add_unordered(lanes)
}

/// Adds together the packed 64-bit integers in a that are selected by mask k and returns the sum of those active elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_add_epi64&expand=4557)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_reduce_add_epi64(k: __mmask8, a: __m512i) -> i64 {
    let lanes = a.as_i64x8();
    // Inactive lanes become 0, which leaves a sum unchanged.
    let fill = _mm512_setzero_si512().as_i64x8();
    simd_reduce_add_unordered(simd_select_bitmask(k, lanes, fill))
}

/// Adds together the sixteen packed single-precision (32-bit) floating-point elements in a and returns the resulting sum.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_ps&expand=4562)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_reduce_add_ps(a: __m512) -> f32 {
    let lanes = a.as_f32x16();
    simd_reduce_add_unordered(lanes)
}

/// Adds together the packed single-precision (32-bit) floating-point elements in a that are selected by mask k and returns the sum of those active elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_add_ps&expand=4561)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_reduce_add_ps(k: __mmask16, a: __m512) -> f32 {
    let lanes = a.as_f32x16();
    // Inactive lanes are replaced by zero before summing.
    let fill = _mm512_setzero_ps().as_f32x16();
    simd_reduce_add_unordered(simd_select_bitmask(k, lanes, fill))
}

/// Adds together the eight packed double-precision (64-bit) floating-point elements in a and returns the resulting sum.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_add_pd&expand=4560)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_reduce_add_pd(a: __m512d) -> f64 {
    let lanes = a.as_f64x8();
    simd_reduce_add_unordered(lanes)
}

/// Adds together the packed double-precision (64-bit) floating-point elements in a that are selected by mask k and returns the sum of those active elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_add_pd&expand=4559)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_reduce_add_pd(k: __mmask8, a: __m512d) -> f64 {
    let lanes = a.as_f64x8();
    // Inactive lanes are replaced by zero before summing.
    let fill = _mm512_setzero_pd().as_f64x8();
    simd_reduce_add_unordered(simd_select_bitmask(k, lanes, fill))
}

/// Multiplies together the sixteen packed 32-bit integers in a and returns the resulting product.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_epi32&expand=4600)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_reduce_mul_epi32(a: __m512i) -> i32 {
    let lanes = a.as_i32x16();
    simd_reduce_mul_unordered(lanes)
}

/// Multiplies together the packed 32-bit integers in a that are selected by mask k and returns the product of those active elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_mul_epi32&expand=4599)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_reduce_mul_epi32(k: __mmask16, a: __m512i) -> i32 {
    let lanes = a.as_i32x16();
    // Inactive lanes become 1, which leaves a product unchanged.
    let fill = _mm512_set1_epi32(1).as_i32x16();
    simd_reduce_mul_unordered(simd_select_bitmask(k, lanes, fill))
}

/// Multiplies together the eight packed 64-bit integers in a and returns the resulting product.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_epi64&expand=4602)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_reduce_mul_epi64(a: __m512i) -> i64 {
    let lanes = a.as_i64x8();
    simd_reduce_mul_unordered(lanes)
}

/// Multiplies together the packed 64-bit integers in a that are selected by mask k and returns the product of those active elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_mul_epi64&expand=4601)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_reduce_mul_epi64(k: __mmask8, a: __m512i) -> i64 {
    let lanes = a.as_i64x8();
    // Inactive lanes become 1, which leaves a product unchanged.
    let fill = _mm512_set1_epi64(1).as_i64x8();
    simd_reduce_mul_unordered(simd_select_bitmask(k, lanes, fill))
}

/// Multiplies together the sixteen packed single-precision (32-bit) floating-point elements in a and returns the resulting product.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_ps&expand=4606)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_reduce_mul_ps(a: __m512) -> f32 {
    let lanes = a.as_f32x16();
    simd_reduce_mul_unordered(lanes)
}

/// Multiplies together the packed single-precision (32-bit) floating-point elements in a that are selected by mask k and returns the product of those active elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_mul_ps&expand=4605)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_reduce_mul_ps(k: __mmask16, a: __m512) -> f32 {
    let lanes = a.as_f32x16();
    // Inactive lanes become 1.0, which leaves a product unchanged.
    let fill = _mm512_set1_ps(1.).as_f32x16();
    simd_reduce_mul_unordered(simd_select_bitmask(k, lanes, fill))
}

/// Multiplies together the eight packed double-precision (64-bit) floating-point elements in a and returns the resulting product.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_mul_pd&expand=4604)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_reduce_mul_pd(a: __m512d) -> f64 {
    let lanes = a.as_f64x8();
    simd_reduce_mul_unordered(lanes)
}

/// Multiplies together the packed double-precision (64-bit) floating-point elements in a that are selected by mask k and returns the product of those active elements.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_mul_pd&expand=4603)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_reduce_mul_pd(k: __mmask8, a: __m512d) -> f64 {
    let lanes = a.as_f64x8();
    // Inactive lanes become 1.0, which leaves a product unchanged.
    let fill = _mm512_set1_pd(1.).as_f64x8();
    simd_reduce_mul_unordered(simd_select_bitmask(k, lanes, fill))
}

/// Returns the largest of the sixteen packed signed 32-bit integers in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epi32&expand=4576)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_reduce_max_epi32(a: __m512i) -> i32 {
    let lanes = a.as_i32x16();
    simd_reduce_max(lanes)
}

/// Reduce the packed signed 32-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epi32&expand=4575)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_reduce_max_epi32(k: __mmask16, a: __m512i) -> i32 {
    simd_reduce_max(simd_select_bitmask(
        k,
        a.as_i32x16(),
        _mm512_undefined_epi32().as_i32x16(),
    ))
}

/// Returns the largest of the eight packed signed 64-bit integers in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epi64&expand=4578)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_reduce_max_epi64(a: __m512i) -> i64 {
    let lanes = a.as_i64x8();
    simd_reduce_max(lanes)
}

/// Reduce the packed signed 64-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.
///
/// Lanes whose bit in `k` is clear are replaced by `i64::MIN`, the identity for signed maximum, so they can never win the reduction. If no lane is active the result is `i64::MIN`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epi64&expand=4577)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_reduce_max_epi64(k: __mmask8, a: __m512i) -> i64 {
    simd_reduce_max(simd_select_bitmask(
        k,
        a.as_i64x8(),
        // A fill of 0 would beat the active lanes whenever they are all negative.
        _mm512_set1_epi64(i64::MIN).as_i64x8(),
    ))
}

/// Returns the largest of the sixteen packed unsigned 32-bit integers in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epu32&expand=4580)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_reduce_max_epu32(a: __m512i) -> u32 {
    let lanes = a.as_u32x16();
    simd_reduce_max(lanes)
}

/// Reduce the packed unsigned 32-bit integers in a by maximum using mask k. Returns the maximum of all active elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epu32&expand=4579)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_reduce_max_epu32(k: __mmask16, a: __m512i) -> u32 {
    simd_reduce_max(simd_select_bitmask(
        k,
        a.as_u32x16(),
        _mm512_undefined_epi32().as_u32x16(),
    ))
}

/// Returns the largest of the eight packed unsigned 64-bit integers in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_epu64&expand=4582)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_reduce_max_epu64(a: __m512i) -> u64 {
    let lanes = a.as_u64x8();
    simd_reduce_max(lanes)
}

/// Returns the largest of the packed unsigned 64-bit integers in a that are selected by mask k.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_epu64&expand=4581)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_reduce_max_epu64(k: __mmask8, a: __m512i) -> u64 {
    let lanes = a.as_u64x8();
    // Inactive lanes become 0, which no unsigned value is smaller than.
    let fill = _mm512_set1_epi64(0).as_u64x8();
    simd_reduce_max(simd_select_bitmask(k, lanes, fill))
}

/// Returns the largest of the sixteen packed single-precision (32-bit) floating-point elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_ps&expand=4586)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_reduce_max_ps(a: __m512) -> f32 {
    let lanes = a.as_f32x16();
    simd_reduce_max(lanes)
}

/// Reduce the packed single-precision (32-bit) floating-point elements in a by maximum using mask k. Returns the maximum of all active elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_ps&expand=4585)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_reduce_max_ps(k: __mmask16, a: __m512) -> f32 {
    simd_reduce_max(simd_select_bitmask(
        k,
        a.as_f32x16(),
        _mm512_undefined_ps().as_f32x16(),
    ))
}

/// Returns the largest of the eight packed double-precision (64-bit) floating-point elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_max_pd&expand=4584)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_reduce_max_pd(a: __m512d) -> f64 {
    let lanes = a.as_f64x8();
    simd_reduce_max(lanes)
}

/// Reduce the packed double-precision (64-bit) floating-point elements in a by maximum using mask k. Returns the maximum of all active elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_max_pd&expand=4583)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_reduce_max_pd(k: __mmask8, a: __m512d) -> f64 {
    simd_reduce_max(simd_select_bitmask(
        k,
        a.as_f64x8(),
        _mm512_undefined_pd().as_f64x8(),
    ))
}

/// Returns the smallest of the sixteen packed signed 32-bit integers in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epi32&expand=4588)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_reduce_min_epi32(a: __m512i) -> i32 {
    let lanes = a.as_i32x16();
    simd_reduce_min(lanes)
}

/// Reduce the packed signed 32-bit integers in a by maximum using mask k. Returns the minimum of all active elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epi32&expand=4587)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_reduce_min_epi32(k: __mmask16, a: __m512i) -> i32 {
    simd_reduce_min(simd_select_bitmask(
        k,
        a.as_i32x16(),
        _mm512_undefined_epi32().as_i32x16(),
    ))
}

/// Returns the smallest of the eight packed signed 64-bit integers in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epi64&expand=4590)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_reduce_min_epi64(a: __m512i) -> i64 {
    let lanes = a.as_i64x8();
    simd_reduce_min(lanes)
}

/// Reduce the packed signed 64-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
///
/// Lanes whose bit in `k` is clear are replaced by `i64::MAX`, the identity for signed minimum, so they can never win the reduction. If no lane is active the result is `i64::MAX`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epi64&expand=4589)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_reduce_min_epi64(k: __mmask8, a: __m512i) -> i64 {
    simd_reduce_min(simd_select_bitmask(
        k,
        a.as_i64x8(),
        // A fill of 0 would undercut the active lanes whenever they are all positive.
        _mm512_set1_epi64(i64::MAX).as_i64x8(),
    ))
}

/// Returns the smallest of the sixteen packed unsigned 32-bit integers in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epu32&expand=4592)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_reduce_min_epu32(a: __m512i) -> u32 {
    let lanes = a.as_u32x16();
    simd_reduce_min(lanes)
}

/// Reduce the packed unsigned 32-bit integers in a by maximum using mask k. Returns the minimum of all active elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epu32&expand=4591)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_reduce_min_epu32(k: __mmask16, a: __m512i) -> u32 {
    simd_reduce_min(simd_select_bitmask(
        k,
        a.as_u32x16(),
        _mm512_undefined_epi32().as_u32x16(),
    ))
}

/// Returns the smallest of the eight packed unsigned 64-bit integers in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_epu64&expand=4594)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_reduce_min_epu64(a: __m512i) -> u64 {
    let lanes = a.as_u64x8();
    simd_reduce_min(lanes)
}

/// Reduce the packed unsigned 64-bit integers in a by minimum using mask k. Returns the minimum of all active elements in a.
///
/// Lanes whose bit in `k` is clear are replaced by `u64::MAX` (all bits set), the identity for unsigned minimum, so they can never win the reduction. If no lane is active the result is `u64::MAX`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_epu64&expand=4593)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_reduce_min_epu64(k: __mmask8, a: __m512i) -> u64 {
    simd_reduce_min(simd_select_bitmask(
        k,
        a.as_u64x8(),
        // -1 as i64 is all ones, i.e. u64::MAX once reinterpreted; a zero fill would force the result to 0.
        _mm512_set1_epi64(-1).as_u64x8(),
    ))
}

/// Returns the smallest of the sixteen packed single-precision (32-bit) floating-point elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_ps&expand=4598)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_reduce_min_ps(a: __m512) -> f32 {
    let lanes = a.as_f32x16();
    simd_reduce_min(lanes)
}

/// Reduce the packed single-precision (32-bit) floating-point elements in a by maximum using mask k. Returns the minimum of all active elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_ps&expand=4597)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_reduce_min_ps(k: __mmask16, a: __m512) -> f32 {
    simd_reduce_min(simd_select_bitmask(
        k,
        a.as_f32x16(),
        _mm512_undefined_ps().as_f32x16(),
    ))
}

/// Returns the smallest of the eight packed double-precision (64-bit) floating-point elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_min_pd&expand=4596)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_reduce_min_pd(a: __m512d) -> f64 {
    let lanes = a.as_f64x8();
    simd_reduce_min(lanes)
}

/// Reduce the packed double-precision (64-bit) floating-point elements in a by maximum using mask k. Returns the minimum of all active elements in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_min_pd&expand=4595)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_reduce_min_pd(k: __mmask8, a: __m512d) -> f64 {
    simd_reduce_min(simd_select_bitmask(
        k,
        a.as_f64x8(),
        _mm512_undefined_pd().as_f64x8(),
    ))
}

/// Combines the sixteen packed 32-bit integers in a with bitwise AND and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_and_epi32&expand=4564)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_reduce_and_epi32(a: __m512i) -> i32 {
    let lanes = a.as_i32x16();
    simd_reduce_and(lanes)
}

/// Reduce the packed 32-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a.
///
/// Lanes whose bit in `k` is clear are replaced by all ones (`-1`), the identity for bitwise AND, so they cannot clear any bit of the result. If no lane is active the result is `-1`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_and_epi32&expand=4563)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_reduce_and_epi32(k: __mmask16, a: __m512i) -> i32 {
    simd_reduce_and(simd_select_bitmask(
        k,
        a.as_i32x16(),
        // A fill of 0xFF would zero bits 8..31 of the result as soon as one lane is inactive.
        _mm512_set1_epi32(-1).as_i32x16(),
    ))
}

/// Combines the eight packed 64-bit integers in a with bitwise AND and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_and_epi64&expand=4566)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_reduce_and_epi64(a: __m512i) -> i64 {
    let lanes = a.as_i64x8();
    simd_reduce_and(lanes)
}

/// Reduce the packed 64-bit integers in a by bitwise AND using mask k. Returns the bitwise AND of all active elements in a.
///
/// Lanes whose bit in `k` is clear are replaced by all ones (`-1`), the identity for bitwise AND, so they cannot clear any bit of the result. If no lane is active the result is `-1`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_and_epi64&expand=4565)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_reduce_and_epi64(k: __mmask8, a: __m512i) -> i64 {
    simd_reduce_and(simd_select_bitmask(
        k,
        a.as_i64x8(),
        // A fill of 0xFF would zero bits 8..63 of the result as soon as one lane is inactive.
        _mm512_set1_epi64(-1).as_i64x8(),
    ))
}

/// Combines the sixteen packed 32-bit integers in a with bitwise OR and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_or_epi32&expand=4608)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_reduce_or_epi32(a: __m512i) -> i32 {
    let lanes = a.as_i32x16();
    simd_reduce_or(lanes)
}

/// Masked bitwise-OR reduction over the packed 32-bit integers of `a`.
///
/// Only elements whose bit is set in `k` are active; the bitwise OR of the
/// active elements is returned.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_or_epi32&expand=4607)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_reduce_or_epi32(k: __mmask16, a: __m512i) -> i32 {
    // Inactive lanes are swapped for zero so they contribute no bits to the OR.
    let neutral = _mm512_setzero_si512().as_i32x16();
    let active = simd_select_bitmask(k, a.as_i32x16(), neutral);
    simd_reduce_or(active)
}

/// Bitwise-OR reduction over the packed 64-bit integers of `a`.
///
/// Every element of `a` takes part; the single combined value is returned.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_reduce_or_epi64&expand=4610)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_reduce_or_epi64(a: __m512i) -> i64 {
    let lanes = a.as_i64x8();
    simd_reduce_or(lanes)
}

/// Masked bitwise-OR reduction over the packed 64-bit integers of `a`.
///
/// Only elements whose bit is set in `k` are active; the bitwise OR of the
/// active elements is returned.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_reduce_or_epi64&expand=4609)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_reduce_or_epi64(k: __mmask8, a: __m512i) -> i64 {
    // Inactive lanes are swapped for zero so they contribute no bits to the OR.
    let neutral = _mm512_setzero_si512().as_i64x8();
    let active = simd_select_bitmask(k, a.as_i64x8(), neutral);
    simd_reduce_or(active)
}

/// Produces a `__m512d` whose element values callers must not rely on.
///
/// This implementation builds the vector with `_mm512_set1_pd(0.0)`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_pd)
#[inline]
#[target_feature(enable = "avx512f")]
// This intrinsic has no corresponding instruction.
pub unsafe fn _mm512_undefined_pd() -> __m512d {
    let fill: f64 = 0.0;
    _mm512_set1_pd(fill)
}

/// Produces a `__m512` whose element values callers must not rely on.
///
/// This implementation builds the vector with `_mm512_set1_ps(0.0)`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_ps)
#[inline]
#[target_feature(enable = "avx512f")]
// This intrinsic has no corresponding instruction.
pub unsafe fn _mm512_undefined_ps() -> __m512 {
    let fill: f32 = 0.0;
    _mm512_set1_ps(fill)
}

/// Produces a `__m512i` whose element values callers must not rely on.
///
/// This implementation builds the vector with `_mm512_set1_epi32(0)`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined_epi32&expand=5995)
#[inline]
#[target_feature(enable = "avx512f")]
// This intrinsic has no corresponding instruction.
pub unsafe fn _mm512_undefined_epi32() -> __m512i {
    let fill: i32 = 0;
    _mm512_set1_epi32(fill)
}

/// Produces a `__m512` whose element values callers must not rely on.
///
/// This implementation builds the vector with `_mm512_set1_ps(0.0)`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_undefined&expand=5994)
#[inline]
#[target_feature(enable = "avx512f")]
// This intrinsic has no corresponding instruction.
pub unsafe fn _mm512_undefined() -> __m512 {
    let fill: f32 = 0.0;
    _mm512_set1_ps(fill)
}

/// Reads 512 bits (16 packed 32-bit integers) from `mem_addr` and returns them.
///
/// `mem_addr` does not need to be aligned on any particular boundary; the read
/// is performed as an unaligned read.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_epi32&expand=3377)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32
pub unsafe fn _mm512_loadu_epi32(mem_addr: *const i32) -> __m512i {
    let src = mem_addr as *const __m512i;
    ptr::read_unaligned(src)
}

/// Reads 256 bits (8 packed 32-bit integers) from `mem_addr` and returns them.
///
/// `mem_addr` does not need to be aligned on any particular boundary; the read
/// is performed as an unaligned read.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_epi32&expand=3374)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32
pub unsafe fn _mm256_loadu_epi32(mem_addr: *const i32) -> __m256i {
    let src = mem_addr as *const __m256i;
    ptr::read_unaligned(src)
}

/// Reads 128 bits (4 packed 32-bit integers) from `mem_addr` and returns them.
///
/// `mem_addr` does not need to be aligned on any particular boundary; the read
/// is performed as an unaligned read.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_epi32&expand=3371)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32
pub unsafe fn _mm_loadu_epi32(mem_addr: *const i32) -> __m128i {
    let src = mem_addr as *const __m128i;
    ptr::read_unaligned(src)
}

/// Narrows each packed 32-bit integer in `a` to 16 bits by truncation and stores
/// the active results (those whose bit is set in writemask `k`) to unaligned
/// memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_storeu_epi16&expand=1460)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovdw))]
pub unsafe fn _mm512_mask_cvtepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask16, a: __m512i) {
    let lanes = a.as_i32x16();
    vpmovdwmem(mem_addr, lanes, k);
}

/// Narrows each packed 32-bit integer in `a` to 16 bits by truncation and stores
/// the active results (those whose bit is set in writemask `k`) to unaligned
/// memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_storeu_epi16)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovdw))]
pub unsafe fn _mm256_mask_cvtepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
    let lanes = a.as_i32x8();
    vpmovdwmem256(mem_addr, lanes, k);
}

/// Narrows each packed 32-bit integer in `a` to 16 bits by truncation and stores
/// the active results (those whose bit is set in writemask `k`) to unaligned
/// memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_storeu_epi16)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovdw))]
pub unsafe fn _mm_mask_cvtepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
    let lanes = a.as_i32x4();
    vpmovdwmem128(mem_addr, lanes, k);
}

/// Narrows each packed signed 32-bit integer in `a` to 16 bits with signed
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi32_storeu_epi16&expand=1833)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsdw))]
pub unsafe fn _mm512_mask_cvtsepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask16, a: __m512i) {
    let lanes = a.as_i32x16();
    vpmovsdwmem(mem_addr, lanes, k);
}

/// Narrows each packed signed 32-bit integer in `a` to 16 bits with signed
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi32_storeu_epi16&expand=1832)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsdw))]
pub unsafe fn _mm256_mask_cvtsepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
    let lanes = a.as_i32x8();
    vpmovsdwmem256(mem_addr, lanes, k);
}

/// Narrows each packed signed 32-bit integer in `a` to 16 bits with signed
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi32_storeu_epi16&expand=1831)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsdw))]
pub unsafe fn _mm_mask_cvtsepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
    let lanes = a.as_i32x4();
    vpmovsdwmem128(mem_addr, lanes, k);
}

/// Narrows each packed unsigned 32-bit integer in `a` to 16 bits with unsigned
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi32_storeu_epi16&expand=2068)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovusdw))]
pub unsafe fn _mm512_mask_cvtusepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask16, a: __m512i) {
    let lanes = a.as_i32x16();
    vpmovusdwmem(mem_addr, lanes, k);
}

/// Narrows each packed unsigned 32-bit integer in `a` to an unsigned 16-bit
/// integer with unsigned saturation and stores the active results (those whose
/// bit is set in writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi32_storeu_epi16&expand=2067)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusdw))]
pub unsafe fn _mm256_mask_cvtusepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
    let lanes = a.as_i32x8();
    vpmovusdwmem256(mem_addr, lanes, k);
}

/// Narrows each packed unsigned 32-bit integer in `a` to an unsigned 16-bit
/// integer with unsigned saturation and stores the active results (those whose
/// bit is set in writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi32_storeu_epi16&expand=2066)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusdw))]
pub unsafe fn _mm_mask_cvtusepi32_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
    let lanes = a.as_i32x4();
    vpmovusdwmem128(mem_addr, lanes, k);
}

/// Narrows each packed 32-bit integer in `a` to 8 bits by truncation and stores
/// the active results (those whose bit is set in writemask `k`) to unaligned
/// memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi32_storeu_epi8&expand=1463)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovdb))]
pub unsafe fn _mm512_mask_cvtepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m512i) {
    let lanes = a.as_i32x16();
    vpmovdbmem(mem_addr, lanes, k);
}

/// Narrows each packed 32-bit integer in `a` to 8 bits by truncation and stores
/// the active results (those whose bit is set in writemask `k`) to unaligned
/// memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi32_storeu_epi8&expand=1462)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovdb))]
pub unsafe fn _mm256_mask_cvtepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
    let lanes = a.as_i32x8();
    vpmovdbmem256(mem_addr, lanes, k);
}

/// Narrows each packed 32-bit integer in `a` to 8 bits by truncation and stores
/// the active results (those whose bit is set in writemask `k`) to unaligned
/// memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi32_storeu_epi8&expand=1461)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovdb))]
pub unsafe fn _mm_mask_cvtepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
    let lanes = a.as_i32x4();
    vpmovdbmem128(mem_addr, lanes, k);
}

/// Narrows each packed signed 32-bit integer in `a` to 8 bits with signed
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi32_storeu_epi8&expand=1836)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsdb))]
pub unsafe fn _mm512_mask_cvtsepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m512i) {
    let lanes = a.as_i32x16();
    vpmovsdbmem(mem_addr, lanes, k);
}

/// Narrows each packed signed 32-bit integer in `a` to 8 bits with signed
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi32_storeu_epi8&expand=1835)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsdb))]
pub unsafe fn _mm256_mask_cvtsepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
    let lanes = a.as_i32x8();
    vpmovsdbmem256(mem_addr, lanes, k);
}

/// Narrows each packed signed 32-bit integer in `a` to 8 bits with signed
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi32_storeu_epi8&expand=1834)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsdb))]
pub unsafe fn _mm_mask_cvtsepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
    let lanes = a.as_i32x4();
    vpmovsdbmem128(mem_addr, lanes, k);
}

/// Narrows each packed unsigned 32-bit integer in `a` to 8 bits with unsigned
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi32_storeu_epi8&expand=2071)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovusdb))]
pub unsafe fn _mm512_mask_cvtusepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask16, a: __m512i) {
    let lanes = a.as_i32x16();
    vpmovusdbmem(mem_addr, lanes, k);
}

/// Narrows each packed unsigned 32-bit integer in `a` to 8 bits with unsigned
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi32_storeu_epi8&expand=2070)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusdb))]
pub unsafe fn _mm256_mask_cvtusepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
    let lanes = a.as_i32x8();
    vpmovusdbmem256(mem_addr, lanes, k);
}

/// Narrows each packed unsigned 32-bit integer in `a` to 8 bits with unsigned
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi32_storeu_epi8&expand=2069)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusdb))]
pub unsafe fn _mm_mask_cvtusepi32_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
    let lanes = a.as_i32x4();
    vpmovusdbmem128(mem_addr, lanes, k);
}

/// Narrows each packed 64-bit integer in `a` to 16 bits by truncation and stores
/// the active results (those whose bit is set in writemask `k`) to unaligned
/// memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_storeu_epi16&expand=1513)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovqw))]
pub unsafe fn _mm512_mask_cvtepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m512i) {
    let lanes = a.as_i64x8();
    vpmovqwmem(mem_addr, lanes, k);
}

/// Narrows each packed 64-bit integer in `a` to 16 bits by truncation and stores
/// the active results (those whose bit is set in writemask `k`) to unaligned
/// memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi64_storeu_epi16&expand=1512)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovqw))]
pub unsafe fn _mm256_mask_cvtepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
    let lanes = a.as_i64x4();
    vpmovqwmem256(mem_addr, lanes, k);
}

/// Narrows each packed 64-bit integer in `a` to 16 bits by truncation and stores
/// the active results (those whose bit is set in writemask `k`) to unaligned
/// memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi64_storeu_epi16&expand=1511)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovqw))]
pub unsafe fn _mm_mask_cvtepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
    let lanes = a.as_i64x2();
    vpmovqwmem128(mem_addr, lanes, k);
}

/// Narrows each packed signed 64-bit integer in `a` to 16 bits with signed
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi64_storeu_epi16&expand=1866)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsqw))]
pub unsafe fn _mm512_mask_cvtsepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m512i) {
    let lanes = a.as_i64x8();
    vpmovsqwmem(mem_addr, lanes, k);
}

/// Narrows each packed signed 64-bit integer in `a` to 16 bits with signed
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi64_storeu_epi16&expand=1865)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsqw))]
pub unsafe fn _mm256_mask_cvtsepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
    let lanes = a.as_i64x4();
    vpmovsqwmem256(mem_addr, lanes, k);
}

/// Narrows each packed signed 64-bit integer in `a` to 16 bits with signed
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi64_storeu_epi16&expand=1864)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsqw))]
pub unsafe fn _mm_mask_cvtsepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
    let lanes = a.as_i64x2();
    vpmovsqwmem128(mem_addr, lanes, k);
}

/// Narrows each packed unsigned 64-bit integer in `a` to 16 bits with unsigned
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi64_storeu_epi16&expand=2101)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovusqw))]
pub unsafe fn _mm512_mask_cvtusepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m512i) {
    let lanes = a.as_i64x8();
    vpmovusqwmem(mem_addr, lanes, k);
}

/// Narrows each packed unsigned 64-bit integer in `a` to 16 bits with unsigned
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi64_storeu_epi16&expand=2100)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusqw))]
pub unsafe fn _mm256_mask_cvtusepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
    let lanes = a.as_i64x4();
    vpmovusqwmem256(mem_addr, lanes, k);
}

/// Narrows each packed unsigned 64-bit integer in `a` to 16 bits with unsigned
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi64_storeu_epi16&expand=2099)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusqw))]
pub unsafe fn _mm_mask_cvtusepi64_storeu_epi16(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
    let lanes = a.as_i64x2();
    vpmovusqwmem128(mem_addr, lanes, k);
}

/// Narrows each packed 64-bit integer in `a` to 8 bits by truncation and stores
/// the active results (those whose bit is set in writemask `k`) to unaligned
/// memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_storeu_epi8&expand=1519)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovqb))]
pub unsafe fn _mm512_mask_cvtepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m512i) {
    let lanes = a.as_i64x8();
    vpmovqbmem(mem_addr, lanes, k);
}

/// Narrows each packed 64-bit integer in `a` to 8 bits by truncation and stores
/// the active results (those whose bit is set in writemask `k`) to unaligned
/// memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi64_storeu_epi8&expand=1518)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovqb))]
pub unsafe fn _mm256_mask_cvtepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
    let lanes = a.as_i64x4();
    vpmovqbmem256(mem_addr, lanes, k);
}

/// Narrows each packed 64-bit integer in `a` to 8 bits by truncation and stores
/// the active results (those whose bit is set in writemask `k`) to unaligned
/// memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi64_storeu_epi8&expand=1517)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovqb))]
pub unsafe fn _mm_mask_cvtepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
    let lanes = a.as_i64x2();
    vpmovqbmem128(mem_addr, lanes, k);
}

/// Narrows each packed signed 64-bit integer in `a` to 8 bits with signed
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi64_storeu_epi8&expand=1872)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsqb))]
pub unsafe fn _mm512_mask_cvtsepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m512i) {
    let lanes = a.as_i64x8();
    vpmovsqbmem(mem_addr, lanes, k);
}

/// Narrows each packed signed 64-bit integer in `a` to 8 bits with signed
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi64_storeu_epi8&expand=1871)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsqb))]
pub unsafe fn _mm256_mask_cvtsepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
    let lanes = a.as_i64x4();
    vpmovsqbmem256(mem_addr, lanes, k);
}

/// Narrows each packed signed 64-bit integer in `a` to 8 bits with signed
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi64_storeu_epi8&expand=1870)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsqb))]
pub unsafe fn _mm_mask_cvtsepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
    let lanes = a.as_i64x2();
    vpmovsqbmem128(mem_addr, lanes, k);
}

/// Narrows each packed unsigned 64-bit integer in `a` to 8 bits with unsigned
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi64_storeu_epi8&expand=2107)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovusqb))]
pub unsafe fn _mm512_mask_cvtusepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m512i) {
    let lanes = a.as_i64x8();
    vpmovusqbmem(mem_addr, lanes, k);
}

/// Narrows each packed unsigned 64-bit integer in `a` to 8 bits with unsigned
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi64_storeu_epi8&expand=2106)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusqb))]
pub unsafe fn _mm256_mask_cvtusepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
    let lanes = a.as_i64x4();
    vpmovusqbmem256(mem_addr, lanes, k);
}

/// Narrows each packed unsigned 64-bit integer in `a` to 8 bits with unsigned
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi64_storeu_epi8&expand=2105)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusqb))]
pub unsafe fn _mm_mask_cvtusepi64_storeu_epi8(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
    let lanes = a.as_i64x2();
    vpmovusqbmem128(mem_addr, lanes, k);
}

/// Narrows each packed 64-bit integer in `a` to 32 bits by truncation and stores
/// the active results (those whose bit is set in writemask `k`) to unaligned
/// memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtepi64_storeu_epi32&expand=1516)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovqd))]
pub unsafe fn _mm512_mask_cvtepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m512i) {
    let lanes = a.as_i64x8();
    vpmovqdmem(mem_addr, lanes, k);
}

/// Narrows each packed 64-bit integer in `a` to 32 bits by truncation and stores
/// the active results (those whose bit is set in writemask `k`) to unaligned
/// memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtepi64_storeu_epi32&expand=1515)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovqd))]
pub unsafe fn _mm256_mask_cvtepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
    let lanes = a.as_i64x4();
    vpmovqdmem256(mem_addr, lanes, k);
}

/// Narrows each packed 64-bit integer in `a` to 32 bits by truncation and stores
/// the active results (those whose bit is set in writemask `k`) to unaligned
/// memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtepi64_storeu_epi32&expand=1514)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovqd))]
pub unsafe fn _mm_mask_cvtepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
    let lanes = a.as_i64x2();
    vpmovqdmem128(mem_addr, lanes, k);
}

/// Narrows each packed signed 64-bit integer in `a` to 32 bits with signed
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtsepi64_storeu_epi32&expand=1869)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovsqd))]
pub unsafe fn _mm512_mask_cvtsepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m512i) {
    let lanes = a.as_i64x8();
    vpmovsqdmem(mem_addr, lanes, k);
}

/// Narrows each packed signed 64-bit integer in `a` to 32 bits with signed
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtsepi64_storeu_epi32&expand=1868)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsqd))]
pub unsafe fn _mm256_mask_cvtsepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
    let lanes = a.as_i64x4();
    vpmovsqdmem256(mem_addr, lanes, k);
}

/// Narrows each packed signed 64-bit integer in `a` to 32 bits with signed
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtsepi64_storeu_epi32&expand=1867)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovsqd))]
pub unsafe fn _mm_mask_cvtsepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
    let lanes = a.as_i64x2();
    vpmovsqdmem128(mem_addr, lanes, k);
}

/// Narrows each packed unsigned 64-bit integer in `a` to 32 bits with unsigned
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_cvtusepi64_storeu_epi32&expand=2104)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vpmovusqd))]
pub unsafe fn _mm512_mask_cvtusepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m512i) {
    let lanes = a.as_i64x8();
    vpmovusqdmem(mem_addr, lanes, k);
}

/// Narrows each packed unsigned 64-bit integer in `a` to 32 bits with unsigned
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_cvtusepi64_storeu_epi32&expand=2103)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusqd))]
pub unsafe fn _mm256_mask_cvtusepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m256i) {
    let lanes = a.as_i64x4();
    vpmovusqdmem256(mem_addr, lanes, k);
}

/// Narrows each packed unsigned 64-bit integer in `a` to 32 bits with unsigned
/// saturation and stores the active results (those whose bit is set in
/// writemask `k`) to unaligned memory at `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_cvtusepi64_storeu_epi32&expand=2102)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vpmovusqd))]
pub unsafe fn _mm_mask_cvtusepi64_storeu_epi32(mem_addr: *mut i8, k: __mmask8, a: __m128i) {
    let lanes = a.as_i64x2();
    vpmovusqdmem128(mem_addr, lanes, k);
}

/// Writes the 512 bits of `a` (16 packed 32-bit integers) to memory at `mem_addr`.
///
/// `mem_addr` does not need to be aligned on any particular boundary; the write
/// is performed as an unaligned write.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_epi32&expand=5628)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32
pub unsafe fn _mm512_storeu_epi32(mem_addr: *mut i32, a: __m512i) {
    let dst = mem_addr as *mut __m512i;
    ptr::write_unaligned(dst, a);
}

/// Store 256-bits (composed of 8 packed 32-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_epi32&expand=5626)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32
pub unsafe fn _mm256_storeu_epi32(mem_addr: *mut i32, a: __m256i) {
    ptr::write_unaligned(mem_addr as *mut __m256i, a);
}

/// Store 128-bits (composed of 4 packed 32-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_epi32&expand=5624)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32
pub unsafe fn _mm_storeu_epi32(mem_addr: *mut i32, a: __m128i) {
    ptr::write_unaligned(mem_addr as *mut __m128i, a);
}

/// Load 512-bits (composed of 8 packed 64-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_epi64&expand=3386)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64
pub unsafe fn _mm512_loadu_epi64(mem_addr: *const i64) -> __m512i {
    ptr::read_unaligned(mem_addr as *const __m512i)
}

/// Load 256-bits (composed of 4 packed 64-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_loadu_epi64&expand=3383)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64
pub unsafe fn _mm256_loadu_epi64(mem_addr: *const i64) -> __m256i {
    ptr::read_unaligned(mem_addr as *const __m256i)
}

/// Load 128-bits (composed of 2 packed 64-bit integers) from memory into dst. mem_addr does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_epi64&expand=3380)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64
pub unsafe fn _mm_loadu_epi64(mem_addr: *const i64) -> __m128i {
    ptr::read_unaligned(mem_addr as *const __m128i)
}

/// Store 512-bits (composed of 8 packed 64-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_epi64&expand=5634)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64
pub unsafe fn _mm512_storeu_epi64(mem_addr: *mut i64, a: __m512i) {
    ptr::write_unaligned(mem_addr as *mut __m512i, a);
}

/// Store 256-bits (composed of 4 packed 64-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_storeu_epi64&expand=5632)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64
pub unsafe fn _mm256_storeu_epi64(mem_addr: *mut i64, a: __m256i) {
    ptr::write_unaligned(mem_addr as *mut __m256i, a);
}

/// Store 128-bits (composed of 2 packed 64-bit integers) from a into memory. mem_addr does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_epi64&expand=5630)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu64
pub unsafe fn _mm_storeu_epi64(mem_addr: *mut i64, a: __m128i) {
    ptr::write_unaligned(mem_addr as *mut __m128i, a);
}

/// Load 512-bits of integer data from memory into dst. mem_addr does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_si512&expand=3420)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32
pub unsafe fn _mm512_loadu_si512(mem_addr: *const i32) -> __m512i {
    ptr::read_unaligned(mem_addr as *const __m512i)
}

/// Store 512-bits of integer data from a into memory. mem_addr does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_si512&expand=5657)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovups))] //should be vmovdqu32
pub unsafe fn _mm512_storeu_si512(mem_addr: *mut i32, a: __m512i) {
    ptr::write_unaligned(mem_addr as *mut __m512i, a);
}

/// Loads 512-bits (composed of 8 packed double-precision (64-bit)
/// floating-point elements) from memory into result.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_pd)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovups))]
pub unsafe fn _mm512_loadu_pd(mem_addr: *const f64) -> __m512d {
    ptr::read_unaligned(mem_addr as *const __m512d)
}

/// Stores 512-bits (composed of 8 packed double-precision (64-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_pd)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovups))]
pub unsafe fn _mm512_storeu_pd(mem_addr: *mut f64, a: __m512d) {
    ptr::write_unaligned(mem_addr as *mut __m512d, a);
}

/// Loads 512-bits (composed of 16 packed single-precision (32-bit)
/// floating-point elements) from memory into result.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_loadu_ps)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovups))]
pub unsafe fn _mm512_loadu_ps(mem_addr: *const f32) -> __m512 {
    ptr::read_unaligned(mem_addr as *const __m512)
}

/// Stores 512-bits (composed of 16 packed single-precision (32-bit)
/// floating-point elements) from `a` into memory.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_storeu_ps)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm512_storeu_ps(mem_addr: *mut f32, a: __m512) {
    ptr::write_unaligned(mem_addr as *mut __m512, a);
}

/// Load 512-bits of integer data from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_load_si512&expand=3345)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32
pub unsafe fn _mm512_load_si512(mem_addr: *const i32) -> __m512i {
    ptr::read(mem_addr as *const __m512i)
}

/// Store 512-bits of integer data from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_store_si512&expand=5598)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32
pub unsafe fn _mm512_store_si512(mem_addr: *mut i32, a: __m512i) {
    ptr::write(mem_addr as *mut __m512i, a);
}

/// Load 512-bits (composed of 16 packed 32-bit integers) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_load_epi32&expand=3304)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32
pub unsafe fn _mm512_load_epi32(mem_addr: *const i32) -> __m512i {
    ptr::read(mem_addr as *const __m512i)
}

/// Load 256-bits (composed of 8 packed 32-bit integers) from memory into dst. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_epi32&expand=3301)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32
pub unsafe fn _mm256_load_epi32(mem_addr: *const i32) -> __m256i {
    ptr::read(mem_addr as *const __m256i)
}

/// Load 128-bits (composed of 4 packed 32-bit integers) from memory into dst. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_epi32&expand=3298)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32
pub unsafe fn _mm_load_epi32(mem_addr: *const i32) -> __m128i {
    ptr::read(mem_addr as *const __m128i)
}

/// Store 512-bits (composed of 16 packed 32-bit integers) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=512_store_epi32&expand=5569)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32
pub unsafe fn _mm512_store_epi32(mem_addr: *mut i32, a: __m512i) {
    ptr::write(mem_addr as *mut __m512i, a);
}

/// Store 256-bits (composed of 8 packed 32-bit integers) from a into memory. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_epi32&expand=5567)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32
pub unsafe fn _mm256_store_epi32(mem_addr: *mut i32, a: __m256i) {
    ptr::write(mem_addr as *mut __m256i, a);
}

/// Store 128-bits (composed of 4 packed 32-bit integers) from a into memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_epi32&expand=5565)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa32
pub unsafe fn _mm_store_epi32(mem_addr: *mut i32, a: __m128i) {
    ptr::write(mem_addr as *mut __m128i, a);
}

/// Load 512-bits (composed of 8 packed 64-bit integers) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_load_epi64&expand=3313)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64
pub unsafe fn _mm512_load_epi64(mem_addr: *const i64) -> __m512i {
    ptr::read(mem_addr as *const __m512i)
}

/// Load 256-bits (composed of 4 packed 64-bit integers) from memory into dst. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_load_epi64&expand=3310)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64
pub unsafe fn _mm256_load_epi64(mem_addr: *const i64) -> __m256i {
    ptr::read(mem_addr as *const __m256i)
}

/// Load 128-bits (composed of 2 packed 64-bit integers) from memory into dst. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_epi64&expand=3307)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64
pub unsafe fn _mm_load_epi64(mem_addr: *const i64) -> __m128i {
    ptr::read(mem_addr as *const __m128i)
}

/// Store 512-bits (composed of 8 packed 64-bit integers) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_store_epi64&expand=5575)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64
pub unsafe fn _mm512_store_epi64(mem_addr: *mut i64, a: __m512i) {
    ptr::write(mem_addr as *mut __m512i, a);
}

/// Store 256-bits (composed of 4 packed 64-bit integers) from a into memory. mem_addr must be aligned on a 32-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_store_epi64&expand=5573)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64
pub unsafe fn _mm256_store_epi64(mem_addr: *mut i64, a: __m256i) {
    ptr::write(mem_addr as *mut __m256i, a);
}

/// Store 128-bits (composed of 2 packed 64-bit integers) from a into memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_epi64&expand=5571)
#[inline]
#[target_feature(enable = "avx512f,avx512vl")]
#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovdqa64
pub unsafe fn _mm_store_epi64(mem_addr: *mut i64, a: __m128i) {
    ptr::write(mem_addr as *mut __m128i, a);
}

/// Load 512-bits (composed of 16 packed single-precision (32-bit) floating-point elements) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_load_ps&expand=3336)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovaps))]
pub unsafe fn _mm512_load_ps(mem_addr: *const f32) -> __m512 {
    ptr::read(mem_addr as *const __m512)
}

/// Store 512-bits of integer data from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_store_ps&expand=5592)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovaps))]
pub unsafe fn _mm512_store_ps(mem_addr: *mut f32, a: __m512) {
    ptr::write(mem_addr as *mut __m512, a);
}

/// Load 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from memory into dst. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_load_pd&expand=3326)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovapd
pub unsafe fn _mm512_load_pd(mem_addr: *const f64) -> __m512d {
    ptr::read(mem_addr as *const __m512d)
}

/// Store 512-bits (composed of 8 packed double-precision (64-bit) floating-point elements) from a into memory. mem_addr must be aligned on a 64-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_store_pd&expand=5585)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovaps))] //should be vmovapd
pub unsafe fn _mm512_store_pd(mem_addr: *mut f64, a: __m512d) {
    ptr::write(mem_addr as *mut __m512d, a);
}

/// Loads packed 32-bit integers from memory at `mem_addr` under writemask `k`;
/// lanes whose mask bit is not set are copied from `src` instead.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_epi32)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_loadu_epi32(src: __m512i, k: __mmask16, mem_addr: *const i32) -> __m512i {
    let merged: __m512i;
    asm!(
        vpl!("vmovdqu32 {dst}{{{k}}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        // The register enters holding `src`, so it supplies the lanes the mask leaves out.
        dst = inout(zmm_reg) src => merged,
        options(pure, readonly, nostack)
    );
    merged
}

/// Loads packed 32-bit integers from memory at `mem_addr` under zeromask `k`;
/// lanes whose mask bit is not set are zeroed out.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_epi32)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_maskz_loadu_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i {
    // Pure output: the `{z}` modifier in the template zeroes the lanes the mask leaves out.
    let loaded: __m512i;
    asm!(
        vpl!("vmovdqu32 {dst}{{{k}}} {{z}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = out(zmm_reg) loaded,
        options(pure, readonly, nostack)
    );
    loaded
}

/// Loads packed 64-bit integers from memory at `mem_addr` under writemask `k`;
/// lanes whose mask bit is not set are copied from `src` instead.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_epi64)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_loadu_epi64(src: __m512i, k: __mmask8, mem_addr: *const i64) -> __m512i {
    let merged: __m512i;
    asm!(
        vpl!("vmovdqu64 {dst}{{{k}}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        // The register enters holding `src`, so it supplies the lanes the mask leaves out.
        dst = inout(zmm_reg) src => merged,
        options(pure, readonly, nostack)
    );
    merged
}

/// Loads packed 64-bit integers from memory at `mem_addr` under zeromask `k`;
/// lanes whose mask bit is not set are zeroed out.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_epi64)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i {
    // Pure output: the `{z}` modifier in the template zeroes the lanes the mask leaves out.
    let loaded: __m512i;
    asm!(
        vpl!("vmovdqu64 {dst}{{{k}}} {{z}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = out(zmm_reg) loaded,
        options(pure, readonly, nostack)
    );
    loaded
}

/// Loads packed single-precision (32-bit) floating-point elements from memory at `mem_addr`
/// under writemask `k`; lanes whose mask bit is not set are copied from `src` instead.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_ps)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_loadu_ps(src: __m512, k: __mmask16, mem_addr: *const f32) -> __m512 {
    let merged: __m512;
    asm!(
        vpl!("vmovups {dst}{{{k}}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        // The register enters holding `src`, so it supplies the lanes the mask leaves out.
        dst = inout(zmm_reg) src => merged,
        options(pure, readonly, nostack)
    );
    merged
}

/// Loads packed single-precision (32-bit) floating-point elements from memory at `mem_addr`
/// under zeromask `k`; lanes whose mask bit is not set are zeroed out.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_ps)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_maskz_loadu_ps(k: __mmask16, mem_addr: *const f32) -> __m512 {
    // Pure output: the `{z}` modifier in the template zeroes the lanes the mask leaves out.
    let loaded: __m512;
    asm!(
        vpl!("vmovups {dst}{{{k}}} {{z}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = out(zmm_reg) loaded,
        options(pure, readonly, nostack)
    );
    loaded
}

/// Loads packed double-precision (64-bit) floating-point elements from memory at `mem_addr`
/// under writemask `k`; lanes whose mask bit is not set are copied from `src` instead.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_loadu_pd)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_loadu_pd(src: __m512d, k: __mmask8, mem_addr: *const f64) -> __m512d {
    let merged: __m512d;
    asm!(
        vpl!("vmovupd {dst}{{{k}}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        // The register enters holding `src`, so it supplies the lanes the mask leaves out.
        dst = inout(zmm_reg) src => merged,
        options(pure, readonly, nostack)
    );
    merged
}

/// Loads packed double-precision (64-bit) floating-point elements from memory at `mem_addr`
/// under zeromask `k`; lanes whose mask bit is not set are zeroed out.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_loadu_pd)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m512d {
    // Pure output: the `{z}` modifier in the template zeroes the lanes the mask leaves out.
    let loaded: __m512d;
    asm!(
        vpl!("vmovupd {dst}{{{k}}} {{z}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = out(zmm_reg) loaded,
        options(pure, readonly, nostack)
    );
    loaded
}

/// Loads packed 32-bit integers from memory at `mem_addr` under writemask `k`;
/// lanes whose mask bit is not set are copied from `src` instead.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_mask_loadu_epi32(src: __m256i, k: __mmask8, mem_addr: *const i32) -> __m256i {
    let merged: __m256i;
    asm!(
        vpl!("vmovdqu32 {dst}{{{k}}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        // The register enters holding `src`, so it supplies the lanes the mask leaves out.
        dst = inout(ymm_reg) src => merged,
        options(pure, readonly, nostack)
    );
    merged
}

/// Loads packed 32-bit integers from memory at `mem_addr` under zeromask `k`;
/// lanes whose mask bit is not set are zeroed out.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i {
    // Pure output: the `{z}` modifier in the template zeroes the lanes the mask leaves out.
    let loaded: __m256i;
    asm!(
        vpl!("vmovdqu32 {dst}{{{k}}} {{z}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = out(ymm_reg) loaded,
        options(pure, readonly, nostack)
    );
    loaded
}

/// Loads packed 64-bit integers from memory at `mem_addr` under writemask `k`;
/// lanes whose mask bit is not set are copied from `src` instead.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_mask_loadu_epi64(src: __m256i, k: __mmask8, mem_addr: *const i64) -> __m256i {
    let merged: __m256i;
    asm!(
        vpl!("vmovdqu64 {dst}{{{k}}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        // The register enters holding `src`, so it supplies the lanes the mask leaves out.
        dst = inout(ymm_reg) src => merged,
        options(pure, readonly, nostack)
    );
    merged
}

/// Loads packed 64-bit integers from memory at `mem_addr` under zeromask `k`;
/// lanes whose mask bit is not set are zeroed out.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i {
    // Pure output: the `{z}` modifier in the template zeroes the lanes the mask leaves out.
    let loaded: __m256i;
    asm!(
        vpl!("vmovdqu64 {dst}{{{k}}} {{z}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = out(ymm_reg) loaded,
        options(pure, readonly, nostack)
    );
    loaded
}

/// Loads packed single-precision (32-bit) floating-point elements from memory at `mem_addr`
/// under writemask `k`; lanes whose mask bit is not set are copied from `src` instead.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_mask_loadu_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 {
    let merged: __m256;
    asm!(
        vpl!("vmovups {dst}{{{k}}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        // The register enters holding `src`, so it supplies the lanes the mask leaves out.
        dst = inout(ymm_reg) src => merged,
        options(pure, readonly, nostack)
    );
    merged
}

/// Loads packed single-precision (32-bit) floating-point elements from memory at `mem_addr`
/// under zeromask `k`; lanes whose mask bit is not set are zeroed out.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m256 {
    // Pure output: the `{z}` modifier in the template zeroes the lanes the mask leaves out.
    let loaded: __m256;
    asm!(
        vpl!("vmovups {dst}{{{k}}} {{z}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = out(ymm_reg) loaded,
        options(pure, readonly, nostack)
    );
    loaded
}

/// Loads packed double-precision (64-bit) floating-point elements from memory at `mem_addr`
/// under writemask `k`; lanes whose mask bit is not set are copied from `src` instead.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_loadu_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_mask_loadu_pd(src: __m256d, k: __mmask8, mem_addr: *const f64) -> __m256d {
    let merged: __m256d;
    asm!(
        vpl!("vmovupd {dst}{{{k}}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        // The register enters holding `src`, so it supplies the lanes the mask leaves out.
        dst = inout(ymm_reg) src => merged,
        options(pure, readonly, nostack)
    );
    merged
}

/// Loads packed double-precision (64-bit) floating-point elements from memory at `mem_addr`
/// under zeromask `k`; lanes whose mask bit is not set are zeroed out.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_loadu_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m256d {
    // Pure output: the `{z}` modifier in the template zeroes the lanes the mask leaves out.
    let loaded: __m256d;
    asm!(
        vpl!("vmovupd {dst}{{{k}}} {{z}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = out(ymm_reg) loaded,
        options(pure, readonly, nostack)
    );
    loaded
}

/// Loads packed 32-bit integers from memory at `mem_addr` under writemask `k`;
/// lanes whose mask bit is not set are copied from `src` instead.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_mask_loadu_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i {
    let merged: __m128i;
    asm!(
        vpl!("vmovdqu32 {dst}{{{k}}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        // The register enters holding `src`, so it supplies the lanes the mask leaves out.
        dst = inout(xmm_reg) src => merged,
        options(pure, readonly, nostack)
    );
    merged
}

/// Loads packed 32-bit integers from memory at `mem_addr` under zeromask `k`;
/// lanes whose mask bit is not set are zeroed out.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_maskz_loadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i {
    // Pure output: the `{z}` modifier in the template zeroes the lanes the mask leaves out.
    let loaded: __m128i;
    asm!(
        vpl!("vmovdqu32 {dst}{{{k}}} {{z}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = out(xmm_reg) loaded,
        options(pure, readonly, nostack)
    );
    loaded
}

/// Loads packed 64-bit integers from memory at `mem_addr` under writemask `k`;
/// lanes whose mask bit is not set are copied from `src` instead.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_mask_loadu_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i {
    let merged: __m128i;
    asm!(
        vpl!("vmovdqu64 {dst}{{{k}}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        // The register enters holding `src`, so it supplies the lanes the mask leaves out.
        dst = inout(xmm_reg) src => merged,
        options(pure, readonly, nostack)
    );
    merged
}

/// Loads packed 64-bit integers from memory at `mem_addr` under zeromask `k`;
/// lanes whose mask bit is not set are zeroed out.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_maskz_loadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i {
    // Pure output: the `{z}` modifier in the template zeroes the lanes the mask leaves out.
    let loaded: __m128i;
    asm!(
        vpl!("vmovdqu64 {dst}{{{k}}} {{z}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = out(xmm_reg) loaded,
        options(pure, readonly, nostack)
    );
    loaded
}

/// Loads packed single-precision (32-bit) floating-point elements from memory at `mem_addr`
/// under writemask `k`; lanes whose mask bit is not set are copied from `src` instead.
/// `mem_addr` does not need to be aligned on any particular boundary.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_mask_loadu_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 {
    let merged: __m128;
    asm!(
        vpl!("vmovups {dst}{{{k}}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        // The register enters holding `src`, so it supplies the lanes the mask leaves out.
        dst = inout(xmm_reg) src => merged,
        options(pure, readonly, nostack)
    );
    merged
}

/// Performs a zero-masked, unaligned load of packed single-precision (32-bit)
/// floating-point elements from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is zeroed. No particular alignment is required of `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_maskz_loadu_ps(k: __mmask8, mem_addr: *const f32) -> __m128 {
    // Pure output: the `{z}` form zeroes unselected elements, so no seed value is needed.
    let mut loaded: __m128;
    asm!(
        vpl!("vmovups {dst}{{{k}}} {{z}}"),
        dst = out(xmm_reg) loaded,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    loaded
}

/// Performs a write-masked, unaligned load of packed double-precision (64-bit)
/// floating-point elements from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is copied from `src`. No particular alignment is required of
/// `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_loadu_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_mask_loadu_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d {
    // Start from `src` so that elements the mask leaves alone keep its value.
    let mut merged = src;
    asm!(
        vpl!("vmovupd {dst}{{{k}}}"),
        dst = inout(xmm_reg) merged,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    merged
}

/// Performs a zero-masked, unaligned load of packed double-precision (64-bit)
/// floating-point elements from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is zeroed. No particular alignment is required of `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_loadu_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_maskz_loadu_pd(k: __mmask8, mem_addr: *const f64) -> __m128d {
    // Pure output: the `{z}` form zeroes unselected elements, so no seed value is needed.
    let mut loaded: __m128d;
    asm!(
        vpl!("vmovupd {dst}{{{k}}} {{z}}"),
        dst = out(xmm_reg) loaded,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    loaded
}

/// Performs a write-masked, aligned load of packed 32-bit integers from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is copied from `src`. `mem_addr` must be aligned on a 64-byte
/// boundary, otherwise a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_epi32)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_load_epi32(src: __m512i, k: __mmask16, mem_addr: *const i32) -> __m512i {
    // Start from `src` so that elements the mask leaves alone keep its value.
    let mut merged = src;
    asm!(
        vpl!("vmovdqa32 {dst}{{{k}}}"),
        dst = inout(zmm_reg) merged,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    merged
}

/// Performs a zero-masked, aligned load of packed 32-bit integers from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is zeroed. `mem_addr` must be aligned on a 64-byte boundary,
/// otherwise a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_epi32)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_maskz_load_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i {
    // Pure output: the `{z}` form zeroes unselected elements, so no seed value is needed.
    let mut loaded: __m512i;
    asm!(
        vpl!("vmovdqa32 {dst}{{{k}}} {{z}}"),
        dst = out(zmm_reg) loaded,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    loaded
}

/// Performs a write-masked, aligned load of packed 64-bit integers from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is copied from `src`. `mem_addr` must be aligned on a 64-byte
/// boundary, otherwise a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_epi64)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_load_epi64(src: __m512i, k: __mmask8, mem_addr: *const i64) -> __m512i {
    // Start from `src` so that elements the mask leaves alone keep its value.
    let mut merged = src;
    asm!(
        vpl!("vmovdqa64 {dst}{{{k}}}"),
        dst = inout(zmm_reg) merged,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    merged
}

/// Performs a zero-masked, aligned load of packed 64-bit integers from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is zeroed. `mem_addr` must be aligned on a 64-byte boundary,
/// otherwise a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_epi64)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i {
    // Pure output: the `{z}` form zeroes unselected elements, so no seed value is needed.
    let mut loaded: __m512i;
    asm!(
        vpl!("vmovdqa64 {dst}{{{k}}} {{z}}"),
        dst = out(zmm_reg) loaded,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    loaded
}

/// Performs a write-masked, aligned load of packed single-precision (32-bit)
/// floating-point elements from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is copied from `src`. `mem_addr` must be aligned on a 64-byte
/// boundary, otherwise a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_ps)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_load_ps(src: __m512, k: __mmask16, mem_addr: *const f32) -> __m512 {
    // Start from `src` so that elements the mask leaves alone keep its value.
    let mut merged = src;
    asm!(
        vpl!("vmovaps {dst}{{{k}}}"),
        dst = inout(zmm_reg) merged,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    merged
}

/// Performs a zero-masked, aligned load of packed single-precision (32-bit)
/// floating-point elements from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is zeroed. `mem_addr` must be aligned on a 64-byte boundary,
/// otherwise a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_ps)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_maskz_load_ps(k: __mmask16, mem_addr: *const f32) -> __m512 {
    // Pure output: the `{z}` form zeroes unselected elements, so no seed value is needed.
    let mut loaded: __m512;
    asm!(
        vpl!("vmovaps {dst}{{{k}}} {{z}}"),
        dst = out(zmm_reg) loaded,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    loaded
}

/// Performs a write-masked, aligned load of packed double-precision (64-bit)
/// floating-point elements from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is copied from `src`. `mem_addr` must be aligned on a 64-byte
/// boundary, otherwise a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_load_pd)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_load_pd(src: __m512d, k: __mmask8, mem_addr: *const f64) -> __m512d {
    // Start from `src` so that elements the mask leaves alone keep its value.
    let mut merged = src;
    asm!(
        vpl!("vmovapd {dst}{{{k}}}"),
        dst = inout(zmm_reg) merged,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    merged
}

/// Performs a zero-masked, aligned load of packed double-precision (64-bit)
/// floating-point elements from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is zeroed. `mem_addr` must be aligned on a 64-byte boundary,
/// otherwise a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_load_pd)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m512d {
    // Pure output: the `{z}` form zeroes unselected elements, so no seed value is needed.
    let mut loaded: __m512d;
    asm!(
        vpl!("vmovapd {dst}{{{k}}} {{z}}"),
        dst = out(zmm_reg) loaded,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    loaded
}

/// Performs a write-masked, aligned load of packed 32-bit integers from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is copied from `src`. `mem_addr` must be aligned on a 32-byte
/// boundary, otherwise a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_load_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_mask_load_epi32(src: __m256i, k: __mmask8, mem_addr: *const i32) -> __m256i {
    // Start from `src` so that elements the mask leaves alone keep its value.
    let mut merged = src;
    asm!(
        vpl!("vmovdqa32 {dst}{{{k}}}"),
        dst = inout(ymm_reg) merged,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    merged
}

/// Performs a zero-masked, aligned load of packed 32-bit integers from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is zeroed. `mem_addr` must be aligned on a 32-byte boundary,
/// otherwise a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_load_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i {
    // Pure output: the `{z}` form zeroes unselected elements, so no seed value is needed.
    let mut loaded: __m256i;
    asm!(
        vpl!("vmovdqa32 {dst}{{{k}}} {{z}}"),
        dst = out(ymm_reg) loaded,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    loaded
}

/// Performs a write-masked, aligned load of packed 64-bit integers from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is copied from `src`. `mem_addr` must be aligned on a 32-byte
/// boundary, otherwise a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_load_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_mask_load_epi64(src: __m256i, k: __mmask8, mem_addr: *const i64) -> __m256i {
    // Start from `src` so that elements the mask leaves alone keep its value.
    let mut merged = src;
    asm!(
        vpl!("vmovdqa64 {dst}{{{k}}}"),
        dst = inout(ymm_reg) merged,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    merged
}

/// Performs a zero-masked, aligned load of packed 64-bit integers from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is zeroed. `mem_addr` must be aligned on a 32-byte boundary,
/// otherwise a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_load_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i {
    // Pure output: the `{z}` form zeroes unselected elements, so no seed value is needed.
    let mut loaded: __m256i;
    asm!(
        vpl!("vmovdqa64 {dst}{{{k}}} {{z}}"),
        dst = out(ymm_reg) loaded,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    loaded
}

/// Performs a write-masked, aligned load of packed single-precision (32-bit)
/// floating-point elements from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is copied from `src`. `mem_addr` must be aligned on a 32-byte
/// boundary, otherwise a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_load_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_mask_load_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 {
    // Start from `src` so that elements the mask leaves alone keep its value.
    let mut merged = src;
    asm!(
        vpl!("vmovaps {dst}{{{k}}}"),
        dst = inout(ymm_reg) merged,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    merged
}

/// Performs a zero-masked, aligned load of packed single-precision (32-bit)
/// floating-point elements from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is zeroed. `mem_addr` must be aligned on a 32-byte boundary,
/// otherwise a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_load_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m256 {
    // Pure output: the `{z}` form zeroes unselected elements, so no seed value is needed.
    let mut loaded: __m256;
    asm!(
        vpl!("vmovaps {dst}{{{k}}} {{z}}"),
        dst = out(ymm_reg) loaded,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    loaded
}

/// Performs a write-masked, aligned load of packed double-precision (64-bit)
/// floating-point elements from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is copied from `src`. `mem_addr` must be aligned on a 32-byte
/// boundary, otherwise a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_load_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_mask_load_pd(src: __m256d, k: __mmask8, mem_addr: *const f64) -> __m256d {
    // Start from `src` so that elements the mask leaves alone keep its value.
    let mut merged = src;
    asm!(
        vpl!("vmovapd {dst}{{{k}}}"),
        dst = inout(ymm_reg) merged,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    merged
}

/// Performs a zero-masked, aligned load of packed double-precision (64-bit)
/// floating-point elements from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is zeroed. `mem_addr` must be aligned on a 32-byte boundary,
/// otherwise a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_load_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m256d {
    // Pure output: the `{z}` form zeroes unselected elements, so no seed value is needed.
    let mut loaded: __m256d;
    asm!(
        vpl!("vmovapd {dst}{{{k}}} {{z}}"),
        dst = out(ymm_reg) loaded,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    loaded
}

/// Performs a write-masked, aligned load of packed 32-bit integers from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is copied from `src`. `mem_addr` must be aligned on a 16-byte
/// boundary, otherwise a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_mask_load_epi32(src: __m128i, k: __mmask8, mem_addr: *const i32) -> __m128i {
    // Start from `src` so that elements the mask leaves alone keep its value.
    let mut merged = src;
    asm!(
        vpl!("vmovdqa32 {dst}{{{k}}}"),
        dst = inout(xmm_reg) merged,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    merged
}

/// Performs a zero-masked, aligned load of packed 32-bit integers from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is zeroed. `mem_addr` must be aligned on a 16-byte boundary,
/// otherwise a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_maskz_load_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i {
    // Pure output: the `{z}` form zeroes unselected elements, so no seed value is needed.
    let mut loaded: __m128i;
    asm!(
        vpl!("vmovdqa32 {dst}{{{k}}} {{z}}"),
        dst = out(xmm_reg) loaded,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    loaded
}

/// Performs a write-masked, aligned load of packed 64-bit integers from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is copied from `src`. `mem_addr` must be aligned on a 16-byte
/// boundary, otherwise a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_mask_load_epi64(src: __m128i, k: __mmask8, mem_addr: *const i64) -> __m128i {
    // Start from `src` so that elements the mask leaves alone keep its value.
    let mut merged = src;
    asm!(
        vpl!("vmovdqa64 {dst}{{{k}}}"),
        dst = inout(xmm_reg) merged,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    merged
}

/// Performs a zero-masked, aligned load of packed 64-bit integers from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is zeroed. `mem_addr` must be aligned on a 16-byte boundary,
/// otherwise a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_maskz_load_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i {
    // Pure output: the `{z}` form zeroes unselected elements, so no seed value is needed.
    let mut loaded: __m128i;
    asm!(
        vpl!("vmovdqa64 {dst}{{{k}}} {{z}}"),
        dst = out(xmm_reg) loaded,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    loaded
}

/// Performs a write-masked, aligned load of packed single-precision (32-bit)
/// floating-point elements from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is copied from `src`. `mem_addr` must be aligned on a 16-byte
/// boundary, otherwise a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_mask_load_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 {
    // Start from `src` so that elements the mask leaves alone keep its value.
    let mut merged = src;
    asm!(
        vpl!("vmovaps {dst}{{{k}}}"),
        dst = inout(xmm_reg) merged,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    merged
}

/// Performs a zero-masked, aligned load of packed single-precision (32-bit)
/// floating-point elements from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is zeroed. `mem_addr` must be aligned on a 16-byte boundary,
/// otherwise a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_maskz_load_ps(k: __mmask8, mem_addr: *const f32) -> __m128 {
    // Pure output: the `{z}` form zeroes unselected elements, so no seed value is needed.
    let mut loaded: __m128;
    asm!(
        vpl!("vmovaps {dst}{{{k}}} {{z}}"),
        dst = out(xmm_reg) loaded,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    loaded
}

/// Performs a write-masked, aligned load of packed double-precision (64-bit)
/// floating-point elements from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is copied from `src`. `mem_addr` must be aligned on a 16-byte
/// boundary, otherwise a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_load_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_mask_load_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d {
    // Start from `src` so that elements the mask leaves alone keep its value.
    let mut merged = src;
    asm!(
        vpl!("vmovapd {dst}{{{k}}}"),
        dst = inout(xmm_reg) merged,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    merged
}

/// Performs a zero-masked, aligned load of packed double-precision (64-bit)
/// floating-point elements from `mem_addr`.
///
/// Elements whose bit in `k` is set are read from memory; every other element
/// of the result is zeroed. `mem_addr` must be aligned on a 16-byte boundary,
/// otherwise a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_load_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_maskz_load_pd(k: __mmask8, mem_addr: *const f64) -> __m128d {
    // Pure output: the `{z}` form zeroes unselected elements, so no seed value is needed.
    let mut loaded: __m128d;
    asm!(
        vpl!("vmovapd {dst}{{{k}}} {{z}}"),
        dst = out(xmm_reg) loaded,
        k = in(kreg) k,
        p = in(reg) mem_addr,
        options(pure, readonly, nostack)
    );
    loaded
}

/// Performs an unaligned store of packed 32-bit integers from `a` to
/// `mem_addr`, using `mask` as the writemask.
///
/// No particular alignment is required of `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_epi32)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask16, a: __m512i) {
    // `vps!` places the `[{p}]` memory operand between the mnemonic and the mask.
    asm!(
        vps!("vmovdqu32", "{{{mask}}}, {a}"),
        a = in(zmm_reg) a,
        mask = in(kreg) mask,
        p = in(reg) mem_addr,
        options(nostack)
    );
}

/// Performs an unaligned store of packed 64-bit integers from `a` to
/// `mem_addr`, using `mask` as the writemask.
///
/// No particular alignment is required of `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_epi64)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m512i) {
    // `vps!` places the `[{p}]` memory operand between the mnemonic and the mask.
    asm!(
        vps!("vmovdqu64", "{{{mask}}}, {a}"),
        a = in(zmm_reg) a,
        mask = in(kreg) mask,
        p = in(reg) mem_addr,
        options(nostack)
    );
}

/// Performs an unaligned store of packed single-precision (32-bit)
/// floating-point elements from `a` to `mem_addr`, using `mask` as the writemask.
///
/// No particular alignment is required of `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_ps)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask16, a: __m512) {
    // `vps!` places the `[{p}]` memory operand between the mnemonic and the mask.
    asm!(
        vps!("vmovups", "{{{mask}}}, {a}"),
        a = in(zmm_reg) a,
        mask = in(kreg) mask,
        p = in(reg) mem_addr,
        options(nostack)
    );
}

/// Performs an unaligned store of packed double-precision (64-bit)
/// floating-point elements from `a` to `mem_addr`, using `mask` as the writemask.
///
/// No particular alignment is required of `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_storeu_pd)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512d) {
    // `vps!` places the `[{p}]` memory operand between the mnemonic and the mask.
    asm!(
        vps!("vmovupd", "{{{mask}}}, {a}"),
        a = in(zmm_reg) a,
        mask = in(kreg) mask,
        p = in(reg) mem_addr,
        options(nostack)
    );
}

/// Performs an unaligned store of packed 32-bit integers from `a` to
/// `mem_addr`, using `mask` as the writemask.
///
/// No particular alignment is required of `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m256i) {
    // `vps!` places the `[{p}]` memory operand between the mnemonic and the mask.
    asm!(
        vps!("vmovdqu32", "{{{mask}}}, {a}"),
        a = in(ymm_reg) a,
        mask = in(kreg) mask,
        p = in(reg) mem_addr,
        options(nostack)
    );
}

/// Performs an unaligned store of packed 64-bit integers from `a` to
/// `mem_addr`, using `mask` as the writemask.
///
/// No particular alignment is required of `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m256i) {
    // `vps!` places the `[{p}]` memory operand between the mnemonic and the mask.
    asm!(
        vps!("vmovdqu64", "{{{mask}}}, {a}"),
        a = in(ymm_reg) a,
        mask = in(kreg) mask,
        p = in(reg) mem_addr,
        options(nostack)
    );
}

/// Performs an unaligned store of packed single-precision (32-bit)
/// floating-point elements from `a` to `mem_addr`, using `mask` as the writemask.
///
/// No particular alignment is required of `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256) {
    // `vps!` places the `[{p}]` memory operand between the mnemonic and the mask.
    asm!(
        vps!("vmovups", "{{{mask}}}, {a}"),
        a = in(ymm_reg) a,
        mask = in(kreg) mask,
        p = in(reg) mem_addr,
        options(nostack)
    );
}

/// Performs an unaligned store of packed double-precision (64-bit)
/// floating-point elements from `a` to `mem_addr`, using `mask` as the writemask.
///
/// No particular alignment is required of `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_storeu_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256d) {
    // `vps!` places the `[{p}]` memory operand between the mnemonic and the mask.
    asm!(
        vps!("vmovupd", "{{{mask}}}, {a}"),
        a = in(ymm_reg) a,
        mask = in(kreg) mask,
        p = in(reg) mem_addr,
        options(nostack)
    );
}

/// Performs an unaligned store of packed 32-bit integers from `a` to
/// `mem_addr`, using `mask` as the writemask.
///
/// No particular alignment is required of `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_mask_storeu_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) {
    // `vps!` places the `[{p}]` memory operand between the mnemonic and the mask.
    asm!(
        vps!("vmovdqu32", "{{{mask}}}, {a}"),
        a = in(xmm_reg) a,
        mask = in(kreg) mask,
        p = in(reg) mem_addr,
        options(nostack)
    );
}

/// Performs an unaligned store of packed 64-bit integers from `a` to
/// `mem_addr`, using `mask` as the writemask.
///
/// No particular alignment is required of `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_mask_storeu_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) {
    // `vps!` places the `[{p}]` memory operand between the mnemonic and the mask.
    asm!(
        vps!("vmovdqu64", "{{{mask}}}, {a}"),
        a = in(xmm_reg) a,
        mask = in(kreg) mask,
        p = in(reg) mem_addr,
        options(nostack)
    );
}

/// Performs an unaligned store of packed single-precision (32-bit)
/// floating-point elements from `a` to `mem_addr`, using `mask` as the writemask.
///
/// No particular alignment is required of `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_mask_storeu_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) {
    // `vps!` places the `[{p}]` memory operand between the mnemonic and the mask.
    asm!(
        vps!("vmovups", "{{{mask}}}, {a}"),
        a = in(xmm_reg) a,
        mask = in(kreg) mask,
        p = in(reg) mem_addr,
        options(nostack)
    );
}

/// Performs an unaligned store of packed double-precision (64-bit)
/// floating-point elements from `a` to `mem_addr`, using `mask` as the writemask.
///
/// No particular alignment is required of `mem_addr`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_storeu_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_mask_storeu_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) {
    // `vps!` places the `[{p}]` memory operand between the mnemonic and the mask.
    asm!(
        vps!("vmovupd", "{{{mask}}}, {a}"),
        a = in(xmm_reg) a,
        mask = in(kreg) mask,
        p = in(reg) mem_addr,
        options(nostack)
    );
}

/// Performs an aligned store of packed 32-bit integers from `a` to `mem_addr`,
/// using `mask` as the writemask.
///
/// `mem_addr` must be aligned on a 64-byte boundary, otherwise a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_store_epi32)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_store_epi32(mem_addr: *mut i32, mask: __mmask16, a: __m512i) {
    // `vps!` places the `[{p}]` memory operand between the mnemonic and the mask.
    asm!(
        vps!("vmovdqa32", "{{{mask}}}, {a}"),
        a = in(zmm_reg) a,
        mask = in(kreg) mask,
        p = in(reg) mem_addr,
        options(nostack)
    );
}

/// Performs an aligned store of packed 64-bit integers from `a` to `mem_addr`,
/// using `mask` as the writemask.
///
/// `mem_addr` must be aligned on a 64-byte boundary, otherwise a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_store_epi64)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m512i) {
    // `vps!` places the `[{p}]` memory operand between the mnemonic and the mask.
    asm!(
        vps!("vmovdqa64", "{{{mask}}}, {a}"),
        a = in(zmm_reg) a,
        mask = in(kreg) mask,
        p = in(reg) mem_addr,
        options(nostack)
    );
}

/// Performs an aligned store of packed single-precision (32-bit)
/// floating-point elements from `a` to `mem_addr`, using `mask` as the writemask.
///
/// `mem_addr` must be aligned on a 64-byte boundary, otherwise a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_store_ps)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_store_ps(mem_addr: *mut f32, mask: __mmask16, a: __m512) {
    // `vps!` places the `[{p}]` memory operand between the mnemonic and the mask.
    asm!(
        vps!("vmovaps", "{{{mask}}}, {a}"),
        a = in(zmm_reg) a,
        mask = in(kreg) mask,
        p = in(reg) mem_addr,
        options(nostack)
    );
}

/// Performs an aligned store of packed double-precision (64-bit)
/// floating-point elements from `a` to `mem_addr`, using `mask` as the writemask.
///
/// `mem_addr` must be aligned on a 64-byte boundary, otherwise a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_store_pd)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m512d) {
    // `vps!` places the `[{p}]` memory operand between the mnemonic and the mask.
    asm!(
        vps!("vmovapd", "{{{mask}}}, {a}"),
        a = in(zmm_reg) a,
        mask = in(kreg) mask,
        p = in(reg) mem_addr,
        options(nostack)
    );
}

/// Performs an aligned store of packed 32-bit integers from `a` to `mem_addr`,
/// using `mask` as the writemask.
///
/// `mem_addr` must be aligned on a 32-byte boundary, otherwise a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_store_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m256i) {
    // `vps!` places the `[{p}]` memory operand between the mnemonic and the mask.
    asm!(
        vps!("vmovdqa32", "{{{mask}}}, {a}"),
        a = in(ymm_reg) a,
        mask = in(kreg) mask,
        p = in(reg) mem_addr,
        options(nostack)
    );
}

/// Performs an aligned store of packed 64-bit integers from `a` to `mem_addr`,
/// using `mask` as the writemask.
///
/// `mem_addr` must be aligned on a 32-byte boundary, otherwise a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_store_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m256i) {
    // `vps!` places the `[{p}]` memory operand between the mnemonic and the mask.
    asm!(
        vps!("vmovdqa64", "{{{mask}}}, {a}"),
        a = in(ymm_reg) a,
        mask = in(kreg) mask,
        p = in(reg) mem_addr,
        options(nostack)
    );
}

/// Performs an aligned store of packed single-precision (32-bit)
/// floating-point elements from `a` to `mem_addr`, using `mask` as the writemask.
///
/// `mem_addr` must be aligned on a 32-byte boundary, otherwise a
/// general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_store_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m256) {
    // `vps!` places the `[{p}]` memory operand between the mnemonic and the mask.
    asm!(
        vps!("vmovaps", "{{{mask}}}, {a}"),
        a = in(ymm_reg) a,
        mask = in(kreg) mask,
        p = in(reg) mem_addr,
        options(nostack)
    );
}

/// Store packed double-precision (64-bit) floating-point elements from `a` into memory at `mem_addr` using writemask `mask` (named `k` in Intel's documentation).
/// `mem_addr` must be aligned on a 32-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_store_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m256d) {
    // `vps!` puts the `[{p}]` memory operand right after the mnemonic, so this assembles as
    // `vmovapd [p]{mask}, a` (the tripled braces are `{{`/`}}` escapes around `{mask}`).
    // The asm writes memory, so only `nostack` is specified (no `pure`/`readonly`).
    asm!(
        vps!("vmovapd", "{{{mask}}}, {a}"),
        p = in(reg) mem_addr,
        mask = in(kreg) mask,
        a = in(ymm_reg) a,
        options(nostack)
    );
}

/// Store packed 32-bit integers from `a` into memory at `mem_addr` using writemask `mask` (named `k` in Intel's documentation).
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_mask_store_epi32(mem_addr: *mut i32, mask: __mmask8, a: __m128i) {
    // `vps!` puts the `[{p}]` memory operand right after the mnemonic, so this assembles as
    // `vmovdqa32 [p]{mask}, a` (the tripled braces are `{{`/`}}` escapes around `{mask}`).
    // The asm writes memory, so only `nostack` is specified (no `pure`/`readonly`).
    asm!(
        vps!("vmovdqa32", "{{{mask}}}, {a}"),
        p = in(reg) mem_addr,
        mask = in(kreg) mask,
        a = in(xmm_reg) a,
        options(nostack)
    );
}

/// Store packed 64-bit integers from `a` into memory at `mem_addr` using writemask `mask` (named `k` in Intel's documentation).
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_mask_store_epi64(mem_addr: *mut i64, mask: __mmask8, a: __m128i) {
    // `vps!` puts the `[{p}]` memory operand right after the mnemonic, so this assembles as
    // `vmovdqa64 [p]{mask}, a` (the tripled braces are `{{`/`}}` escapes around `{mask}`).
    // The asm writes memory, so only `nostack` is specified (no `pure`/`readonly`).
    asm!(
        vps!("vmovdqa64", "{{{mask}}}, {a}"),
        p = in(reg) mem_addr,
        mask = in(kreg) mask,
        a = in(xmm_reg) a,
        options(nostack)
    );
}

/// Store packed single-precision (32-bit) floating-point elements from `a` into memory at `mem_addr` using writemask `mask` (named `k` in Intel's documentation).
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_mask_store_ps(mem_addr: *mut f32, mask: __mmask8, a: __m128) {
    // `vps!` puts the `[{p}]` memory operand right after the mnemonic, so this assembles as
    // `vmovaps [p]{mask}, a` (the tripled braces are `{{`/`}}` escapes around `{mask}`).
    // The asm writes memory, so only `nostack` is specified (no `pure`/`readonly`).
    asm!(
        vps!("vmovaps", "{{{mask}}}, {a}"),
        p = in(reg) mem_addr,
        mask = in(kreg) mask,
        a = in(xmm_reg) a,
        options(nostack)
    );
}

/// Store packed double-precision (64-bit) floating-point elements from `a` into memory at `mem_addr` using writemask `mask` (named `k` in Intel's documentation).
/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_store_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_mask_store_pd(mem_addr: *mut f64, mask: __mmask8, a: __m128d) {
    // `vps!` puts the `[{p}]` memory operand right after the mnemonic, so this assembles as
    // `vmovapd [p]{mask}, a` (the tripled braces are `{{`/`}}` escapes around `{mask}`).
    // The asm writes memory, so only `nostack` is specified (no `pure`/`readonly`).
    asm!(
        vps!("vmovapd", "{{{mask}}}, {a}"),
        p = in(reg) mem_addr,
        mask = in(kreg) mask,
        a = in(xmm_reg) a,
        options(nostack)
    );
}

/// Load contiguous active 32-bit integers from unaligned memory at `mem_addr` (those with their respective bit set in mask `k`), and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expandloadu_epi32)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_expandloadu_epi32(
    src: __m512i,
    k: __mmask16,
    mem_addr: *const i32,
) -> __m512i {
    // `dst` starts as `src` and is an `inout` operand: the asm sees `src` in the register
    // and the register's final contents are returned.
    let mut dst: __m512i = src;
    // `vpl!` appends `, [{p}]`, so this assembles as `vpexpandd dst{k}, [p]`
    // (the tripled braces are `{{`/`}}` escapes around `{k}`).
    asm!(
        vpl!("vpexpandd {dst}{{{k}}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = inout(zmm_reg) dst,
        options(pure, readonly, nostack)
    );
    dst
}

/// Load contiguous active 32-bit integers from unaligned memory at `mem_addr` (those with their respective bit set in mask `k`), and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expandloadu_epi32)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_maskz_expandloadu_epi32(k: __mmask16, mem_addr: *const i32) -> __m512i {
    // Output-only operand: `dst` is written by the asm and has no initial value.
    let mut dst: __m512i;
    // `vpl!` appends `, [{p}]`, so this assembles as `vpexpandd dst{k} {z}, [p]`
    // (`{{z}}` is the escaped literal `{z}` zero-masking decorator).
    asm!(
        vpl!("vpexpandd {dst}{{{k}}} {{z}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = out(zmm_reg) dst,
        options(pure, readonly, nostack)
    );
    dst
}

/// Load contiguous active 32-bit integers from unaligned memory at `mem_addr` (those with their respective bit set in mask `k`), and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expandloadu_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_mask_expandloadu_epi32(
    src: __m256i,
    k: __mmask8,
    mem_addr: *const i32,
) -> __m256i {
    // `dst` starts as `src` and is an `inout` operand: the asm sees `src` in the register
    // and the register's final contents are returned.
    let mut dst: __m256i = src;
    // `vpl!` appends `, [{p}]`, so this assembles as `vpexpandd dst{k}, [p]`
    // (the tripled braces are `{{`/`}}` escapes around `{k}`).
    asm!(
        vpl!("vpexpandd {dst}{{{k}}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = inout(ymm_reg) dst,
        options(pure, readonly, nostack)
    );
    dst
}

/// Load contiguous active 32-bit integers from unaligned memory at `mem_addr` (those with their respective bit set in mask `k`), and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expandloadu_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_maskz_expandloadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m256i {
    // Output-only operand: `dst` is written by the asm and has no initial value.
    let mut dst: __m256i;
    // `vpl!` appends `, [{p}]`, so this assembles as `vpexpandd dst{k} {z}, [p]`
    // (`{{z}}` is the escaped literal `{z}` zero-masking decorator).
    asm!(
        vpl!("vpexpandd {dst}{{{k}}} {{z}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = out(ymm_reg) dst,
        options(pure, readonly, nostack)
    );
    dst
}

/// Load contiguous active 32-bit integers from unaligned memory at `mem_addr` (those with their respective bit set in mask `k`), and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expandloadu_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_mask_expandloadu_epi32(
    src: __m128i,
    k: __mmask8,
    mem_addr: *const i32,
) -> __m128i {
    // `dst` starts as `src` and is an `inout` operand: the asm sees `src` in the register
    // and the register's final contents are returned.
    let mut dst: __m128i = src;
    // `vpl!` appends `, [{p}]`, so this assembles as `vpexpandd dst{k}, [p]`
    // (the tripled braces are `{{`/`}}` escapes around `{k}`).
    asm!(
        vpl!("vpexpandd {dst}{{{k}}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = inout(xmm_reg) dst,
        options(pure, readonly, nostack)
    );
    dst
}

/// Load contiguous active 32-bit integers from unaligned memory at `mem_addr` (those with their respective bit set in mask `k`), and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expandloadu_epi32)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_maskz_expandloadu_epi32(k: __mmask8, mem_addr: *const i32) -> __m128i {
    // Output-only operand: `dst` is written by the asm and has no initial value.
    let mut dst: __m128i;
    // `vpl!` appends `, [{p}]`, so this assembles as `vpexpandd dst{k} {z}, [p]`
    // (`{{z}}` is the escaped literal `{z}` zero-masking decorator).
    asm!(
        vpl!("vpexpandd {dst}{{{k}}} {{z}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = out(xmm_reg) dst,
        options(pure, readonly, nostack)
    );
    dst
}

/// Load contiguous active 64-bit integers from unaligned memory at `mem_addr` (those with their respective bit set in mask `k`), and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expandloadu_epi64)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_expandloadu_epi64(
    src: __m512i,
    k: __mmask8,
    mem_addr: *const i64,
) -> __m512i {
    // `dst` starts as `src` and is an `inout` operand: the asm sees `src` in the register
    // and the register's final contents are returned.
    let mut dst: __m512i = src;
    // `vpl!` appends `, [{p}]`, so this assembles as `vpexpandq dst{k}, [p]`
    // (the tripled braces are `{{`/`}}` escapes around `{k}`).
    asm!(
        vpl!("vpexpandq {dst}{{{k}}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = inout(zmm_reg) dst,
        options(pure, readonly, nostack)
    );
    dst
}

/// Load contiguous active 64-bit integers from unaligned memory at `mem_addr` (those with their respective bit set in mask `k`), and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expandloadu_epi64)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_maskz_expandloadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m512i {
    // Output-only operand: `dst` is written by the asm and has no initial value.
    let mut dst: __m512i;
    // `vpl!` appends `, [{p}]`, so this assembles as `vpexpandq dst{k} {z}, [p]`
    // (`{{z}}` is the escaped literal `{z}` zero-masking decorator).
    asm!(
        vpl!("vpexpandq {dst}{{{k}}} {{z}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = out(zmm_reg) dst,
        options(pure, readonly, nostack)
    );
    dst
}

/// Load contiguous active 64-bit integers from unaligned memory at `mem_addr` (those with their respective bit set in mask `k`), and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expandloadu_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_mask_expandloadu_epi64(
    src: __m256i,
    k: __mmask8,
    mem_addr: *const i64,
) -> __m256i {
    // `dst` starts as `src` and is an `inout` operand: the asm sees `src` in the register
    // and the register's final contents are returned.
    let mut dst: __m256i = src;
    // `vpl!` appends `, [{p}]`, so this assembles as `vpexpandq dst{k}, [p]`
    // (the tripled braces are `{{`/`}}` escapes around `{k}`).
    asm!(
        vpl!("vpexpandq {dst}{{{k}}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = inout(ymm_reg) dst,
        options(pure, readonly, nostack)
    );
    dst
}

/// Load contiguous active 64-bit integers from unaligned memory at `mem_addr` (those with their respective bit set in mask `k`), and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expandloadu_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_maskz_expandloadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m256i {
    // Output-only operand: `dst` is written by the asm and has no initial value.
    let mut dst: __m256i;
    // `vpl!` appends `, [{p}]`, so this assembles as `vpexpandq dst{k} {z}, [p]`
    // (`{{z}}` is the escaped literal `{z}` zero-masking decorator).
    asm!(
        vpl!("vpexpandq {dst}{{{k}}} {{z}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = out(ymm_reg) dst,
        options(pure, readonly, nostack)
    );
    dst
}

/// Load contiguous active 64-bit integers from unaligned memory at `mem_addr` (those with their respective bit set in mask `k`), and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expandloadu_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_mask_expandloadu_epi64(
    src: __m128i,
    k: __mmask8,
    mem_addr: *const i64,
) -> __m128i {
    // `dst` starts as `src` and is an `inout` operand: the asm sees `src` in the register
    // and the register's final contents are returned.
    let mut dst: __m128i = src;
    // `vpl!` appends `, [{p}]`, so this assembles as `vpexpandq dst{k}, [p]`
    // (the tripled braces are `{{`/`}}` escapes around `{k}`).
    asm!(
        vpl!("vpexpandq {dst}{{{k}}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = inout(xmm_reg) dst,
        options(pure, readonly, nostack)
    );
    dst
}

/// Load contiguous active 64-bit integers from unaligned memory at `mem_addr` (those with their respective bit set in mask `k`), and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expandloadu_epi64)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_maskz_expandloadu_epi64(k: __mmask8, mem_addr: *const i64) -> __m128i {
    // Output-only operand: `dst` is written by the asm and has no initial value.
    let mut dst: __m128i;
    // `vpl!` appends `, [{p}]`, so this assembles as `vpexpandq dst{k} {z}, [p]`
    // (`{{z}}` is the escaped literal `{z}` zero-masking decorator).
    asm!(
        vpl!("vpexpandq {dst}{{{k}}} {{z}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = out(xmm_reg) dst,
        options(pure, readonly, nostack)
    );
    dst
}

/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at `mem_addr` (those with their respective bit set in mask `k`), and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expandloadu_ps)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_expandloadu_ps(
    src: __m512,
    k: __mmask16,
    mem_addr: *const f32,
) -> __m512 {
    // `dst` starts as `src` and is an `inout` operand: the asm sees `src` in the register
    // and the register's final contents are returned.
    let mut dst: __m512 = src;
    // `vpl!` appends `, [{p}]`, so this assembles as `vexpandps dst{k}, [p]`
    // (the tripled braces are `{{`/`}}` escapes around `{k}`).
    asm!(
        vpl!("vexpandps {dst}{{{k}}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = inout(zmm_reg) dst,
        options(pure, readonly, nostack)
    );
    dst
}

/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at `mem_addr` (those with their respective bit set in mask `k`), and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expandloadu_ps)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_maskz_expandloadu_ps(k: __mmask16, mem_addr: *const f32) -> __m512 {
    // Output-only operand: `dst` is written by the asm and has no initial value.
    let mut dst: __m512;
    // `vpl!` appends `, [{p}]`, so this assembles as `vexpandps dst{k} {z}, [p]`
    // (`{{z}}` is the escaped literal `{z}` zero-masking decorator).
    asm!(
        vpl!("vexpandps {dst}{{{k}}} {{z}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = out(zmm_reg) dst,
        options(pure, readonly, nostack)
    );
    dst
}

/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at `mem_addr` (those with their respective bit set in mask `k`), and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expandloadu_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_mask_expandloadu_ps(src: __m256, k: __mmask8, mem_addr: *const f32) -> __m256 {
    // `dst` starts as `src` and is an `inout` operand: the asm sees `src` in the register
    // and the register's final contents are returned.
    let mut dst: __m256 = src;
    // `vpl!` appends `, [{p}]`, so this assembles as `vexpandps dst{k}, [p]`
    // (the tripled braces are `{{`/`}}` escapes around `{k}`).
    asm!(
        vpl!("vexpandps {dst}{{{k}}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = inout(ymm_reg) dst,
        options(pure, readonly, nostack)
    );
    dst
}

/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at `mem_addr` (those with their respective bit set in mask `k`), and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expandloadu_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_maskz_expandloadu_ps(k: __mmask8, mem_addr: *const f32) -> __m256 {
    // Output-only operand: `dst` is written by the asm and has no initial value.
    let mut dst: __m256;
    // `vpl!` appends `, [{p}]`, so this assembles as `vexpandps dst{k} {z}, [p]`
    // (`{{z}}` is the escaped literal `{z}` zero-masking decorator).
    asm!(
        vpl!("vexpandps {dst}{{{k}}} {{z}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = out(ymm_reg) dst,
        options(pure, readonly, nostack)
    );
    dst
}

/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at `mem_addr` (those with their respective bit set in mask `k`), and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expandloadu_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_mask_expandloadu_ps(src: __m128, k: __mmask8, mem_addr: *const f32) -> __m128 {
    // `dst` starts as `src` and is an `inout` operand: the asm sees `src` in the register
    // and the register's final contents are returned.
    let mut dst: __m128 = src;
    // `vpl!` appends `, [{p}]`, so this assembles as `vexpandps dst{k}, [p]`
    // (the tripled braces are `{{`/`}}` escapes around `{k}`).
    asm!(
        vpl!("vexpandps {dst}{{{k}}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = inout(xmm_reg) dst,
        options(pure, readonly, nostack)
    );
    dst
}

/// Load contiguous active single-precision (32-bit) floating-point elements from unaligned memory at `mem_addr` (those with their respective bit set in mask `k`), and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expandloadu_ps)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_maskz_expandloadu_ps(k: __mmask8, mem_addr: *const f32) -> __m128 {
    // Output-only operand: `dst` is written by the asm and has no initial value.
    let mut dst: __m128;
    // `vpl!` appends `, [{p}]`, so this assembles as `vexpandps dst{k} {z}, [p]`
    // (`{{z}}` is the escaped literal `{z}` zero-masking decorator).
    asm!(
        vpl!("vexpandps {dst}{{{k}}} {{z}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = out(xmm_reg) dst,
        options(pure, readonly, nostack)
    );
    dst
}

/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at `mem_addr` (those with their respective bit set in mask `k`), and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_expandloadu_pd)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_mask_expandloadu_pd(
    src: __m512d,
    k: __mmask8,
    mem_addr: *const f64,
) -> __m512d {
    // `dst` starts as `src` and is an `inout` operand: the asm sees `src` in the register
    // and the register's final contents are returned.
    let mut dst: __m512d = src;
    // `vpl!` appends `, [{p}]`, so this assembles as `vexpandpd dst{k}, [p]`
    // (the tripled braces are `{{`/`}}` escapes around `{k}`).
    asm!(
        vpl!("vexpandpd {dst}{{{k}}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = inout(zmm_reg) dst,
        options(pure, readonly, nostack)
    );
    dst
}

/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at `mem_addr` (those with their respective bit set in mask `k`), and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_expandloadu_pd)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_maskz_expandloadu_pd(k: __mmask8, mem_addr: *const f64) -> __m512d {
    // Output-only operand: `dst` is written by the asm and has no initial value.
    let mut dst: __m512d;
    // `vpl!` appends `, [{p}]`, so this assembles as `vexpandpd dst{k} {z}, [p]`
    // (`{{z}}` is the escaped literal `{z}` zero-masking decorator).
    asm!(
        vpl!("vexpandpd {dst}{{{k}}} {{z}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = out(zmm_reg) dst,
        options(pure, readonly, nostack)
    );
    dst
}

/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at `mem_addr` (those with their respective bit set in mask `k`), and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_expandloadu_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_mask_expandloadu_pd(
    src: __m256d,
    k: __mmask8,
    mem_addr: *const f64,
) -> __m256d {
    // `dst` starts as `src` and is an `inout` operand: the asm sees `src` in the register
    // and the register's final contents are returned.
    let mut dst: __m256d = src;
    // `vpl!` appends `, [{p}]`, so this assembles as `vexpandpd dst{k}, [p]`
    // (the tripled braces are `{{`/`}}` escapes around `{k}`).
    asm!(
        vpl!("vexpandpd {dst}{{{k}}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = inout(ymm_reg) dst,
        options(pure, readonly, nostack)
    );
    dst
}

/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at `mem_addr` (those with their respective bit set in mask `k`), and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_expandloadu_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx")]
pub unsafe fn _mm256_maskz_expandloadu_pd(k: __mmask8, mem_addr: *const f64) -> __m256d {
    // Output-only operand: `dst` is written by the asm and has no initial value.
    let mut dst: __m256d;
    // `vpl!` appends `, [{p}]`, so this assembles as `vexpandpd dst{k} {z}, [p]`
    // (`{{z}}` is the escaped literal `{z}` zero-masking decorator).
    asm!(
        vpl!("vexpandpd {dst}{{{k}}} {{z}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = out(ymm_reg) dst,
        options(pure, readonly, nostack)
    );
    dst
}

/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at `mem_addr` (those with their respective bit set in mask `k`), and store the results in dst using writemask `k` (elements are copied from `src` when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_expandloadu_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_mask_expandloadu_pd(src: __m128d, k: __mmask8, mem_addr: *const f64) -> __m128d {
    // `dst` starts as `src` and is an `inout` operand: the asm sees `src` in the register
    // and the register's final contents are returned.
    let mut dst: __m128d = src;
    // `vpl!` appends `, [{p}]`, so this assembles as `vexpandpd dst{k}, [p]`
    // (the tripled braces are `{{`/`}}` escapes around `{k}`).
    asm!(
        vpl!("vexpandpd {dst}{{{k}}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = inout(xmm_reg) dst,
        options(pure, readonly, nostack)
    );
    dst
}

/// Load contiguous active double-precision (64-bit) floating-point elements from unaligned memory at `mem_addr` (those with their respective bit set in mask `k`), and store the results in dst using zeromask `k` (elements are zeroed out when the corresponding mask bit is not set).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_expandloadu_pd)
#[inline]
#[target_feature(enable = "avx512f,avx512vl,avx,sse")]
pub unsafe fn _mm_maskz_expandloadu_pd(k: __mmask8, mem_addr: *const f64) -> __m128d {
    // Output-only operand: `dst` is written by the asm and has no initial value.
    let mut dst: __m128d;
    // `vpl!` appends `, [{p}]`, so this assembles as `vexpandpd dst{k} {z}, [p]`
    // (`{{z}}` is the escaped literal `{z}` zero-masking decorator).
    asm!(
        vpl!("vexpandpd {dst}{{{k}}} {{z}}"),
        p = in(reg) mem_addr,
        k = in(kreg) k,
        dst = out(xmm_reg) dst,
        options(pure, readonly, nostack)
    );
    dst
}

/// Set packed double-precision (64-bit) floating-point elements in dst with the supplied values in reverse order.
///
/// The arguments are handed to `f64x8::new` in the order given (`e0` first, `e7` last).
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_setr_pd&expand=5002)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_setr_pd(
    e0: f64,
    e1: f64,
    e2: f64,
    e3: f64,
    e4: f64,
    e5: f64,
    e6: f64,
    e7: f64,
) -> __m512d {
    // Build the portable vector and reinterpret it as the `__m512d` return type.
    transmute(f64x8::new(e0, e1, e2, e3, e4, e5, e6, e7))
}

/// Set packed double-precision (64-bit) floating-point elements in dst with the supplied values.
///
/// [Intel's documentation]( https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_set_pd&expand=4924)
#[inline]
#[target_feature(enable = "avx512f")]
pub unsafe fn _mm512_set_pd(
    e0: f64,
    e1: f64,
    e2: f64,
    e3: f64,
    e4: f64,
    e5: f64,
    e6: f64,
    e7: f64,
) -> __m512d {
    _mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0)
}

/// Move the lower single-precision (32-bit) floating-point element from `b` to the lower element of dst using writemask `k` (the element is copied from `src` when mask bit 0 is not set), and copy the upper 3 packed elements from `a` to the upper elements of dst.
///
/// Only bit 0 of `k` is examined.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_move_ss&expand=3832)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovss))]
pub unsafe fn _mm_mask_move_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
    // Bit 0 of `k` selects the donor of lane 0: `b` when set, `src` otherwise.
    let low: f32 = if (k & 1) != 0 {
        simd_extract(b, 0)
    } else {
        simd_extract(src, 0)
    };
    // Every other lane is taken from `a`.
    transmute(simd_insert(a, 0, low))
}

/// Move the lower single-precision (32-bit) floating-point element from `b` to the lower element of dst using zeromask `k` (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from `a` to the upper elements of dst.
///
/// Only bit 0 of `k` is examined.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_move_ss&expand=3833)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovss))]
pub unsafe fn _mm_maskz_move_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    // Lane 0 is `b`'s lane 0 when bit 0 of `k` is set, and zero otherwise.
    let low: f32 = if (k & 1) != 0 {
        simd_extract(b, 0)
    } else {
        0.
    };
    // Every other lane is taken from `a`.
    transmute(simd_insert(a, 0, low))
}

/// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of dst using writemask `k` (the element is copied from `src` when mask bit 0 is not set), and copy the upper element from `a` to the upper element of dst.
///
/// Only bit 0 of `k` is examined.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_move_sd&expand=3829)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovsd))]
pub unsafe fn _mm_mask_move_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    // Bit 0 of `k` selects the donor of lane 0: `b` when set, `src` otherwise.
    let low: f64 = if (k & 1) != 0 {
        simd_extract(b, 0)
    } else {
        simd_extract(src, 0)
    };
    // The upper lane is taken from `a`.
    transmute(simd_insert(a, 0, low))
}

/// Move the lower double-precision (64-bit) floating-point element from `b` to the lower element of dst using zeromask `k` (the element is zeroed out when mask bit 0 is not set), and copy the upper element from `a` to the upper element of dst.
///
/// Only bit 0 of `k` is examined.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_move_sd&expand=3830)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmovsd))]
pub unsafe fn _mm_maskz_move_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    // Lane 0 is `b`'s lane 0 when bit 0 of `k` is set, and zero otherwise.
    let low: f64 = if (k & 1) != 0 {
        simd_extract(b, 0)
    } else {
        0.
    };
    // The upper lane is taken from `a`.
    transmute(simd_insert(a, 0, low))
}

/// Add the lower single-precision (32-bit) floating-point element in `a` and `b`, store the result in the lower element of dst using writemask `k` (the element is copied from `src` when mask bit 0 is not set), and copy the upper 3 packed elements from `a` to the upper elements of dst.
///
/// Only bit 0 of `k` is examined.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_ss&expand=159)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddss))]
pub unsafe fn _mm_mask_add_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
    // Lane 0 is `a[0] + b[0]` when bit 0 of `k` is set, else `src[0]`.
    let low: f32 = if (k & 1) != 0 {
        let lhs: f32 = simd_extract(a, 0);
        let rhs: f32 = simd_extract(b, 0);
        lhs + rhs
    } else {
        simd_extract(src, 0)
    };
    // Every other lane is taken from `a`.
    transmute(simd_insert(a, 0, low))
}

/// Add the lower single-precision (32-bit) floating-point element in `a` and `b`, store the result in the lower element of dst using zeromask `k` (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from `a` to the upper elements of dst.
///
/// Only bit 0 of `k` is examined.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_add_ss&expand=160)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddss))]
pub unsafe fn _mm_maskz_add_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    // Lane 0 is `a[0] + b[0]` when bit 0 of `k` is set, else zero.
    let low: f32 = if (k & 1) != 0 {
        let lhs: f32 = simd_extract(a, 0);
        let rhs: f32 = simd_extract(b, 0);
        lhs + rhs
    } else {
        0.
    };
    // Every other lane is taken from `a`.
    transmute(simd_insert(a, 0, low))
}

/// Add the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower element of dst using writemask `k` (the element is copied from `src` when mask bit 0 is not set), and copy the upper element from `a` to the upper element of dst.
///
/// Only bit 0 of `k` is examined.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_sd&expand=155)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddsd))]
pub unsafe fn _mm_mask_add_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    // Lane 0 is `a[0] + b[0]` when bit 0 of `k` is set, else `src[0]`.
    let low: f64 = if (k & 1) != 0 {
        let lhs: f64 = simd_extract(a, 0);
        let rhs: f64 = simd_extract(b, 0);
        lhs + rhs
    } else {
        simd_extract(src, 0)
    };
    // The upper lane is taken from `a`.
    transmute(simd_insert(a, 0, low))
}

/// Add the lower double-precision (64-bit) floating-point element in `a` and `b`, store the result in the lower element of dst using zeromask `k` (the element is zeroed out when mask bit 0 is not set), and copy the upper element from `a` to the upper element of dst.
///
/// Only bit 0 of `k` is examined.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_add_sd&expand=156)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddsd))]
pub unsafe fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    // Lane 0 is `a[0] + b[0]` when bit 0 of `k` is set, else zero.
    let low: f64 = if (k & 1) != 0 {
        let lhs: f64 = simd_extract(a, 0);
        let rhs: f64 = simd_extract(b, 0);
        lhs + rhs
    } else {
        0.
    };
    // The upper lane is taken from `a`.
    transmute(simd_insert(a, 0, low))
}

/// Subtract the lower single-precision (32-bit) floating-point element in `b` from the lower single-precision (32-bit) floating-point element in `a`, store the result in the lower element of dst using writemask `k` (the element is copied from `src` when mask bit 0 is not set), and copy the upper 3 packed elements from `a` to the upper elements of dst.
///
/// Only bit 0 of `k` is examined.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_ss&expand=5750)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubss))]
pub unsafe fn _mm_mask_sub_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
    // Lane 0 is `a[0] - b[0]` when bit 0 of `k` is set, else `src[0]`.
    let low: f32 = if (k & 1) != 0 {
        let minuend: f32 = simd_extract(a, 0);
        let subtrahend: f32 = simd_extract(b, 0);
        minuend - subtrahend
    } else {
        simd_extract(src, 0)
    };
    // Every other lane is taken from `a`.
    transmute(simd_insert(a, 0, low))
}

/// Subtract the lower single-precision (32-bit) floating-point element in `b` from the lower single-precision (32-bit) floating-point element in `a`, store the result in the lower element of dst using zeromask `k` (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from `a` to the upper elements of dst.
///
/// Only bit 0 of `k` is examined.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_ss&expand=5751)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubss))]
pub unsafe fn _mm_maskz_sub_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    // Lane 0 is `a[0] - b[0]` when bit 0 of `k` is set, else zero.
    let low: f32 = if (k & 1) != 0 {
        let minuend: f32 = simd_extract(a, 0);
        let subtrahend: f32 = simd_extract(b, 0);
        minuend - subtrahend
    } else {
        0.
    };
    // Every other lane is taken from `a`.
    transmute(simd_insert(a, 0, low))
}

/// Subtract the lower double-precision (64-bit) floating-point element in `b` from the lower double-precision (64-bit) floating-point element in `a`, store the result in the lower element of dst using writemask `k` (the element is copied from `src` when mask bit 0 is not set), and copy the upper element from `a` to the upper element of dst.
///
/// Only bit 0 of `k` is examined.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_sd&expand=5746)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubsd))]
pub unsafe fn _mm_mask_sub_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    // Lane 0 is `a[0] - b[0]` when bit 0 of `k` is set, else `src[0]`.
    let low: f64 = if (k & 1) != 0 {
        let minuend: f64 = simd_extract(a, 0);
        let subtrahend: f64 = simd_extract(b, 0);
        minuend - subtrahend
    } else {
        simd_extract(src, 0)
    };
    // The upper lane is taken from `a`.
    transmute(simd_insert(a, 0, low))
}

/// Zero-masked scalar subtraction on the lowest double-precision (64-bit) lane: when bit 0 of zeromask `k` is set the lowest element of the result is `a[0] - b[0]`, otherwise it is zero. The upper element of the result is copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_sd&expand=5747)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubsd))]
pub unsafe fn _mm_maskz_sub_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    // Only the least-significant mask bit is consulted.
    let lower: f64 = if (k & 0b1) != 0 {
        let lhs: f64 = simd_extract(a, 0);
        let rhs: f64 = simd_extract(b, 0);
        lhs - rhs
    } else {
        0.
    };
    // Lane 1 of `a` passes through untouched.
    transmute(simd_insert(a, 0, lower))
}

/// Masked scalar multiplication on the lowest single-precision (32-bit) lane: when bit 0 of writemask `k` is set the lowest element of the result is `a[0] * b[0]`, otherwise it is the lowest element of `src`. The upper 3 packed elements of the result are copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_ss&expand=3950)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulss))]
pub unsafe fn _mm_mask_mul_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let fallback: f32 = simd_extract(src, 0);
    // Only the least-significant mask bit is consulted.
    let lower = if (k & 0b1) != 0 {
        let lhs: f32 = simd_extract(a, 0);
        let rhs: f32 = simd_extract(b, 0);
        lhs * rhs
    } else {
        fallback
    };
    // Lanes 1..=3 of `a` pass through untouched.
    transmute(simd_insert(a, 0, lower))
}

/// Zero-masked scalar multiplication on the lowest single-precision (32-bit) lane: when bit 0 of zeromask `k` is set the lowest element of the result is `a[0] * b[0]`, otherwise it is zero. The upper 3 packed elements of the result are copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_ss&expand=3951)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulss))]
pub unsafe fn _mm_maskz_mul_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    // Only the least-significant mask bit is consulted.
    let lower: f32 = if (k & 0b1) != 0 {
        let lhs: f32 = simd_extract(a, 0);
        let rhs: f32 = simd_extract(b, 0);
        lhs * rhs
    } else {
        0.
    };
    // Lanes 1..=3 of `a` pass through untouched.
    transmute(simd_insert(a, 0, lower))
}

/// Masked scalar multiplication on the lowest double-precision (64-bit) lane: when bit 0 of writemask `k` is set the lowest element of the result is `a[0] * b[0]`, otherwise it is the lowest element of `src`. The upper element of the result is copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_sd&expand=3947)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulsd))]
pub unsafe fn _mm_mask_mul_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let fallback: f64 = simd_extract(src, 0);
    // Only the least-significant mask bit is consulted.
    let lower = if (k & 0b1) != 0 {
        let lhs: f64 = simd_extract(a, 0);
        let rhs: f64 = simd_extract(b, 0);
        lhs * rhs
    } else {
        fallback
    };
    // Lane 1 of `a` passes through untouched.
    transmute(simd_insert(a, 0, lower))
}

/// Zero-masked scalar multiplication on the lowest double-precision (64-bit) lane: when bit 0 of zeromask `k` is set the lowest element of the result is `a[0] * b[0]`, otherwise it is zero. The upper element of the result is copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_sd&expand=3948)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulsd))]
pub unsafe fn _mm_maskz_mul_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    // Only the least-significant mask bit is consulted.
    let lower: f64 = if (k & 0b1) != 0 {
        let lhs: f64 = simd_extract(a, 0);
        let rhs: f64 = simd_extract(b, 0);
        lhs * rhs
    } else {
        0.
    };
    // Lane 1 of `a` passes through untouched.
    transmute(simd_insert(a, 0, lower))
}

/// Masked scalar division on the lowest single-precision (32-bit) lane: when bit 0 of writemask `k` is set the lowest element of the result is `a[0] / b[0]`, otherwise it is the lowest element of `src`. The upper 3 packed elements of the result are copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_ss&expand=2181)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivss))]
pub unsafe fn _mm_mask_div_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let fallback: f32 = simd_extract(src, 0);
    // Only the least-significant mask bit is consulted.
    let lower = if (k & 0b1) != 0 {
        let dividend: f32 = simd_extract(a, 0);
        let divisor: f32 = simd_extract(b, 0);
        dividend / divisor
    } else {
        fallback
    };
    // Lanes 1..=3 of `a` pass through untouched.
    transmute(simd_insert(a, 0, lower))
}

/// Zero-masked scalar division on the lowest single-precision (32-bit) lane: when bit 0 of zeromask `k` is set the lowest element of the result is `a[0] / b[0]`, otherwise it is zero. The upper 3 packed elements of the result are copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_ss&expand=2182)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivss))]
pub unsafe fn _mm_maskz_div_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    // Only the least-significant mask bit is consulted.
    let lower: f32 = if (k & 0b1) != 0 {
        let dividend: f32 = simd_extract(a, 0);
        let divisor: f32 = simd_extract(b, 0);
        dividend / divisor
    } else {
        0.
    };
    // Lanes 1..=3 of `a` pass through untouched.
    transmute(simd_insert(a, 0, lower))
}

/// Masked scalar division on the lowest double-precision (64-bit) lane: when bit 0 of writemask `k` is set the lowest element of the result is `a[0] / b[0]`, otherwise it is the lowest element of `src`. The upper element of the result is copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_sd&expand=2178)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivsd))]
pub unsafe fn _mm_mask_div_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let fallback: f64 = simd_extract(src, 0);
    // Only the least-significant mask bit is consulted.
    let lower = if (k & 0b1) != 0 {
        let dividend: f64 = simd_extract(a, 0);
        let divisor: f64 = simd_extract(b, 0);
        dividend / divisor
    } else {
        fallback
    };
    // Lane 1 of `a` passes through untouched.
    transmute(simd_insert(a, 0, lower))
}

/// Zero-masked scalar division on the lowest double-precision (64-bit) lane: when bit 0 of zeromask `k` is set the lowest element of the result is `a[0] / b[0]`, otherwise it is zero. The upper element of the result is copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_sd&expand=2179)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivsd))]
pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    // Only the least-significant mask bit is consulted.
    let lower: f64 = if (k & 0b1) != 0 {
        let dividend: f64 = simd_extract(a, 0);
        let divisor: f64 = simd_extract(b, 0);
        dividend / divisor
    } else {
        0.
    };
    // Lane 1 of `a` passes through untouched.
    transmute(simd_insert(a, 0, lower))
}

/// Masked scalar maximum on the lowest single-precision (32-bit) lane: when bit 0 of writemask `k` is set the lowest element of the result is the maximum of `a[0]` and `b[0]`, otherwise it is the lowest element of `src`. The upper 3 packed elements of the result are copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_ss&expand=3672)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxss))]
pub unsafe fn _mm_mask_max_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let (lhs, rhs, fallback) = (a.as_f32x4(), b.as_f32x4(), src.as_f32x4());
    let r = vmaxss(lhs, rhs, fallback, k, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Zero-masked scalar maximum on the lowest single-precision (32-bit) lane: when bit 0 of zeromask `k` is set the lowest element of the result is the maximum of `a[0]` and `b[0]`, otherwise it is zero. The upper 3 packed elements of the result are copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_ss&expand=3673)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxss))]
pub unsafe fn _mm_maskz_max_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    // An all-zero vector is the pass-through source, giving zeroing semantics.
    let zero = _mm_setzero_ps().as_f32x4();
    let r = vmaxss(a.as_f32x4(), b.as_f32x4(), zero, k, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Masked scalar maximum on the lowest double-precision (64-bit) lane: when bit 0 of writemask `k` is set the lowest element of the result is the maximum of `a[0]` and `b[0]`, otherwise it is the lowest element of `src`. The upper element of the result is copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_sd&expand=3669)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxsd))]
pub unsafe fn _mm_mask_max_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let (lhs, rhs, fallback) = (a.as_f64x2(), b.as_f64x2(), src.as_f64x2());
    let r = vmaxsd(lhs, rhs, fallback, k, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Zero-masked scalar maximum on the lowest double-precision (64-bit) lane: when bit 0 of zeromask `k` is set the lowest element of the result is the maximum of `a[0]` and `b[0]`, otherwise it is zero. The upper element of the result is copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_sd&expand=3670)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxsd))]
pub unsafe fn _mm_maskz_max_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    // An all-zero vector is the pass-through source, giving zeroing semantics.
    let zero = _mm_setzero_pd().as_f64x2();
    let r = vmaxsd(a.as_f64x2(), b.as_f64x2(), zero, k, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Masked scalar minimum on the lowest single-precision (32-bit) lane: when bit 0 of writemask `k` is set the lowest element of the result is the minimum of `a[0]` and `b[0]`, otherwise it is the lowest element of `src`. The upper 3 packed elements of the result are copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_ss&expand=3786)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vminss))]
pub unsafe fn _mm_mask_min_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let (lhs, rhs, fallback) = (a.as_f32x4(), b.as_f32x4(), src.as_f32x4());
    let r = vminss(lhs, rhs, fallback, k, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Zero-masked scalar minimum on the lowest single-precision (32-bit) lane: when bit 0 of zeromask `k` is set the lowest element of the result is the minimum of `a[0]` and `b[0]`, otherwise it is zero. The upper 3 packed elements of the result are copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_ss&expand=3787)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vminss))]
pub unsafe fn _mm_maskz_min_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    // An all-zero vector is the pass-through source, giving zeroing semantics.
    let zero = _mm_setzero_ps().as_f32x4();
    let r = vminss(a.as_f32x4(), b.as_f32x4(), zero, k, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Masked scalar minimum on the lowest double-precision (64-bit) lane: when bit 0 of writemask `k` is set the lowest element of the result is the minimum of `a[0]` and `b[0]`, otherwise it is the lowest element of `src`. The upper element of the result is copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_sd&expand=3783)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vminsd))]
pub unsafe fn _mm_mask_min_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let (lhs, rhs, fallback) = (a.as_f64x2(), b.as_f64x2(), src.as_f64x2());
    let r = vminsd(lhs, rhs, fallback, k, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Zero-masked scalar minimum on the lowest double-precision (64-bit) lane: when bit 0 of zeromask `k` is set the lowest element of the result is the minimum of `a[0]` and `b[0]`, otherwise it is zero. The upper element of the result is copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_sd&expand=3784)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vminsd))]
pub unsafe fn _mm_maskz_min_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    // An all-zero vector is the pass-through source, giving zeroing semantics.
    let zero = _mm_setzero_pd().as_f64x2();
    let r = vminsd(a.as_f64x2(), b.as_f64x2(), zero, k, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_ss&expand=5387)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsqrtss))]
pub unsafe fn _mm_mask_sqrt_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
    transmute(vsqrtss(
        a.as_f32x4(),
        b.as_f32x4(),
        src.as_f32x4(),
        k,
        _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
    ))
}

/// Compute the square root of the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_ss&expand=5388)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsqrtss))]
pub unsafe fn _mm_maskz_sqrt_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    transmute(vsqrtss(
        a.as_f32x4(),
        b.as_f32x4(),
        _mm_setzero_ps().as_f32x4(),
        k,
        _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
    ))
}

/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_sd&expand=5384)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsqrtsd))]
pub unsafe fn _mm_mask_sqrt_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    transmute(vsqrtsd(
        a.as_f64x2(),
        b.as_f64x2(),
        src.as_f64x2(),
        k,
        _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
    ))
}

/// Compute the square root of the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_sd&expand=5385)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsqrtsd))]
pub unsafe fn _mm_maskz_sqrt_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    transmute(vsqrtsd(
        a.as_f64x2(),
        b.as_f64x2(),
        _mm_setzero_pd().as_f64x2(),
        k,
        _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
    ))
}

/// Approximate reciprocal square root of the lowest single-precision (32-bit) element of `b`, written to the lowest element of the result; the upper 3 packed elements of the result are copied from `a`. The maximum relative error for this approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rsqrt14_ss&expand=4825)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrsqrt14ss))]
pub unsafe fn _mm_rsqrt14_ss(a: __m128, b: __m128) -> __m128 {
    // Unmasked form: mask bit 0 is hard-wired to 1 and a zero vector fills the
    // pass-through operand.
    let zero = _mm_setzero_ps().as_f32x4();
    let r = vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), zero, 0b1);
    transmute(r)
}

/// Masked approximate reciprocal square root of the lowest single-precision (32-bit) element of `b`: when bit 0 of writemask `k` is set the approximation is written to the lowest element of the result, otherwise the lowest element of `src` is used. The upper 3 packed elements of the result are copied from `a`. The maximum relative error for this approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rsqrt14_ss&expand=4823)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrsqrt14ss))]
pub unsafe fn _mm_mask_rsqrt14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let upper = a.as_f32x4();
    let input = b.as_f32x4();
    let fallback = src.as_f32x4();
    transmute(vrsqrt14ss(upper, input, fallback, k))
}

/// Zero-masked approximate reciprocal square root of the lowest single-precision (32-bit) element of `b`: when bit 0 of zeromask `k` is set the approximation is written to the lowest element of the result, otherwise that element is zero. The upper 3 packed elements of the result are copied from `a`. The maximum relative error for this approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rsqrt14_ss&expand=4824)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrsqrt14ss))]
pub unsafe fn _mm_maskz_rsqrt14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    // An all-zero vector is the pass-through source, giving zeroing semantics.
    let zero = _mm_setzero_ps().as_f32x4();
    let r = vrsqrt14ss(a.as_f32x4(), b.as_f32x4(), zero, k);
    transmute(r)
}

/// Approximate reciprocal square root of the lowest double-precision (64-bit) element of `b`, written to the lowest element of the result; the upper element of the result is copied from `a`. The maximum relative error for this approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rsqrt14_sd&expand=4822)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrsqrt14sd))]
pub unsafe fn _mm_rsqrt14_sd(a: __m128d, b: __m128d) -> __m128d {
    // Unmasked form: mask bit 0 is hard-wired to 1 and a zero vector fills the
    // pass-through operand.
    let zero = _mm_setzero_pd().as_f64x2();
    let r = vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), zero, 0b1);
    transmute(r)
}

/// Masked approximate reciprocal square root of the lowest double-precision (64-bit) element of `b`: when bit 0 of writemask `k` is set the approximation is written to the lowest element of the result, otherwise the lowest element of `src` is used. The upper element of the result is copied from `a`. The maximum relative error for this approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rsqrt14_sd&expand=4820)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrsqrt14sd))]
pub unsafe fn _mm_mask_rsqrt14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let upper = a.as_f64x2();
    let input = b.as_f64x2();
    let fallback = src.as_f64x2();
    transmute(vrsqrt14sd(upper, input, fallback, k))
}

/// Zero-masked approximate reciprocal square root of the lowest double-precision (64-bit) element of `b`: when bit 0 of zeromask `k` is set the approximation is written to the lowest element of the result, otherwise that element is zero. The upper element of the result is copied from `a`. The maximum relative error for this approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rsqrt14_sd&expand=4821)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrsqrt14sd))]
pub unsafe fn _mm_maskz_rsqrt14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    // An all-zero vector is the pass-through source, giving zeroing semantics.
    let zero = _mm_setzero_pd().as_f64x2();
    let r = vrsqrt14sd(a.as_f64x2(), b.as_f64x2(), zero, k);
    transmute(r)
}

/// Approximate reciprocal of the lowest single-precision (32-bit) element of `b`, written to the lowest element of the result; the upper 3 packed elements of the result are copied from `a`. The maximum relative error for this approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rcp14_ss&expand=4508)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrcp14ss))]
pub unsafe fn _mm_rcp14_ss(a: __m128, b: __m128) -> __m128 {
    // Unmasked form: mask bit 0 is hard-wired to 1 and a zero vector fills the
    // pass-through operand.
    let zero = _mm_setzero_ps().as_f32x4();
    let r = vrcp14ss(a.as_f32x4(), b.as_f32x4(), zero, 0b1);
    transmute(r)
}

/// Masked approximate reciprocal of the lowest single-precision (32-bit) element of `b`: when bit 0 of writemask `k` is set the approximation is written to the lowest element of the result, otherwise the lowest element of `src` is used. The upper 3 packed elements of the result are copied from `a`. The maximum relative error for this approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rcp14_ss&expand=4506)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrcp14ss))]
pub unsafe fn _mm_mask_rcp14_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let upper = a.as_f32x4();
    let input = b.as_f32x4();
    let fallback = src.as_f32x4();
    transmute(vrcp14ss(upper, input, fallback, k))
}

/// Zero-masked approximate reciprocal of the lowest single-precision (32-bit) element of `b`: when bit 0 of zeromask `k` is set the approximation is written to the lowest element of the result, otherwise that element is zero. The upper 3 packed elements of the result are copied from `a`. The maximum relative error for this approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rcp14_ss&expand=4507)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrcp14ss))]
pub unsafe fn _mm_maskz_rcp14_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    // An all-zero vector is the pass-through source, giving zeroing semantics.
    let zero = _mm_setzero_ps().as_f32x4();
    let r = vrcp14ss(a.as_f32x4(), b.as_f32x4(), zero, k);
    transmute(r)
}

/// Approximate reciprocal of the lowest double-precision (64-bit) element of `b`, written to the lowest element of the result; the upper element of the result is copied from `a`. The maximum relative error for this approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_rcp14_sd&expand=4505)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrcp14sd))]
pub unsafe fn _mm_rcp14_sd(a: __m128d, b: __m128d) -> __m128d {
    // Unmasked form: mask bit 0 is hard-wired to 1 and a zero vector fills the
    // pass-through operand.
    let zero = _mm_setzero_pd().as_f64x2();
    let r = vrcp14sd(a.as_f64x2(), b.as_f64x2(), zero, 0b1);
    transmute(r)
}

/// Masked approximate reciprocal of the lowest double-precision (64-bit) element of `b`: when bit 0 of writemask `k` is set the approximation is written to the lowest element of the result, otherwise the lowest element of `src` is used. The upper element of the result is copied from `a`. The maximum relative error for this approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_rcp14_sd&expand=4503)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrcp14sd))]
pub unsafe fn _mm_mask_rcp14_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let upper = a.as_f64x2();
    let input = b.as_f64x2();
    let fallback = src.as_f64x2();
    transmute(vrcp14sd(upper, input, fallback, k))
}

/// Zero-masked approximate reciprocal of the lowest double-precision (64-bit) element of `b`: when bit 0 of zeromask `k` is set the approximation is written to the lowest element of the result, otherwise that element is zero. The upper element of the result is copied from `a`. The maximum relative error for this approximation is less than 2^-14.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_rcp14_sd&expand=4504)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrcp14sd))]
pub unsafe fn _mm_maskz_rcp14_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    // An all-zero vector is the pass-through source, giving zeroing semantics.
    let zero = _mm_setzero_pd().as_f64x2();
    let r = vrcp14sd(a.as_f64x2(), b.as_f64x2(), zero, k);
    transmute(r)
}

/// Extract the exponent of the lowest single-precision (32-bit) element of `b` as a single-precision floating-point number representing the integer exponent, and store it in the lowest element of the result; the upper 3 packed elements of the result are copied from `a`. This intrinsic essentially calculates floor(log2(x)) for the lower element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_ss&expand=2862)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetexpss))]
pub unsafe fn _mm_getexp_ss(a: __m128, b: __m128) -> __m128 {
    // Unmasked form: mask bit 0 is hard-wired to 1 and a zero vector fills the
    // pass-through operand.
    let zero = _mm_setzero_ps().as_f32x4();
    // NOTE(review): `_MM_FROUND_NO_EXC` is passed although this is not a
    // `_round` variant; confirm that suppressing exceptions is intended.
    let r = vgetexpss(a.as_f32x4(), b.as_f32x4(), zero, 0b1, _MM_FROUND_NO_EXC);
    transmute(r)
}

/// Masked exponent extraction for the lowest single-precision (32-bit) element of `b`: when bit 0 of writemask `k` is set, the exponent (as a single-precision floating-point number representing the integer exponent) is stored in the lowest element of the result, otherwise the lowest element of `src` is used. The upper 3 packed elements of the result are copied from `a`. This intrinsic essentially calculates floor(log2(x)) for the lower element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_ss&expand=2863)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetexpss))]
pub unsafe fn _mm_mask_getexp_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let (upper, input, fallback) = (a.as_f32x4(), b.as_f32x4(), src.as_f32x4());
    // NOTE(review): `_MM_FROUND_NO_EXC` is passed although this is not a
    // `_round` variant; confirm that suppressing exceptions is intended.
    let r = vgetexpss(upper, input, fallback, k, _MM_FROUND_NO_EXC);
    transmute(r)
}

/// Zero-masked exponent extraction for the lowest single-precision (32-bit) element of `b`: when bit 0 of zeromask `k` is set, the exponent (as a single-precision floating-point number representing the integer exponent) is stored in the lowest element of the result, otherwise that element is zero. The upper 3 packed elements of the result are copied from `a`. This intrinsic essentially calculates floor(log2(x)) for the lower element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_ss&expand=2864)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetexpss))]
pub unsafe fn _mm_maskz_getexp_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    // An all-zero vector is the pass-through source, giving zeroing semantics.
    let zero = _mm_setzero_ps().as_f32x4();
    // NOTE(review): `_MM_FROUND_NO_EXC` is passed although this is not a
    // `_round` variant; confirm that suppressing exceptions is intended.
    let r = vgetexpss(a.as_f32x4(), b.as_f32x4(), zero, k, _MM_FROUND_NO_EXC);
    transmute(r)
}

/// Extract the exponent of the lowest double-precision (64-bit) element of `b` as a double-precision floating-point number representing the integer exponent, and store it in the lowest element of the result; the upper element of the result is copied from `a`. This intrinsic essentially calculates floor(log2(x)) for the lower element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_sd&expand=2859)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetexpsd))]
pub unsafe fn _mm_getexp_sd(a: __m128d, b: __m128d) -> __m128d {
    // Unmasked form: mask bit 0 is hard-wired to 1 and a zero vector fills the
    // pass-through operand.
    let zero = _mm_setzero_pd().as_f64x2();
    // NOTE(review): `_MM_FROUND_NO_EXC` is passed although this is not a
    // `_round` variant; confirm that suppressing exceptions is intended.
    let r = vgetexpsd(a.as_f64x2(), b.as_f64x2(), zero, 0b1, _MM_FROUND_NO_EXC);
    transmute(r)
}

/// Masked exponent extraction for the lowest double-precision (64-bit) element of `b`: when bit 0 of writemask `k` is set, the exponent (as a double-precision floating-point number representing the integer exponent) is stored in the lowest element of the result, otherwise the lowest element of `src` is used. The upper element of the result is copied from `a`. This intrinsic essentially calculates floor(log2(x)) for the lower element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_sd&expand=2860)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetexpsd))]
pub unsafe fn _mm_mask_getexp_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let (upper, input, fallback) = (a.as_f64x2(), b.as_f64x2(), src.as_f64x2());
    // NOTE(review): `_MM_FROUND_NO_EXC` is passed although this is not a
    // `_round` variant; confirm that suppressing exceptions is intended.
    let r = vgetexpsd(upper, input, fallback, k, _MM_FROUND_NO_EXC);
    transmute(r)
}

/// Zero-masked exponent extraction for the lowest double-precision (64-bit) element of `b`: when bit 0 of zeromask `k` is set, the exponent (as a double-precision floating-point number representing the integer exponent) is stored in the lowest element of the result, otherwise that element is zero. The upper element of the result is copied from `a`. This intrinsic essentially calculates floor(log2(x)) for the lower element.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_sd&expand=2861)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetexpsd))]
pub unsafe fn _mm_maskz_getexp_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    // An all-zero vector is the pass-through source, giving zeroing semantics.
    let zero = _mm_setzero_pd().as_f64x2();
    // NOTE(review): `_MM_FROUND_NO_EXC` is passed although this is not a
    // `_round` variant; confirm that suppressing exceptions is intended.
    let r = vgetexpsd(a.as_f64x2(), b.as_f64x2(), zero, k, _MM_FROUND_NO_EXC);
    transmute(r)
}

/// Normalize the mantissa of the lowest single-precision (32-bit) element of `b` and store it in the lowest element of the result; the upper 3 packed elements of the result are copied from `a`. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval selected by `NORM` and the sign depends on `SIGN` and the source sign.\
/// `NORM` (called `interv` in Intel's documentation) selects the normalization interval:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)\
/// `SIGN` (called `sc` in Intel's documentation) selects how the sign is determined:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1\
/// This variant takes no `sae` parameter; `_MM_FROUND_CUR_DIRECTION` is passed to the underlying operation.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_ss&expand=2898)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(2, 3)]
pub unsafe fn _mm_getmant_ss<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    // Both selectors travel in one immediate: SIGN shifted left by two, OR-ed
    // with NORM.
    let imm8 = SIGN << 2 | NORM;
    // Unmasked form: mask bit 0 is hard-wired to 1 and a zero vector fills the
    // pass-through operand.
    transmute(vgetmantss(
        a.as_f32x4(),
        b.as_f32x4(),
        imm8,
        _mm_setzero_ps().as_f32x4(),
        0b1,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Normalizes the mantissa of the lower single-precision (32-bit) floating-point element of `b`,
/// stores the result in the lower element of the returned vector using writemask `k` (the
/// element is copied from `src` when mask bit 0 is not set), and copies the upper 3 packed
/// elements from `a`. This essentially calculates `±(2^e)*|x.significand|`, where the exponent
/// `e` depends on the interval selected by `NORM` and the sign depends on `SIGN` and the sign of
/// the source.
///
/// `NORM` selects the interval the mantissa is normalized to:
///
/// * `_MM_MANT_NORM_1_2`: interval `[1, 2)`
/// * `_MM_MANT_NORM_p5_2`: interval `[0.5, 2)`
/// * `_MM_MANT_NORM_p5_1`: interval `[0.5, 1)`
/// * `_MM_MANT_NORM_p75_1p5`: interval `[0.75, 1.5)`
///
/// `SIGN` selects how the sign of the result is determined:
///
/// * `_MM_MANT_SIGN_src`: `sign = sign(src)`
/// * `_MM_MANT_SIGN_zero`: `sign = 0`
/// * `_MM_MANT_SIGN_nan`: `dst = NaN if sign(src) = 1`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_ss&expand=2899)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(4, 5)]
pub unsafe fn _mm_mask_getmant_ss<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    src: __m128,
    k: __mmask8,
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    // The selectors are packed as `SIGN` in bits [3:2] and `NORM` in the low bits.
    transmute(vgetmantss(
        a.as_f32x4(),
        b.as_f32x4(),
        (SIGN << 2) | NORM,
        src.as_f32x4(),
        k,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Normalizes the mantissa of the lower single-precision (32-bit) floating-point element of `b`,
/// stores the result in the lower element of the returned vector using zeromask `k` (the
/// element is zeroed out when mask bit 0 is not set), and copies the upper 3 packed elements
/// from `a`. This essentially calculates `±(2^e)*|x.significand|`, where the exponent `e`
/// depends on the interval selected by `NORM` and the sign depends on `SIGN` and the sign of the
/// source.
///
/// `NORM` selects the interval the mantissa is normalized to:
///
/// * `_MM_MANT_NORM_1_2`: interval `[1, 2)`
/// * `_MM_MANT_NORM_p5_2`: interval `[0.5, 2)`
/// * `_MM_MANT_NORM_p5_1`: interval `[0.5, 1)`
/// * `_MM_MANT_NORM_p75_1p5`: interval `[0.75, 1.5)`
///
/// `SIGN` selects how the sign of the result is determined:
///
/// * `_MM_MANT_SIGN_src`: `sign = sign(src)`
/// * `_MM_MANT_SIGN_zero`: `sign = 0`
/// * `_MM_MANT_SIGN_nan`: `dst = NaN if sign(src) = 1`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_ss&expand=2900)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(3, 4)]
pub unsafe fn _mm_maskz_getmant_ss<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    k: __mmask8,
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    // The selectors are packed as `SIGN` in bits [3:2] and `NORM` in the low bits; the all-zero
    // source vector turns the masked-off case into zeroing.
    transmute(vgetmantss(
        a.as_f32x4(),
        b.as_f32x4(),
        (SIGN << 2) | NORM,
        _mm_setzero_ps().as_f32x4(),
        k,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Normalizes the mantissa of the lower double-precision (64-bit) floating-point element of `b`,
/// stores the result in the lower element of the returned vector, and copies the upper element
/// from `a`. This essentially calculates `±(2^e)*|x.significand|`, where the exponent `e`
/// depends on the interval selected by `NORM` and the sign depends on `SIGN` and the sign of the
/// source.
///
/// `NORM` selects the interval the mantissa is normalized to:
///
/// * `_MM_MANT_NORM_1_2`: interval `[1, 2)`
/// * `_MM_MANT_NORM_p5_2`: interval `[0.5, 2)`
/// * `_MM_MANT_NORM_p5_1`: interval `[0.5, 1)`
/// * `_MM_MANT_NORM_p75_1p5`: interval `[0.75, 1.5)`
///
/// `SIGN` selects how the sign of the result is determined:
///
/// * `_MM_MANT_SIGN_src`: `sign = sign(src)`
/// * `_MM_MANT_SIGN_zero`: `sign = 0`
/// * `_MM_MANT_SIGN_nan`: `dst = NaN if sign(src) = 1`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_sd&expand=2895)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(2, 3)]
pub unsafe fn _mm_getmant_sd<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    // The selectors are packed as `SIGN` in bits [3:2] and `NORM` in the low bits; mask `0b1`
    // always enables the lower element.
    transmute(vgetmantsd(
        a.as_f64x2(),
        b.as_f64x2(),
        (SIGN << 2) | NORM,
        _mm_setzero_pd().as_f64x2(),
        0b1,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Normalizes the mantissa of the lower double-precision (64-bit) floating-point element of `b`,
/// stores the result in the lower element of the returned vector using writemask `k` (the
/// element is copied from `src` when mask bit 0 is not set), and copies the upper element from
/// `a`. This essentially calculates `±(2^e)*|x.significand|`, where the exponent `e` depends on
/// the interval selected by `NORM` and the sign depends on `SIGN` and the sign of the source.
///
/// `NORM` selects the interval the mantissa is normalized to:
///
/// * `_MM_MANT_NORM_1_2`: interval `[1, 2)`
/// * `_MM_MANT_NORM_p5_2`: interval `[0.5, 2)`
/// * `_MM_MANT_NORM_p5_1`: interval `[0.5, 1)`
/// * `_MM_MANT_NORM_p75_1p5`: interval `[0.75, 1.5)`
///
/// `SIGN` selects how the sign of the result is determined:
///
/// * `_MM_MANT_SIGN_src`: `sign = sign(src)`
/// * `_MM_MANT_SIGN_zero`: `sign = 0`
/// * `_MM_MANT_SIGN_nan`: `dst = NaN if sign(src) = 1`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_sd&expand=2896)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(4, 5)]
pub unsafe fn _mm_mask_getmant_sd<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    src: __m128d,
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    // The selectors are packed as `SIGN` in bits [3:2] and `NORM` in the low bits.
    transmute(vgetmantsd(
        a.as_f64x2(),
        b.as_f64x2(),
        (SIGN << 2) | NORM,
        src.as_f64x2(),
        k,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Normalizes the mantissa of the lower double-precision (64-bit) floating-point element of `b`,
/// stores the result in the lower element of the returned vector using zeromask `k` (the
/// element is zeroed out when mask bit 0 is not set), and copies the upper element from `a`.
/// This essentially calculates `±(2^e)*|x.significand|`, where the exponent `e` depends on the
/// interval selected by `NORM` and the sign depends on `SIGN` and the sign of the source.
///
/// `NORM` selects the interval the mantissa is normalized to:
///
/// * `_MM_MANT_NORM_1_2`: interval `[1, 2)`
/// * `_MM_MANT_NORM_p5_2`: interval `[0.5, 2)`
/// * `_MM_MANT_NORM_p5_1`: interval `[0.5, 1)`
/// * `_MM_MANT_NORM_p75_1p5`: interval `[0.75, 1.5)`
///
/// `SIGN` selects how the sign of the result is determined:
///
/// * `_MM_MANT_SIGN_src`: `sign = sign(src)`
/// * `_MM_MANT_SIGN_zero`: `sign = 0`
/// * `_MM_MANT_SIGN_nan`: `dst = NaN if sign(src) = 1`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_sd&expand=2897)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0))]
#[rustc_legacy_const_generics(3, 4)]
pub unsafe fn _mm_maskz_getmant_sd<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
>(
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    // The selectors are packed as `SIGN` in bits [3:2] and `NORM` in the low bits; the all-zero
    // source vector turns the masked-off case into zeroing.
    transmute(vgetmantsd(
        a.as_f64x2(),
        b.as_f64x2(),
        (SIGN << 2) | NORM,
        _mm_setzero_pd().as_f64x2(),
        k,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Rounds the lower single-precision (32-bit) floating-point element of `b` to the number of
/// fraction bits specified by `IMM8`, stores the result in the lower element of the returned
/// vector, and copies the upper 3 packed elements from `a`.
///
/// Rounding is done according to `IMM8[2:0]`, which can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT`: round to nearest
/// * `_MM_FROUND_TO_NEG_INF`: round down
/// * `_MM_FROUND_TO_POS_INF`: round up
/// * `_MM_FROUND_TO_ZERO`: truncate
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_ss&expand=4802)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 255))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_roundscale_ss<const IMM8: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_imm8!(IMM8);
    // Unmasked form: every mask bit is set.
    transmute(vrndscaless(
        a.as_f32x4(),
        b.as_f32x4(),
        _mm_setzero_ps().as_f32x4(),
        0b11111111,
        IMM8,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Rounds the lower single-precision (32-bit) floating-point element of `b` to the number of
/// fraction bits specified by `IMM8`, stores the result in the lower element of the returned
/// vector using writemask `k` (the element is copied from `src` when mask bit 0 is not set),
/// and copies the upper 3 packed elements from `a`.
///
/// Rounding is done according to `IMM8[2:0]`, which can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT`: round to nearest
/// * `_MM_FROUND_TO_NEG_INF`: round down
/// * `_MM_FROUND_TO_POS_INF`: round up
/// * `_MM_FROUND_TO_ZERO`: truncate
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_ss&expand=4800)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_roundscale_ss<const IMM8: i32>(
    src: __m128,
    k: __mmask8,
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_imm8!(IMM8);
    transmute(vrndscaless(
        a.as_f32x4(),
        b.as_f32x4(),
        src.as_f32x4(),
        k,
        IMM8,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Rounds the lower single-precision (32-bit) floating-point element of `b` to the number of
/// fraction bits specified by `IMM8`, stores the result in the lower element of the returned
/// vector using zeromask `k` (the element is zeroed out when mask bit 0 is not set), and copies
/// the upper 3 packed elements from `a`.
///
/// Rounding is done according to `IMM8[2:0]`, which can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT`: round to nearest
/// * `_MM_FROUND_TO_NEG_INF`: round down
/// * `_MM_FROUND_TO_POS_INF`: round up
/// * `_MM_FROUND_TO_ZERO`: truncate
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_ss&expand=4801)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_roundscale_ss<const IMM8: i32>(
    k: __mmask8,
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_imm8!(IMM8);
    // The all-zero source vector turns the masked-off case into zeroing.
    transmute(vrndscaless(
        a.as_f32x4(),
        b.as_f32x4(),
        _mm_setzero_ps().as_f32x4(),
        k,
        IMM8,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Rounds the lower double-precision (64-bit) floating-point element of `b` to the number of
/// fraction bits specified by `IMM8`, stores the result in the lower element of the returned
/// vector, and copies the upper element from `a`.
///
/// Rounding is done according to `IMM8[2:0]`, which can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT`: round to nearest
/// * `_MM_FROUND_TO_NEG_INF`: round down
/// * `_MM_FROUND_TO_POS_INF`: round up
/// * `_MM_FROUND_TO_ZERO`: truncate
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_sd&expand=4799)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 255))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_roundscale_sd<const IMM8: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_imm8!(IMM8);
    // Unmasked form: every mask bit is set.
    transmute(vrndscalesd(
        a.as_f64x2(),
        b.as_f64x2(),
        _mm_setzero_pd().as_f64x2(),
        0b11111111,
        IMM8,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Rounds the lower double-precision (64-bit) floating-point element of `b` to the number of
/// fraction bits specified by `IMM8`, stores the result in the lower element of the returned
/// vector using writemask `k` (the element is copied from `src` when mask bit 0 is not set),
/// and copies the upper element from `a`.
///
/// Rounding is done according to `IMM8[2:0]`, which can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT`: round to nearest
/// * `_MM_FROUND_TO_NEG_INF`: round down
/// * `_MM_FROUND_TO_POS_INF`: round up
/// * `_MM_FROUND_TO_ZERO`: truncate
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_sd&expand=4797)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_roundscale_sd<const IMM8: i32>(
    src: __m128d,
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_imm8!(IMM8);
    transmute(vrndscalesd(
        a.as_f64x2(),
        b.as_f64x2(),
        src.as_f64x2(),
        k,
        IMM8,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Rounds the lower double-precision (64-bit) floating-point element of `b` to the number of
/// fraction bits specified by `IMM8`, stores the result in the lower element of the returned
/// vector using zeromask `k` (the element is zeroed out when mask bit 0 is not set), and copies
/// the upper element from `a`.
///
/// Rounding is done according to `IMM8[2:0]`, which can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT`: round to nearest
/// * `_MM_FROUND_TO_NEG_INF`: round down
/// * `_MM_FROUND_TO_POS_INF`: round up
/// * `_MM_FROUND_TO_ZERO`: truncate
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_sd&expand=4798)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_roundscale_sd<const IMM8: i32>(
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_imm8!(IMM8);
    // The all-zero source vector turns the masked-off case into zeroing.
    transmute(vrndscalesd(
        a.as_f64x2(),
        b.as_f64x2(),
        _mm_setzero_pd().as_f64x2(),
        k,
        IMM8,
        _MM_FROUND_CUR_DIRECTION,
    ))
}

/// Scales the single-precision (32-bit) floating-point element in `a` using the value from `b`,
/// stores the result in the lower element of the returned vector, and copies the upper 3 packed
/// elements from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_ss&expand=4901)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscalefss))]
pub unsafe fn _mm_scalef_ss(a: __m128, b: __m128) -> __m128 {
    // Unmasked form: every mask bit is set.
    let r = vscalefss(
        a.as_f32x4(),
        b.as_f32x4(),
        _mm_setzero_ps().as_f32x4(),
        0b11111111,
        _MM_FROUND_CUR_DIRECTION,
    );
    transmute(r)
}

/// Scales the single-precision (32-bit) floating-point element in `a` using the value from `b`,
/// stores the result in the lower element of the returned vector using writemask `k` (the
/// element is copied from `src` when mask bit 0 is not set), and copies the upper 3 packed
/// elements from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_ss&expand=4899)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscalefss))]
pub unsafe fn _mm_mask_scalef_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let r = vscalefss(
        a.as_f32x4(),
        b.as_f32x4(),
        src.as_f32x4(),
        k,
        _MM_FROUND_CUR_DIRECTION,
    );
    transmute(r)
}

/// Scales the single-precision (32-bit) floating-point element in `a` using the value from `b`,
/// stores the result in the lower element of the returned vector using zeromask `k` (the
/// element is zeroed out when mask bit 0 is not set), and copies the upper 3 packed elements
/// from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_ss&expand=4900)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscalefss))]
pub unsafe fn _mm_maskz_scalef_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    let a = a.as_f32x4();
    let b = b.as_f32x4();
    // The all-zero source vector turns the masked-off case into zeroing.
    let zero = _mm_setzero_ps().as_f32x4();
    let r = vscalefss(a, b, zero, k, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Scales the double-precision (64-bit) floating-point element in `a` using the value from `b`,
/// stores the result in the lower element of the returned vector, and copies the upper element
/// from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_sd&expand=4898)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscalefsd))]
pub unsafe fn _mm_scalef_sd(a: __m128d, b: __m128d) -> __m128d {
    let a = a.as_f64x2();
    let b = b.as_f64x2();
    let zero = _mm_setzero_pd().as_f64x2();
    // Unmasked form: every mask bit is set.
    let r = vscalefsd(a, b, zero, 0b11111111, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Scales the double-precision (64-bit) floating-point element in `a` using the value from `b`,
/// stores the result in the lower element of the returned vector using writemask `k` (the
/// element is copied from `src` when mask bit 0 is not set), and copies the upper element from
/// `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_sd&expand=4896)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscalefsd))]
pub unsafe fn _mm_mask_scalef_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let a = a.as_f64x2();
    let b = b.as_f64x2();
    let src = src.as_f64x2();
    let r = vscalefsd(a, b, src, k, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Scales the double-precision (64-bit) floating-point element in `a` using the value from `b`,
/// stores the result in the lower element of the returned vector using zeromask `k` (the
/// element is zeroed out when mask bit 0 is not set), and copies the upper element from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_sd&expand=4897)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscalefsd))]
pub unsafe fn _mm_maskz_scalef_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d {
    let a = a.as_f64x2();
    let b = b.as_f64x2();
    // The all-zero source vector turns the masked-off case into zeroing.
    let zero = _mm_setzero_pd().as_f64x2();
    let r = vscalefsd(a, b, zero, k, _MM_FROUND_CUR_DIRECTION);
    transmute(r)
}

/// Multiplies the lower single-precision (32-bit) floating-point elements of `a` and `b` and
/// adds the intermediate result to the lower element of `c`. The result is stored in the lower
/// element of the returned vector using writemask `k` (the element is copied from `a` when mask
/// bit 0 is not set); the upper 3 packed elements are copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_ss&expand=2582)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd213ss))]
pub unsafe fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
    let lower_a: f32 = simd_extract(a, 0);
    let lower: f32 = if (k & 0b00000001) == 0 {
        // Mask bit 0 clear: the lower element of `a` passes through unchanged.
        lower_a
    } else {
        let lower_b: f32 = simd_extract(b, 0);
        let lower_c: f32 = simd_extract(c, 0);
        vfmadd132ss(lower_a, lower_b, lower_c, _MM_FROUND_CUR_DIRECTION)
    };
    transmute(simd_insert(a, 0, lower))
}

/// Multiplies the lower single-precision (32-bit) floating-point elements of `a` and `b` and
/// adds the intermediate result to the lower element of `c`. The result is stored in the lower
/// element of the returned vector using zeromask `k` (the element is zeroed out when mask bit 0
/// is not set); the upper 3 packed elements are copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_ss&expand=2584)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd213ss))]
pub unsafe fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
    let lower: f32 = if (k & 0b00000001) == 0 {
        // Mask bit 0 clear: the lower element is zeroed.
        0.
    } else {
        let lower_a: f32 = simd_extract(a, 0);
        let lower_b: f32 = simd_extract(b, 0);
        let lower_c: f32 = simd_extract(c, 0);
        vfmadd132ss(lower_a, lower_b, lower_c, _MM_FROUND_CUR_DIRECTION)
    };
    transmute(simd_insert(a, 0, lower))
}

/// Multiplies the lower single-precision (32-bit) floating-point elements of `a` and `b` and
/// adds the intermediate result to the lower element of `c`. The result is stored in the lower
/// element of the returned vector using writemask `k` (the element is copied from `c` when mask
/// bit 0 is not set); the upper 3 packed elements are copied from `c`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_ss&expand=2583)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd213ss))]
pub unsafe fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
    let lower_c: f32 = simd_extract(c, 0);
    let lower: f32 = if (k & 0b00000001) == 0 {
        // Mask bit 0 clear: the lower element of `c` passes through unchanged.
        lower_c
    } else {
        let lower_a: f32 = simd_extract(a, 0);
        let lower_b: f32 = simd_extract(b, 0);
        vfmadd132ss(lower_a, lower_b, lower_c, _MM_FROUND_CUR_DIRECTION)
    };
    transmute(simd_insert(c, 0, lower))
}

/// Multiplies the lower double-precision (64-bit) floating-point elements of `a` and `b` and
/// adds the intermediate result to the lower element of `c`. The result is stored in the lower
/// element of the returned vector using writemask `k` (the element is copied from `a` when mask
/// bit 0 is not set); the upper element is copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_sd&expand=2578)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd213sd))]
pub unsafe fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
    let lower_a: f64 = simd_extract(a, 0);
    let lower: f64 = if (k & 0b00000001) == 0 {
        // Mask bit 0 clear: the lower element of `a` passes through unchanged.
        lower_a
    } else {
        let lower_b: f64 = simd_extract(b, 0);
        let lower_c: f64 = simd_extract(c, 0);
        vfmadd132sd(lower_a, lower_b, lower_c, _MM_FROUND_CUR_DIRECTION)
    };
    transmute(simd_insert(a, 0, lower))
}

/// Multiplies the lower double-precision (64-bit) floating-point elements of `a` and `b` and
/// adds the intermediate result to the lower element of `c`. The result is stored in the lower
/// element of the returned vector using zeromask `k` (the element is zeroed out when mask bit 0
/// is not set); the upper element is copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_sd&expand=2580)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd213sd))]
pub unsafe fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    let lower: f64 = if (k & 0b00000001) == 0 {
        // Mask bit 0 clear: the lower element is zeroed.
        0.
    } else {
        let lower_a: f64 = simd_extract(a, 0);
        let lower_b: f64 = simd_extract(b, 0);
        let lower_c: f64 = simd_extract(c, 0);
        vfmadd132sd(lower_a, lower_b, lower_c, _MM_FROUND_CUR_DIRECTION)
    };
    transmute(simd_insert(a, 0, lower))
}

/// Multiplies the lower double-precision (64-bit) floating-point elements of `a` and `b` and
/// adds the intermediate result to the lower element of `c`. The result is stored in the lower
/// element of the returned vector using writemask `k` (the element is copied from `c` when mask
/// bit 0 is not set); the upper element is copied from `c`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_sd&expand=2579)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd213sd))]
pub unsafe fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
    let lower_c: f64 = simd_extract(c, 0);
    let lower: f64 = if (k & 0b00000001) == 0 {
        // Mask bit 0 clear: the lower element of `c` passes through unchanged.
        lower_c
    } else {
        let lower_a: f64 = simd_extract(a, 0);
        let lower_b: f64 = simd_extract(b, 0);
        vfmadd132sd(lower_a, lower_b, lower_c, _MM_FROUND_CUR_DIRECTION)
    };
    transmute(simd_insert(c, 0, lower))
}

/// Multiplies the lower single-precision (32-bit) floating-point elements of `a` and `b` and
/// subtracts the lower element of `c` from the intermediate result. The result is stored in the
/// lower element of the returned vector using writemask `k` (the element is copied from `a`
/// when mask bit 0 is not set); the upper 3 packed elements are copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmsub_ss&expand=2668)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmsub213ss))]
pub unsafe fn _mm_mask_fmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
    let lower_a: f32 = simd_extract(a, 0);
    let lower: f32 = if (k & 0b00000001) == 0 {
        // Mask bit 0 clear: the lower element of `a` passes through unchanged.
        lower_a
    } else {
        let lower_b: f32 = simd_extract(b, 0);
        let lower_c: f32 = simd_extract(c, 0);
        // The subtraction is expressed as a fused multiply-add with `c` negated.
        vfmadd132ss(lower_a, lower_b, -lower_c, _MM_FROUND_CUR_DIRECTION)
    };
    transmute(simd_insert(a, 0, lower))
}

/// Multiplies the lower single-precision (32-bit) floating-point elements of `a` and `b` and
/// subtracts the lower element of `c` from the intermediate result. The result is stored in the
/// lower element of the returned vector using zeromask `k` (the element is zeroed out when mask
/// bit 0 is not set); the upper 3 packed elements are copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmsub_ss&expand=2670)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmsub213ss))]
pub unsafe fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
    let lower: f32 = if (k & 0b00000001) == 0 {
        // Mask bit 0 clear: the lower element is zeroed.
        0.
    } else {
        let lower_a: f32 = simd_extract(a, 0);
        let lower_b: f32 = simd_extract(b, 0);
        let lower_c: f32 = simd_extract(c, 0);
        // The subtraction is expressed as a fused multiply-add with `c` negated.
        vfmadd132ss(lower_a, lower_b, -lower_c, _MM_FROUND_CUR_DIRECTION)
    };
    transmute(simd_insert(a, 0, lower))
}

/// Multiplies the lower single-precision (32-bit) floating-point elements of `a` and `b` and
/// subtracts the lower element of `c` from the intermediate result. The result is stored in the
/// lower element of the returned vector using writemask `k` (the element is copied from `c`
/// when mask bit 0 is not set); the upper 3 packed elements are copied from `c`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmsub_ss&expand=2669)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmsub213ss))]
pub unsafe fn _mm_mask3_fmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
    let lower_c: f32 = simd_extract(c, 0);
    let lower: f32 = if (k & 0b00000001) == 0 {
        // Mask bit 0 clear: the lower element of `c` passes through unchanged.
        lower_c
    } else {
        let lower_a: f32 = simd_extract(a, 0);
        let lower_b: f32 = simd_extract(b, 0);
        // The subtraction is expressed as a fused multiply-add with `c` negated.
        vfmadd132ss(lower_a, lower_b, -lower_c, _MM_FROUND_CUR_DIRECTION)
    };
    transmute(simd_insert(c, 0, lower))
}

/// Multiplies the lower double-precision (64-bit) floating-point elements of `a` and `b` and
/// subtracts the lower element of `c` from the intermediate result. The result is stored in the
/// lower element of the returned vector using writemask `k` (the element is copied from `a`
/// when mask bit 0 is not set); the upper element is copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmsub_sd&expand=2664)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmsub213sd))]
pub unsafe fn _mm_mask_fmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
    let lower_a: f64 = simd_extract(a, 0);
    let lower: f64 = if (k & 0b00000001) == 0 {
        // Mask bit 0 clear: the lower element of `a` passes through unchanged.
        lower_a
    } else {
        let lower_b: f64 = simd_extract(b, 0);
        let lower_c: f64 = simd_extract(c, 0);
        // The subtraction is expressed as a fused multiply-add with `c` negated.
        vfmadd132sd(lower_a, lower_b, -lower_c, _MM_FROUND_CUR_DIRECTION)
    };
    transmute(simd_insert(a, 0, lower))
}

/// Multiplies the lower double-precision (64-bit) floating-point elements of `a` and `b` and
/// subtracts the lower element of `c` from the intermediate result. The result is stored in the
/// lower element of the returned vector using zeromask `k` (the element is zeroed out when mask
/// bit 0 is not set); the upper element is copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmsub_sd&expand=2666)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmsub213sd))]
pub unsafe fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    let lower: f64 = if (k & 0b00000001) == 0 {
        // Mask bit 0 clear: the lower element is zeroed.
        0.
    } else {
        let lower_a: f64 = simd_extract(a, 0);
        let lower_b: f64 = simd_extract(b, 0);
        let lower_c: f64 = simd_extract(c, 0);
        // The subtraction is expressed as a fused multiply-add with `c` negated.
        vfmadd132sd(lower_a, lower_b, -lower_c, _MM_FROUND_CUR_DIRECTION)
    };
    transmute(simd_insert(a, 0, lower))
}

/// Computes `a * b - c` on the lower double-precision (64-bit) floating-point elements of a, b and c. The lower element of dst receives that result when bit 0 of writemask k is set and is copied from c otherwise; the upper element of dst is copied from c.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmsub_sd&expand=2665)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmsub213sd))]
pub unsafe fn _mm_mask3_fmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
    let c0: f64 = simd_extract(c, 0);
    // Only bit 0 of the mask is inspected; when it is clear, lane 0 of `c` passes through.
    let lane0 = if (k & 1) != 0 {
        let a0: f64 = simd_extract(a, 0);
        let b0: f64 = simd_extract(b, 0);
        // `a * b - c` is issued as a multiply-add with the addend negated.
        vfmadd132sd(a0, b0, -c0, _MM_FROUND_CUR_DIRECTION)
    } else {
        c0
    };
    transmute(simd_insert(c, 0, lane0))
}

/// Computes `-(a * b) + c` on the lower single-precision (32-bit) floating-point elements of a, b and c. The lower element of dst receives that result when bit 0 of writemask k is set and is copied from a otherwise; the upper 3 packed elements of dst are copied from a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmadd_ss&expand=2748)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmadd213ss))]
pub unsafe fn _mm_mask_fnmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
    let a0: f32 = simd_extract(a, 0);
    // Only bit 0 of the mask is inspected; when it is clear, lane 0 of `a` passes through.
    let lane0 = if (k & 1) != 0 {
        let b0: f32 = simd_extract(b, 0);
        let c0: f32 = simd_extract(c, 0);
        // Negating one factor turns the multiply-add into `-(a * b) + c`.
        vfmadd132ss(-a0, b0, c0, _MM_FROUND_CUR_DIRECTION)
    } else {
        a0
    };
    transmute(simd_insert(a, 0, lane0))
}

/// Computes `-(a * b) + c` on the lower single-precision (32-bit) floating-point elements of a, b and c. The lower element of dst receives that result when bit 0 of zeromask k is set and is zeroed out otherwise; the upper 3 packed elements of dst are copied from a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmadd_ss&expand=2750)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmadd213ss))]
pub unsafe fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
    // Only bit 0 of the mask is inspected; when it is clear, the lower element becomes zero.
    let lane0: f32 = if (k & 1) != 0 {
        let a0: f32 = simd_extract(a, 0);
        let b0: f32 = simd_extract(b, 0);
        let c0: f32 = simd_extract(c, 0);
        // Negating one factor turns the multiply-add into `-(a * b) + c`.
        vfmadd132ss(-a0, b0, c0, _MM_FROUND_CUR_DIRECTION)
    } else {
        0.
    };
    transmute(simd_insert(a, 0, lane0))
}

/// Computes `-(a * b) + c` on the lower single-precision (32-bit) floating-point elements of a, b and c. The lower element of dst receives that result when bit 0 of writemask k is set and is copied from c otherwise; the upper 3 packed elements of dst are copied from c.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmadd_ss&expand=2749)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmadd213ss))]
pub unsafe fn _mm_mask3_fnmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
    let c0: f32 = simd_extract(c, 0);
    // Only bit 0 of the mask is inspected; when it is clear, lane 0 of `c` passes through.
    let lane0 = if (k & 1) != 0 {
        let a0: f32 = simd_extract(a, 0);
        let b0: f32 = simd_extract(b, 0);
        // Negating one factor turns the multiply-add into `-(a * b) + c`.
        vfmadd132ss(-a0, b0, c0, _MM_FROUND_CUR_DIRECTION)
    } else {
        c0
    };
    transmute(simd_insert(c, 0, lane0))
}

/// Computes `-(a * b) + c` on the lower double-precision (64-bit) floating-point elements of a, b and c. The lower element of dst receives that result when bit 0 of writemask k is set and is copied from a otherwise; the upper element of dst is copied from a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmadd_sd&expand=2744)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmadd213sd))]
pub unsafe fn _mm_mask_fnmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
    let a0: f64 = simd_extract(a, 0);
    // Only bit 0 of the mask is inspected; when it is clear, lane 0 of `a` passes through.
    let lane0 = if (k & 1) != 0 {
        let b0: f64 = simd_extract(b, 0);
        let c0: f64 = simd_extract(c, 0);
        // Negating one factor turns the multiply-add into `-(a * b) + c`.
        vfmadd132sd(-a0, b0, c0, _MM_FROUND_CUR_DIRECTION)
    } else {
        a0
    };
    transmute(simd_insert(a, 0, lane0))
}

/// Computes `-(a * b) + c` on the lower double-precision (64-bit) floating-point elements of a, b and c. The lower element of dst receives that result when bit 0 of zeromask k is set and is zeroed out otherwise; the upper element of dst is copied from a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmadd_sd&expand=2746)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmadd213sd))]
pub unsafe fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    // Only bit 0 of the mask is inspected; when it is clear, the lower element becomes zero.
    let lane0: f64 = if (k & 1) != 0 {
        let a0: f64 = simd_extract(a, 0);
        let b0: f64 = simd_extract(b, 0);
        let c0: f64 = simd_extract(c, 0);
        // Negating one factor turns the multiply-add into `-(a * b) + c`.
        vfmadd132sd(-a0, b0, c0, _MM_FROUND_CUR_DIRECTION)
    } else {
        0.
    };
    transmute(simd_insert(a, 0, lane0))
}

/// Computes `-(a * b) + c` on the lower double-precision (64-bit) floating-point elements of a, b and c. The lower element of dst receives that result when bit 0 of writemask k is set and is copied from c otherwise; the upper element of dst is copied from c.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmadd_sd&expand=2745)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmadd213sd))]
pub unsafe fn _mm_mask3_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
    let c0: f64 = simd_extract(c, 0);
    // Only bit 0 of the mask is inspected; when it is clear, lane 0 of `c` passes through.
    let lane0 = if (k & 1) != 0 {
        let a0: f64 = simd_extract(a, 0);
        let b0: f64 = simd_extract(b, 0);
        // Negating one factor turns the multiply-add into `-(a * b) + c`.
        vfmadd132sd(-a0, b0, c0, _MM_FROUND_CUR_DIRECTION)
    } else {
        c0
    };
    transmute(simd_insert(c, 0, lane0))
}

/// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmsub_ss&expand=2796)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmsub213ss))]
pub unsafe fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 {
    let a0: f32 = simd_extract(a, 0);
    // Only bit 0 of the mask is inspected. When it is clear the lower element of `a`
    // (not `c`) is what ends up in the result.
    let lane0 = if (k & 1) != 0 {
        let b0: f32 = simd_extract(b, 0);
        let c0: f32 = simd_extract(c, 0);
        // Negating one factor and the addend turns the multiply-add into `-(a * b) - c`.
        vfmadd132ss(-a0, b0, -c0, _MM_FROUND_CUR_DIRECTION)
    } else {
        a0
    };
    // The upper lanes of the result come from `a`.
    transmute(simd_insert(a, 0, lane0))
}

/// Computes `-(a * b) - c` on the lower single-precision (32-bit) floating-point elements of a, b and c. The lower element of dst receives that result when bit 0 of zeromask k is set and is zeroed out otherwise; the upper 3 packed elements of dst are copied from a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmsub_ss&expand=2798)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmsub213ss))]
pub unsafe fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 {
    // Only bit 0 of the mask is inspected; when it is clear, the lower element becomes zero.
    let lane0: f32 = if (k & 1) != 0 {
        let a0: f32 = simd_extract(a, 0);
        let b0: f32 = simd_extract(b, 0);
        let c0: f32 = simd_extract(c, 0);
        // Negating one factor and the addend turns the multiply-add into `-(a * b) - c`.
        vfmadd132ss(-a0, b0, -c0, _MM_FROUND_CUR_DIRECTION)
    } else {
        0.
    };
    transmute(simd_insert(a, 0, lane0))
}

/// Computes `-(a * b) - c` on the lower single-precision (32-bit) floating-point elements of a, b and c. The lower element of dst receives that result when bit 0 of writemask k is set and is copied from c otherwise; the upper 3 packed elements of dst are copied from c.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmsub_ss&expand=2797)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmsub213ss))]
pub unsafe fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 {
    let c0: f32 = simd_extract(c, 0);
    // Only bit 0 of the mask is inspected; when it is clear, lane 0 of `c` passes through.
    let lane0 = if (k & 1) != 0 {
        let a0: f32 = simd_extract(a, 0);
        let b0: f32 = simd_extract(b, 0);
        // Negating one factor and the addend turns the multiply-add into `-(a * b) - c`.
        vfmadd132ss(-a0, b0, -c0, _MM_FROUND_CUR_DIRECTION)
    } else {
        c0
    };
    transmute(simd_insert(c, 0, lane0))
}

/// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmsub_sd&expand=2792)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmsub213sd))]
pub unsafe fn _mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d {
    let a0: f64 = simd_extract(a, 0);
    // Only bit 0 of the mask is inspected. When it is clear the lower element of `a`
    // (not `c`) is what ends up in the result.
    let lane0 = if (k & 1) != 0 {
        let b0: f64 = simd_extract(b, 0);
        let c0: f64 = simd_extract(c, 0);
        // Negating one factor and the addend turns the multiply-add into `-(a * b) - c`.
        vfmadd132sd(-a0, b0, -c0, _MM_FROUND_CUR_DIRECTION)
    } else {
        a0
    };
    // The upper lane of the result comes from `a`.
    transmute(simd_insert(a, 0, lane0))
}

/// Computes `-(a * b) - c` on the lower double-precision (64-bit) floating-point elements of a, b and c. The lower element of dst receives that result when bit 0 of zeromask k is set and is zeroed out otherwise; the upper element of dst is copied from a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmsub_sd&expand=2794)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmsub213sd))]
pub unsafe fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d {
    // Only bit 0 of the mask is inspected; when it is clear, the lower element becomes zero.
    let lane0: f64 = if (k & 1) != 0 {
        let a0: f64 = simd_extract(a, 0);
        let b0: f64 = simd_extract(b, 0);
        let c0: f64 = simd_extract(c, 0);
        // Negating one factor and the addend turns the multiply-add into `-(a * b) - c`.
        vfmadd132sd(-a0, b0, -c0, _MM_FROUND_CUR_DIRECTION)
    } else {
        0.
    };
    transmute(simd_insert(a, 0, lane0))
}

/// Computes `-(a * b) - c` on the lower double-precision (64-bit) floating-point elements of a, b and c. The lower element of dst receives that result when bit 0 of writemask k is set and is copied from c otherwise; the upper element of dst is copied from c.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmsub_sd&expand=2793)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmsub213sd))]
pub unsafe fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d {
    let c0: f64 = simd_extract(c, 0);
    // Only bit 0 of the mask is inspected; when it is clear, lane 0 of `c` passes through.
    let lane0 = if (k & 1) != 0 {
        let a0: f64 = simd_extract(a, 0);
        let b0: f64 = simd_extract(b, 0);
        // Negating one factor and the addend turns the multiply-add into `-(a * b) - c`.
        vfmadd132sd(-a0, b0, -c0, _MM_FROUND_CUR_DIRECTION)
    } else {
        c0
    };
    transmute(simd_insert(c, 0, lane0))
}

/// Adds the lower single-precision (32-bit) floating-point elements of a and b, writes the sum to the lower element of dst, and fills the upper 3 packed elements of dst from a.
///
/// The rounding mode is selected by the `ROUNDING` parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_ss&expand=151)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_add_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_rounding!(ROUNDING);
    // The mask argument is fixed to `0b1`, so the zeroed fallback operand should never be
    // selected for the lower element.
    let fallback = _mm_setzero_ps().as_f32x4();
    transmute(vaddss(a.as_f32x4(), b.as_f32x4(), fallback, 0b1, ROUNDING))
}

/// Adds the lower single-precision (32-bit) floating-point elements of a and b. The lower element of dst receives the sum when bit 0 of writemask k is set and is copied from src otherwise; the upper 3 packed elements of dst are copied from a.
///
/// The rounding mode is selected by the `ROUNDING` parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_ss&expand=152)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_add_round_ss<const ROUNDING: i32>(
    src: __m128,
    k: __mmask8,
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    // `src` is handed over as the fallback operand and `k` as the mask.
    let sum = vaddss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, ROUNDING);
    transmute(sum)
}

/// Adds the lower single-precision (32-bit) floating-point elements of a and b. The lower element of dst receives the sum when bit 0 of zeromask k is set and is zeroed out otherwise; the upper 3 packed elements of dst are copied from a.
///
/// The rounding mode is selected by the `ROUNDING` parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_ss&expand=153)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddss, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_add_round_ss<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    // A zero vector takes the place of a fallback source, giving zero-masking.
    let fallback = _mm_setzero_ps().as_f32x4();
    transmute(vaddss(a.as_f32x4(), b.as_f32x4(), fallback, k, ROUNDING))
}

/// Adds the lower double-precision (64-bit) floating-point elements of a and b, writes the sum to the lower element of dst, and fills the upper element of dst from a.
///
/// The rounding mode is selected by the `ROUNDING` parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_round_sd&expand=148)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_add_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_rounding!(ROUNDING);
    // The mask argument is fixed to `0b1`, so the zeroed fallback operand should never be
    // selected for the lower element.
    let fallback = _mm_setzero_pd().as_f64x2();
    transmute(vaddsd(a.as_f64x2(), b.as_f64x2(), fallback, 0b1, ROUNDING))
}

/// Adds the lower double-precision (64-bit) floating-point elements of a and b. The lower element of dst receives the sum when bit 0 of writemask k is set and is copied from src otherwise; the upper element of dst is copied from a.
///
/// The rounding mode is selected by the `ROUNDING` parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_add_round_sd&expand=149)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_add_round_sd<const ROUNDING: i32>(
    src: __m128d,
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    // `src` is handed over as the fallback operand and `k` as the mask.
    let sum = vaddsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, ROUNDING);
    transmute(sum)
}

/// Adds the lower double-precision (64-bit) floating-point elements of a and b. The lower element of dst receives the sum when bit 0 of zeromask k is set and is zeroed out otherwise; the upper element of dst is copied from a.
///
/// The rounding mode is selected by the `ROUNDING` parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_add_round_sd&expand=150)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vaddsd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_add_round_sd<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    // A zero vector takes the place of a fallback source, giving zero-masking.
    let fallback = _mm_setzero_pd().as_f64x2();
    transmute(vaddsd(a.as_f64x2(), b.as_f64x2(), fallback, k, ROUNDING))
}

/// Subtracts the lower single-precision (32-bit) floating-point element of b from the lower single-precision (32-bit) floating-point element of a, writes the difference to the lower element of dst, and fills the upper 3 packed elements of dst from a.
///
/// The rounding mode is selected by the `ROUNDING` parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_ss&expand=5745)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_sub_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_rounding!(ROUNDING);
    // The mask argument is fixed to `0b1`, so the zeroed fallback operand should never be
    // selected for the lower element.
    let fallback = _mm_setzero_ps().as_f32x4();
    transmute(vsubss(a.as_f32x4(), b.as_f32x4(), fallback, 0b1, ROUNDING))
}

/// Subtracts the lower single-precision (32-bit) floating-point element of b from the lower single-precision (32-bit) floating-point element of a. The lower element of dst receives the difference when bit 0 of writemask k is set and is copied from src otherwise; the upper 3 packed elements of dst are copied from a.
///
/// The rounding mode is selected by the `ROUNDING` parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_ss&expand=5743)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_sub_round_ss<const ROUNDING: i32>(
    src: __m128,
    k: __mmask8,
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    // `src` is handed over as the fallback operand and `k` as the mask.
    let difference = vsubss(a.as_f32x4(), b.as_f32x4(), src.as_f32x4(), k, ROUNDING);
    transmute(difference)
}

/// Subtracts the lower single-precision (32-bit) floating-point element of b from the lower single-precision (32-bit) floating-point element of a. The lower element of dst receives the difference when bit 0 of zeromask k is set and is zeroed out otherwise; the upper 3 packed elements of dst are copied from a.
///
/// The rounding mode is selected by the `ROUNDING` parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_ss&expand=5744)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubss, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_sub_round_ss<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    // A zero vector takes the place of a fallback source, giving zero-masking.
    let fallback = _mm_setzero_ps().as_f32x4();
    transmute(vsubss(a.as_f32x4(), b.as_f32x4(), fallback, k, ROUNDING))
}

/// Subtracts the lower double-precision (64-bit) floating-point element of b from the lower double-precision (64-bit) floating-point element of a, writes the difference to the lower element of dst, and fills the upper element of dst from a.
///
/// The rounding mode is selected by the `ROUNDING` parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_round_sd&expand=5742)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_sub_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_rounding!(ROUNDING);
    // The mask argument is fixed to `0b1`, so the zeroed fallback operand should never be
    // selected for the lower element.
    let fallback = _mm_setzero_pd().as_f64x2();
    transmute(vsubsd(a.as_f64x2(), b.as_f64x2(), fallback, 0b1, ROUNDING))
}

/// Subtracts the lower double-precision (64-bit) floating-point element of b from the lower double-precision (64-bit) floating-point element of a. The lower element of dst receives the difference when bit 0 of writemask k is set and is copied from src otherwise; the upper element of dst is copied from a.
///
/// The rounding mode is selected by the `ROUNDING` parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sub_round_sd&expand=5740)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_sub_round_sd<const ROUNDING: i32>(
    src: __m128d,
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    // `src` is handed over as the fallback operand and `k` as the mask.
    let difference = vsubsd(a.as_f64x2(), b.as_f64x2(), src.as_f64x2(), k, ROUNDING);
    transmute(difference)
}

/// Subtracts the lower double-precision (64-bit) floating-point element of b from the lower double-precision (64-bit) floating-point element of a. The lower element of dst receives the difference when bit 0 of zeromask k is set and is zeroed out otherwise; the upper element of dst is copied from a.
///
/// The rounding mode is selected by the `ROUNDING` parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sub_round_sd&expand=5741)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsubsd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_sub_round_sd<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    // A zero vector takes the place of a fallback source, giving zero-masking.
    let fallback = _mm_setzero_pd().as_f64x2();
    transmute(vsubsd(a.as_f64x2(), b.as_f64x2(), fallback, k, ROUNDING))
}

/// Multiplies the lower single-precision (32-bit) floating-point elements of a and b, writes the product to the lower element of dst, and fills the upper 3 packed elements of dst from a.
///
/// The rounding mode is selected by the `ROUNDING` parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_ss&expand=3946)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_mul_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_rounding!(ROUNDING);
    // The mask argument is fixed to `0b1`, so the zeroed fallback operand should never be
    // selected for the lower element.
    let fallback = _mm_setzero_ps().as_f32x4();
    transmute(vmulss(a.as_f32x4(), b.as_f32x4(), fallback, 0b1, ROUNDING))
}

/// Multiplies the lowest `f32` lanes of `a` and `b` under writemask `k`, with an explicit
/// rounding mode.
///
/// Lane 0 of the result holds the product when bit 0 of `k` is set and is copied from `src`
/// otherwise; the upper 3 lanes are copied from `a`.
///
/// Rounding is selected by the `ROUNDING` const parameter (Intel's `rounding[3:0]`), which can
/// be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_ss&expand=3944)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_mul_round_ss<const ROUNDING: i32>(
    src: __m128,
    k: __mmask8,
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    // `src` supplies lane 0 when the mask bit is clear.
    let fallback = src.as_f32x4();
    transmute(vmulss(a.as_f32x4(), b.as_f32x4(), fallback, k, ROUNDING))
}

/// Multiplies the lowest `f32` lanes of `a` and `b` under zeromask `k`, with an explicit
/// rounding mode.
///
/// Lane 0 of the result holds the product when bit 0 of `k` is set and is zeroed out
/// otherwise; the upper 3 lanes are copied from `a`.
///
/// Rounding is selected by the `ROUNDING` const parameter (Intel's `rounding[3:0]`), which can
/// be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_ss&expand=3945)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulss, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_mul_round_ss<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    // An all-zero source operand is what gives the zeromask behaviour.
    let zeroed = _mm_setzero_ps().as_f32x4();
    transmute(vmulss(a.as_f32x4(), b.as_f32x4(), zeroed, k, ROUNDING))
}

/// Multiplies the lowest `f64` lanes of `a` and `b` with an explicit rounding mode.
///
/// Lane 0 of the result holds the product; the upper lane is copied from `a`.
///
/// Rounding is selected by the `ROUNDING` const parameter (Intel's `rounding[3:0]`), which can
/// be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_round_sd&expand=3943)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_mul_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_rounding!(ROUNDING);
    // The mask is the constant `0b1` (bit 0 set), so the zero vector merely fills the
    // source-operand slot of the masked intrinsic.
    let placeholder = _mm_setzero_pd().as_f64x2();
    transmute(vmulsd(a.as_f64x2(), b.as_f64x2(), placeholder, 0b1, ROUNDING))
}

/// Multiplies the lowest `f64` lanes of `a` and `b` under writemask `k`, with an explicit
/// rounding mode.
///
/// Lane 0 of the result holds the product when bit 0 of `k` is set and is copied from `src`
/// otherwise; the upper lane is copied from `a`.
///
/// Rounding is selected by the `ROUNDING` const parameter (Intel's `rounding[3:0]`), which can
/// be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_mul_round_sd&expand=3941)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_mul_round_sd<const ROUNDING: i32>(
    src: __m128d,
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    // `src` supplies lane 0 when the mask bit is clear.
    let fallback = src.as_f64x2();
    transmute(vmulsd(a.as_f64x2(), b.as_f64x2(), fallback, k, ROUNDING))
}

/// Multiplies the lowest `f64` lanes of `a` and `b` under zeromask `k`, with an explicit
/// rounding mode.
///
/// Lane 0 of the result holds the product when bit 0 of `k` is set and is zeroed out
/// otherwise; the upper lane is copied from `a`.
///
/// Rounding is selected by the `ROUNDING` const parameter (Intel's `rounding[3:0]`), which can
/// be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_mul_round_sd&expand=3942)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmulsd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_mul_round_sd<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    // An all-zero source operand is what gives the zeromask behaviour.
    let zeroed = _mm_setzero_pd().as_f64x2();
    transmute(vmulsd(a.as_f64x2(), b.as_f64x2(), zeroed, k, ROUNDING))
}

/// Divides the lowest `f32` lane of `a` by the lowest `f32` lane of `b` with an explicit
/// rounding mode.
///
/// Lane 0 of the result holds the quotient; the upper 3 lanes are copied from `a`.
///
/// Rounding is selected by the `ROUNDING` const parameter (Intel's `rounding[3:0]`), which can
/// be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_ss&expand=2174)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_div_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_rounding!(ROUNDING);
    // The mask is the constant `0b1` (bit 0 set), so the zero vector merely fills the
    // source-operand slot of the masked intrinsic.
    let placeholder = _mm_setzero_ps().as_f32x4();
    transmute(vdivss(a.as_f32x4(), b.as_f32x4(), placeholder, 0b1, ROUNDING))
}

/// Divides the lowest `f32` lane of `a` by the lowest `f32` lane of `b` under writemask `k`,
/// with an explicit rounding mode.
///
/// Lane 0 of the result holds the quotient when bit 0 of `k` is set and is copied from `src`
/// otherwise; the upper 3 lanes are copied from `a`.
///
/// Rounding is selected by the `ROUNDING` const parameter (Intel's `rounding[3:0]`), which can
/// be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_ss&expand=2175)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_div_round_ss<const ROUNDING: i32>(
    src: __m128,
    k: __mmask8,
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    // `src` supplies lane 0 when the mask bit is clear.
    let fallback = src.as_f32x4();
    transmute(vdivss(a.as_f32x4(), b.as_f32x4(), fallback, k, ROUNDING))
}

/// Divides the lowest `f32` lane of `a` by the lowest `f32` lane of `b` under zeromask `k`,
/// with an explicit rounding mode.
///
/// Lane 0 of the result holds the quotient when bit 0 of `k` is set and is zeroed out
/// otherwise; the upper 3 lanes are copied from `a`.
///
/// Rounding is selected by the `ROUNDING` const parameter (Intel's `rounding[3:0]`), which can
/// be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_ss&expand=2176)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivss, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_div_round_ss<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    // An all-zero source operand is what gives the zeromask behaviour.
    let zeroed = _mm_setzero_ps().as_f32x4();
    transmute(vdivss(a.as_f32x4(), b.as_f32x4(), zeroed, k, ROUNDING))
}

/// Divides the lowest `f64` lane of `a` by the lowest `f64` lane of `b` with an explicit
/// rounding mode.
///
/// Lane 0 of the result holds the quotient; the upper lane is copied from `a`.
///
/// Rounding is selected by the `ROUNDING` const parameter (Intel's `rounding[3:0]`), which can
/// be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_div_round_sd&expand=2171)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_div_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_rounding!(ROUNDING);
    // The mask is the constant `0b1` (bit 0 set), so the zero vector merely fills the
    // source-operand slot of the masked intrinsic.
    let placeholder = _mm_setzero_pd().as_f64x2();
    transmute(vdivsd(a.as_f64x2(), b.as_f64x2(), placeholder, 0b1, ROUNDING))
}

/// Divides the lowest `f64` lane of `a` by the lowest `f64` lane of `b` under writemask `k`,
/// with an explicit rounding mode.
///
/// Lane 0 of the result holds the quotient when bit 0 of `k` is set and is copied from `src`
/// otherwise; the upper lane is copied from `a`.
///
/// Rounding is selected by the `ROUNDING` const parameter (Intel's `rounding[3:0]`), which can
/// be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_div_round_sd&expand=2172)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_div_round_sd<const ROUNDING: i32>(
    src: __m128d,
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    // `src` supplies lane 0 when the mask bit is clear.
    let fallback = src.as_f64x2();
    transmute(vdivsd(a.as_f64x2(), b.as_f64x2(), fallback, k, ROUNDING))
}

/// Divides the lowest `f64` lane of `a` by the lowest `f64` lane of `b` under zeromask `k`,
/// with an explicit rounding mode.
///
/// Lane 0 of the result holds the quotient when bit 0 of `k` is set and is zeroed out
/// otherwise; the upper lane is copied from `a`.
///
/// Rounding is selected by the `ROUNDING` const parameter (Intel's `rounding[3:0]`), which can
/// be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_div_round_sd&expand=2173)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vdivsd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_div_round_sd<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    // An all-zero source operand is what gives the zeromask behaviour.
    let zeroed = _mm_setzero_pd().as_f64x2();
    transmute(vdivsd(a.as_f64x2(), b.as_f64x2(), zeroed, k, ROUNDING))
}

/// Takes the maximum of the lowest `f32` lanes of `a` and `b`.
///
/// Lane 0 of the result holds the maximum; the upper 3 lanes are copied from `a`.
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_ss&expand=3668)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_max_round_ss<const SAE: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_sae!(SAE);
    // The mask is the constant `0b1` (bit 0 set), so the zero vector merely fills the
    // source-operand slot of the masked intrinsic.
    let placeholder = _mm_setzero_ps().as_f32x4();
    transmute(vmaxss(a.as_f32x4(), b.as_f32x4(), placeholder, 0b1, SAE))
}

/// Takes the maximum of the lowest `f32` lanes of `a` and `b` under writemask `k`.
///
/// Lane 0 of the result holds the maximum when bit 0 of `k` is set and is copied from `src`
/// otherwise; the upper 3 lanes are copied from `a`.
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_ss)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_max_round_ss<const SAE: i32>(
    src: __m128,
    k: __mmask8,
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_sae!(SAE);
    // `src` supplies lane 0 when the mask bit is clear.
    let fallback = src.as_f32x4();
    transmute(vmaxss(a.as_f32x4(), b.as_f32x4(), fallback, k, SAE))
}

/// Takes the maximum of the lowest `f32` lanes of `a` and `b` under zeromask `k`.
///
/// Lane 0 of the result holds the maximum when bit 0 of `k` is set and is zeroed out
/// otherwise; the upper 3 lanes are copied from `a`.
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_ss&expand=3667)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxss, SAE = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_max_round_ss<const SAE: i32>(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    static_assert_sae!(SAE);
    // An all-zero source operand is what gives the zeromask behaviour.
    let zeroed = _mm_setzero_ps().as_f32x4();
    transmute(vmaxss(a.as_f32x4(), b.as_f32x4(), zeroed, k, SAE))
}

/// Takes the maximum of the lowest `f64` lanes of `a` and `b`.
///
/// Lane 0 of the result holds the maximum; the upper lane is copied from `a`.
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_max_round_sd&expand=3665)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_max_round_sd<const SAE: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_sae!(SAE);
    // The mask is the constant `0b1` (bit 0 set), so the zero vector merely fills the
    // source-operand slot of the masked intrinsic.
    let placeholder = _mm_setzero_pd().as_f64x2();
    transmute(vmaxsd(a.as_f64x2(), b.as_f64x2(), placeholder, 0b1, SAE))
}

/// Takes the maximum of the lowest `f64` lanes of `a` and `b` under writemask `k`.
///
/// Lane 0 of the result holds the maximum when bit 0 of `k` is set and is copied from `src`
/// otherwise; the upper lane is copied from `a`.
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_max_round_sd&expand=3663)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_max_round_sd<const SAE: i32>(
    src: __m128d,
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_sae!(SAE);
    // `src` supplies lane 0 when the mask bit is clear.
    let fallback = src.as_f64x2();
    transmute(vmaxsd(a.as_f64x2(), b.as_f64x2(), fallback, k, SAE))
}

/// Takes the maximum of the lowest `f64` lanes of `a` and `b` under zeromask `k`.
///
/// Lane 0 of the result holds the maximum when bit 0 of `k` is set and is zeroed out
/// otherwise; the upper lane is copied from `a`.
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_max_round_sd)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vmaxsd, SAE = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_max_round_sd<const SAE: i32>(
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_sae!(SAE);
    // An all-zero source operand is what gives the zeromask behaviour.
    let zeroed = _mm_setzero_pd().as_f64x2();
    transmute(vmaxsd(a.as_f64x2(), b.as_f64x2(), zeroed, k, SAE))
}

/// Takes the minimum of the lowest `f32` lanes of `a` and `b`.
///
/// Lane 0 of the result holds the minimum; the upper 3 lanes are copied from `a`.
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_ss&expand=3782)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vminss, SAE = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_min_round_ss<const SAE: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_sae!(SAE);
    // The mask is the constant `0b1` (bit 0 set), so the zero vector merely fills the
    // source-operand slot of the masked intrinsic.
    let placeholder = _mm_setzero_ps().as_f32x4();
    transmute(vminss(a.as_f32x4(), b.as_f32x4(), placeholder, 0b1, SAE))
}

/// Takes the minimum of the lowest `f32` lanes of `a` and `b` under writemask `k`.
///
/// Lane 0 of the result holds the minimum when bit 0 of `k` is set and is copied from `src`
/// otherwise; the upper 3 lanes are copied from `a`.
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_ss&expand=3780)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vminss, SAE = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_min_round_ss<const SAE: i32>(
    src: __m128,
    k: __mmask8,
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_sae!(SAE);
    // `src` supplies lane 0 when the mask bit is clear.
    let fallback = src.as_f32x4();
    transmute(vminss(a.as_f32x4(), b.as_f32x4(), fallback, k, SAE))
}

/// Takes the minimum of the lowest `f32` lanes of `a` and `b` under zeromask `k`.
///
/// Lane 0 of the result holds the minimum when bit 0 of `k` is set and is zeroed out
/// otherwise; the upper 3 lanes are copied from `a`.
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_ss&expand=3781)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vminss, SAE = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_min_round_ss<const SAE: i32>(k: __mmask8, a: __m128, b: __m128) -> __m128 {
    static_assert_sae!(SAE);
    // An all-zero source operand is what gives the zeromask behaviour.
    let zeroed = _mm_setzero_ps().as_f32x4();
    transmute(vminss(a.as_f32x4(), b.as_f32x4(), zeroed, k, SAE))
}

/// Takes the minimum of the lowest `f64` lanes of `a` and `b`.
///
/// Lane 0 of the result holds the minimum; the upper lane is copied from `a`.
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_min_round_sd&expand=3779)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vminsd, SAE = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_min_round_sd<const SAE: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_sae!(SAE);
    // The mask is the constant `0b1` (bit 0 set), so the zero vector merely fills the
    // source-operand slot of the masked intrinsic.
    let placeholder = _mm_setzero_pd().as_f64x2();
    transmute(vminsd(a.as_f64x2(), b.as_f64x2(), placeholder, 0b1, SAE))
}

/// Takes the minimum of the lowest `f64` lanes of `a` and `b` under writemask `k`.
///
/// Lane 0 of the result holds the minimum when bit 0 of `k` is set and is copied from `src`
/// otherwise; the upper lane is copied from `a`.
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_min_round_sd&expand=3777)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vminsd, SAE = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_min_round_sd<const SAE: i32>(
    src: __m128d,
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_sae!(SAE);
    // `src` supplies lane 0 when the mask bit is clear.
    let fallback = src.as_f64x2();
    transmute(vminsd(a.as_f64x2(), b.as_f64x2(), fallback, k, SAE))
}

/// Takes the minimum of the lowest `f64` lanes of `a` and `b` under zeromask `k`.
///
/// Lane 0 of the result holds the minimum when bit 0 of `k` is set and is zeroed out
/// otherwise; the upper lane is copied from `a`.
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_min_round_sd&expand=3778)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vminsd, SAE = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_min_round_sd<const SAE: i32>(
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_sae!(SAE);
    // An all-zero source operand is what gives the zeromask behaviour.
    let zeroed = _mm_setzero_pd().as_f64x2();
    transmute(vminsd(a.as_f64x2(), b.as_f64x2(), zeroed, k, SAE))
}

/// Computes the square root of the lowest `f32` lane of `b` with an explicit rounding mode.
///
/// Lane 0 of the result holds the square root; the upper 3 lanes are copied from `a`.
///
/// Rounding is selected by the `ROUNDING` const parameter (Intel's `rounding[3:0]`), which can
/// be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_ss&expand=5383)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_sqrt_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_rounding!(ROUNDING);
    // The mask is the constant `0b1` (bit 0 set), so the zero vector merely fills the
    // source-operand slot of the masked intrinsic.
    let placeholder = _mm_setzero_ps().as_f32x4();
    transmute(vsqrtss(a.as_f32x4(), b.as_f32x4(), placeholder, 0b1, ROUNDING))
}

/// Computes the square root of the lowest `f32` lane of `b` under writemask `k`, with an
/// explicit rounding mode.
///
/// Lane 0 of the result holds the square root when bit 0 of `k` is set and is copied from
/// `src` otherwise; the upper 3 lanes are copied from `a`.
///
/// Rounding is selected by the `ROUNDING` const parameter (Intel's `rounding[3:0]`), which can
/// be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_ss&expand=5381)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_sqrt_round_ss<const ROUNDING: i32>(
    src: __m128,
    k: __mmask8,
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    // `src` supplies lane 0 when the mask bit is clear.
    let fallback = src.as_f32x4();
    transmute(vsqrtss(a.as_f32x4(), b.as_f32x4(), fallback, k, ROUNDING))
}

/// Computes the square root of the lowest `f32` lane of `b` under zeromask `k`, with an
/// explicit rounding mode.
///
/// Lane 0 of the result holds the square root when bit 0 of `k` is set and is zeroed out
/// otherwise; the upper 3 lanes are copied from `a`.
///
/// Rounding is selected by the `ROUNDING` const parameter (Intel's `rounding[3:0]`), which can
/// be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_ss&expand=5382)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsqrtss, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_sqrt_round_ss<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    // An all-zero source operand is what gives the zeromask behaviour.
    let zeroed = _mm_setzero_ps().as_f32x4();
    transmute(vsqrtss(a.as_f32x4(), b.as_f32x4(), zeroed, k, ROUNDING))
}

/// Computes the square root of the lowest `f64` lane of `b` with an explicit rounding mode.
///
/// Lane 0 of the result holds the square root; the upper lane is copied from `a`.
///
/// Rounding is selected by the `ROUNDING` const parameter (Intel's `rounding[3:0]`), which can
/// be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sqrt_round_sd&expand=5380)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_sqrt_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_rounding!(ROUNDING);
    // The mask is the constant `0b1` (bit 0 set), so the zero vector merely fills the
    // source-operand slot of the masked intrinsic.
    let placeholder = _mm_setzero_pd().as_f64x2();
    transmute(vsqrtsd(a.as_f64x2(), b.as_f64x2(), placeholder, 0b1, ROUNDING))
}

/// Takes the square root of the lowest double-precision (64-bit) floating-point element of `b`
/// and writes it to the lowest element of the result under writemask `k` (the element is copied
/// from `src` when mask bit 0 is not set). The upper element of the result is copied from `a`.
///
/// The `ROUNDING` parameter (rounding\[3:0\] in Intel's documentation) can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_sqrt_round_sd&expand=5378)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_sqrt_round_sd<const ROUNDING: i32>(
    src: __m128d,
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    // Writemask variant: `src` is forwarded in the source position.
    let fallback = src.as_f64x2();
    transmute(vsqrtsd(a.as_f64x2(), b.as_f64x2(), fallback, k, ROUNDING))
}

/// Takes the square root of the lowest double-precision (64-bit) floating-point element of `b`
/// and writes it to the lowest element of the result under zeromask `k` (the element is zeroed
/// out when mask bit 0 is not set). The upper element of the result is copied from `a`.
///
/// The `ROUNDING` parameter (rounding\[3:0\] in Intel's documentation) can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_sqrt_round_sd&expand=5379)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vsqrtsd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_sqrt_round_sd<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    // Zeromask variant: an all-zero vector is supplied in the source position.
    let zeroed = _mm_setzero_pd().as_f64x2();
    transmute(vsqrtsd(a.as_f64x2(), b.as_f64x2(), zeroed, k, ROUNDING))
}

/// Extracts the exponent of the lowest single-precision (32-bit) floating-point element of `b`
/// as a single-precision (32-bit) floating-point number representing the integer exponent, and
/// stores it in the lowest element of the result. The upper 3 packed elements of the result are
/// copied from `a`. This intrinsic essentially calculates floor(log2(x)) for the lower element.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_round_ss&expand=2856)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_getexp_round_ss<const SAE: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_sae!(SAE);
    let (upper_src, lower_src) = (a.as_f32x4(), b.as_f32x4());
    // Unmasked variant: mask bit 0 is set and a zero vector fills the source position.
    let zeroed = _mm_setzero_ps().as_f32x4();
    let result = vgetexpss(upper_src, lower_src, zeroed, 0b1, SAE);
    transmute(result)
}

/// Extracts the exponent of the lowest single-precision (32-bit) floating-point element of `b`
/// as a single-precision (32-bit) floating-point number representing the integer exponent, and
/// stores it in the lowest element of the result under writemask `k` (the element is copied from
/// `src` when mask bit 0 is not set). The upper 3 packed elements of the result are copied from
/// `a`. This intrinsic essentially calculates floor(log2(x)) for the lower element.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_round_ss&expand=2857)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_getexp_round_ss<const SAE: i32>(
    src: __m128,
    k: __mmask8,
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_sae!(SAE);
    let (upper_src, lower_src) = (a.as_f32x4(), b.as_f32x4());
    // Writemask variant: `src` is forwarded in the source position.
    let fallback = src.as_f32x4();
    let result = vgetexpss(upper_src, lower_src, fallback, k, SAE);
    transmute(result)
}

/// Extracts the exponent of the lowest single-precision (32-bit) floating-point element of `b`
/// as a single-precision (32-bit) floating-point number representing the integer exponent, and
/// stores it in the lowest element of the result under zeromask `k` (the element is zeroed out
/// when mask bit 0 is not set). The upper 3 packed elements of the result are copied from `a`.
/// This intrinsic essentially calculates floor(log2(x)) for the lower element.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_round_ss&expand=2858)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetexpss, SAE = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_getexp_round_ss<const SAE: i32>(
    k: __mmask8,
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_sae!(SAE);
    let (upper_src, lower_src) = (a.as_f32x4(), b.as_f32x4());
    // Zeromask variant: an all-zero vector is supplied in the source position.
    let zeroed = _mm_setzero_ps().as_f32x4();
    let result = vgetexpss(upper_src, lower_src, zeroed, k, SAE);
    transmute(result)
}

/// Extracts the exponent of the lowest double-precision (64-bit) floating-point element of `b`
/// as a double-precision (64-bit) floating-point number representing the integer exponent, and
/// stores it in the lowest element of the result. The upper element of the result is copied from
/// `a`. This intrinsic essentially calculates floor(log2(x)) for the lower element.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getexp_round_sd&expand=2853)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_getexp_round_sd<const SAE: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_sae!(SAE);
    let (upper_src, lower_src) = (a.as_f64x2(), b.as_f64x2());
    // Unmasked variant: mask bit 0 is set and a zero vector fills the source position.
    let zeroed = _mm_setzero_pd().as_f64x2();
    let result = vgetexpsd(upper_src, lower_src, zeroed, 0b1, SAE);
    transmute(result)
}

/// Extracts the exponent of the lowest double-precision (64-bit) floating-point element of `b`
/// as a double-precision (64-bit) floating-point number representing the integer exponent, and
/// stores it in the lowest element of the result under writemask `k` (the element is copied from
/// `src` when mask bit 0 is not set). The upper element of the result is copied from `a`. This
/// intrinsic essentially calculates floor(log2(x)) for the lower element.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getexp_round_sd&expand=2854)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_getexp_round_sd<const SAE: i32>(
    src: __m128d,
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_sae!(SAE);
    let (upper_src, lower_src) = (a.as_f64x2(), b.as_f64x2());
    // Writemask variant: `src` is forwarded in the source position.
    let fallback = src.as_f64x2();
    let result = vgetexpsd(upper_src, lower_src, fallback, k, SAE);
    transmute(result)
}

/// Extracts the exponent of the lowest double-precision (64-bit) floating-point element of `b`
/// as a double-precision (64-bit) floating-point number representing the integer exponent, and
/// stores it in the lowest element of the result under zeromask `k` (the element is zeroed out
/// when mask bit 0 is not set). The upper element of the result is copied from `a`. This
/// intrinsic essentially calculates floor(log2(x)) for the lower element.
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getexp_round_sd&expand=2855)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetexpsd, SAE = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_getexp_round_sd<const SAE: i32>(
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_sae!(SAE);
    let (upper_src, lower_src) = (a.as_f64x2(), b.as_f64x2());
    // Zeromask variant: an all-zero vector is supplied in the source position.
    let zeroed = _mm_setzero_pd().as_f64x2();
    let result = vgetexpsd(upper_src, lower_src, zeroed, k, SAE);
    transmute(result)
}

/// Normalizes the mantissa of the lowest single-precision (32-bit) floating-point element of `b`
/// and stores it in the lowest element of the result. The upper 3 packed elements of the result
/// are copied from `a`. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k
/// depends on the interval range defined by `NORM` (interv in Intel's documentation) and the sign
/// depends on `SIGN` (sc in Intel's documentation) and the source sign.
///
/// The mantissa is normalized to the interval specified by `NORM`, which can take the following values:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
///
/// The sign is determined by `SIGN`, which can take the following values:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_round_ss&expand=2892)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0, SAE = 4))]
#[rustc_legacy_const_generics(2, 3, 4)]
pub unsafe fn _mm_getmant_round_ss<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    static_assert_mantissas_sae!(SAE);
    // Unmasked variant: mask bit 0 is set and a zero vector fills the source position.
    let zeroed = _mm_setzero_ps().as_f32x4();
    // The immediate operand is SIGN shifted left by two, OR-ed with NORM.
    transmute(vgetmantss(
        a.as_f32x4(),
        b.as_f32x4(),
        (SIGN << 2) | NORM,
        zeroed,
        0b1,
        SAE,
    ))
}

/// Normalizes the mantissa of the lowest single-precision (32-bit) floating-point element of `b`
/// and stores it in the lowest element of the result under writemask `k` (the element is copied
/// from `src` when mask bit 0 is not set). The upper 3 packed elements of the result are copied
/// from `a`. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the
/// interval range defined by `NORM` (interv in Intel's documentation) and the sign depends on
/// `SIGN` (sc in Intel's documentation) and the source sign.
///
/// The mantissa is normalized to the interval specified by `NORM`, which can take the following values:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
///
/// The sign is determined by `SIGN`, which can take the following values:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_round_ss&expand=2893)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0, SAE = 4))]
#[rustc_legacy_const_generics(4, 5, 6)]
pub unsafe fn _mm_mask_getmant_round_ss<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    src: __m128,
    k: __mmask8,
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    static_assert_mantissas_sae!(SAE);
    // Writemask variant: `src` is forwarded in the source position.
    let fallback = src.as_f32x4();
    // The immediate operand is SIGN shifted left by two, OR-ed with NORM.
    transmute(vgetmantss(
        a.as_f32x4(),
        b.as_f32x4(),
        (SIGN << 2) | NORM,
        fallback,
        k,
        SAE,
    ))
}

/// Normalizes the mantissa of the lowest single-precision (32-bit) floating-point element of `b`
/// and stores it in the lowest element of the result under zeromask `k` (the element is zeroed
/// out when mask bit 0 is not set). The upper 3 packed elements of the result are copied from
/// `a`. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the
/// interval range defined by `NORM` (interv in Intel's documentation) and the sign depends on
/// `SIGN` (sc in Intel's documentation) and the source sign.
///
/// The mantissa is normalized to the interval specified by `NORM`, which can take the following values:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
///
/// The sign is determined by `SIGN`, which can take the following values:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_round_ss&expand=2894)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetmantss, NORM = 0, SIGN = 0, SAE = 4))]
#[rustc_legacy_const_generics(3, 4, 5)]
pub unsafe fn _mm_maskz_getmant_round_ss<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    k: __mmask8,
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    static_assert_mantissas_sae!(SAE);
    // Zeromask variant: an all-zero vector is supplied in the source position.
    let zeroed = _mm_setzero_ps().as_f32x4();
    // The immediate operand is SIGN shifted left by two, OR-ed with NORM.
    transmute(vgetmantss(
        a.as_f32x4(),
        b.as_f32x4(),
        (SIGN << 2) | NORM,
        zeroed,
        k,
        SAE,
    ))
}

/// Normalizes the mantissa of the lowest double-precision (64-bit) floating-point element of `b`
/// and stores it in the lowest element of the result. The upper element of the result is copied
/// from `a`. This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the
/// interval range defined by `NORM` (interv in Intel's documentation) and the sign depends on
/// `SIGN` (sc in Intel's documentation) and the source sign.
///
/// The mantissa is normalized to the interval specified by `NORM`, which can take the following values:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
///
/// The sign is determined by `SIGN`, which can take the following values:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_getmant_round_sd&expand=2889)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0, SAE = 4))]
#[rustc_legacy_const_generics(2, 3, 4)]
pub unsafe fn _mm_getmant_round_sd<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    static_assert_mantissas_sae!(SAE);
    // Unmasked variant: mask bit 0 is set and a zero vector fills the source position.
    let zeroed = _mm_setzero_pd().as_f64x2();
    // The immediate operand is SIGN shifted left by two, OR-ed with NORM.
    transmute(vgetmantsd(
        a.as_f64x2(),
        b.as_f64x2(),
        (SIGN << 2) | NORM,
        zeroed,
        0b1,
        SAE,
    ))
}

/// Normalizes the mantissa of the lowest double-precision (64-bit) floating-point element of `b`
/// and stores it in the lowest element of the result under writemask `k` (the element is copied
/// from `src` when mask bit 0 is not set). The upper element of the result is copied from `a`.
/// This intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval
/// range defined by `NORM` (interv in Intel's documentation) and the sign depends on `SIGN` (sc
/// in Intel's documentation) and the source sign.
///
/// The mantissa is normalized to the interval specified by `NORM`, which can take the following values:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
///
/// The sign is determined by `SIGN`, which can take the following values:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_getmant_round_sd&expand=2890)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0, SAE = 4))]
#[rustc_legacy_const_generics(4, 5, 6)]
pub unsafe fn _mm_mask_getmant_round_sd<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    src: __m128d,
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    static_assert_mantissas_sae!(SAE);
    // Writemask variant: `src` is forwarded in the source position.
    let fallback = src.as_f64x2();
    // The immediate operand is SIGN shifted left by two, OR-ed with NORM.
    transmute(vgetmantsd(
        a.as_f64x2(),
        b.as_f64x2(),
        (SIGN << 2) | NORM,
        fallback,
        k,
        SAE,
    ))
}

/// Normalizes the mantissa of the lowest double-precision (64-bit) floating-point element of `b`
/// and stores it in the lowest element of the result under zeromask `k` (the element is zeroed
/// out when mask bit 0 is not set). The upper element of the result is copied from `a`. This
/// intrinsic essentially calculates ±(2^k)*|x.significand|, where k depends on the interval range
/// defined by `NORM` (interv in Intel's documentation) and the sign depends on `SIGN` (sc in
/// Intel's documentation) and the source sign.
///
/// The mantissa is normalized to the interval specified by `NORM`, which can take the following values:\
///    _MM_MANT_NORM_1_2     // interval [1, 2)\
///    _MM_MANT_NORM_p5_2    // interval [0.5, 2)\
///    _MM_MANT_NORM_p5_1    // interval [0.5, 1)\
///    _MM_MANT_NORM_p75_1p5 // interval [0.75, 1.5)
///
/// The sign is determined by `SIGN`, which can take the following values:\
///    _MM_MANT_SIGN_src     // sign = sign(src)\
///    _MM_MANT_SIGN_zero    // sign = 0\
///    _MM_MANT_SIGN_nan     // dst = NaN if sign(src) = 1
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_getmant_round_sd&expand=2891)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vgetmantsd, NORM = 0, SIGN = 0, SAE = 4))]
#[rustc_legacy_const_generics(3, 4, 5)]
pub unsafe fn _mm_maskz_getmant_round_sd<
    const NORM: _MM_MANTISSA_NORM_ENUM,
    const SIGN: _MM_MANTISSA_SIGN_ENUM,
    const SAE: i32,
>(
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_imm4!(NORM);
    static_assert_imm2!(SIGN);
    static_assert_mantissas_sae!(SAE);
    // Zeromask variant: an all-zero vector is supplied in the source position.
    let zeroed = _mm_setzero_pd().as_f64x2();
    // The immediate operand is SIGN shifted left by two, OR-ed with NORM.
    transmute(vgetmantsd(
        a.as_f64x2(),
        b.as_f64x2(),
        (SIGN << 2) | NORM,
        zeroed,
        k,
        SAE,
    ))
}

/// Rounds the lowest single-precision (32-bit) floating-point element of `b` to the number of
/// fraction bits specified by `IMM8` and stores it in the lowest element of the result. The upper
/// 3 packed elements of the result are copied from `a`.
///
/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
///    _MM_FROUND_TO_NEAREST_INT // round to nearest\
///    _MM_FROUND_TO_NEG_INF     // round down\
///    _MM_FROUND_TO_POS_INF     // round up\
///    _MM_FROUND_TO_ZERO        // truncate\
///    _MM_FROUND_CUR_DIRECTION  // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_round_ss&expand=4796)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(2, 3)]
pub unsafe fn _mm_roundscale_round_ss<const IMM8: i32, const SAE: i32>(
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_imm8!(IMM8);
    static_assert_mantissas_sae!(SAE);
    let va = a.as_f32x4();
    let vb = b.as_f32x4();
    // Unmasked variant: every mask bit is set and a zero vector fills the source position.
    let zeroed = _mm_setzero_ps().as_f32x4();
    transmute(vrndscaless(va, vb, zeroed, 0b1111_1111, IMM8, SAE))
}

/// Rounds the lowest single-precision (32-bit) floating-point element of `b` to the number of
/// fraction bits specified by `IMM8` and stores it in the lowest element of the result under
/// writemask `k` (the element is copied from `src` when mask bit 0 is not set). The upper 3
/// packed elements of the result are copied from `a`.
///
/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
///    _MM_FROUND_TO_NEAREST_INT // round to nearest\
///    _MM_FROUND_TO_NEG_INF     // round down\
///    _MM_FROUND_TO_POS_INF     // round up\
///    _MM_FROUND_TO_ZERO        // truncate\
///    _MM_FROUND_CUR_DIRECTION  // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_round_ss&expand=4794)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(4, 5)]
pub unsafe fn _mm_mask_roundscale_round_ss<const IMM8: i32, const SAE: i32>(
    src: __m128,
    k: __mmask8,
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_imm8!(IMM8);
    static_assert_mantissas_sae!(SAE);
    let va = a.as_f32x4();
    let vb = b.as_f32x4();
    // Writemask variant: `src` is forwarded in the source position.
    let fallback = src.as_f32x4();
    transmute(vrndscaless(va, vb, fallback, k, IMM8, SAE))
}

/// Rounds the lowest single-precision (32-bit) floating-point element of `b` to the number of
/// fraction bits specified by `IMM8` and stores it in the lowest element of the result under
/// zeromask `k` (the element is zeroed out when mask bit 0 is not set). The upper 3 packed
/// elements of the result are copied from `a`.
///
/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
///    _MM_FROUND_TO_NEAREST_INT // round to nearest\
///    _MM_FROUND_TO_NEG_INF     // round down\
///    _MM_FROUND_TO_POS_INF     // round up\
///    _MM_FROUND_TO_ZERO        // truncate\
///    _MM_FROUND_CUR_DIRECTION  // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_round_ss&expand=4795)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrndscaless, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(3, 4)]
pub unsafe fn _mm_maskz_roundscale_round_ss<const IMM8: i32, const SAE: i32>(
    k: __mmask8,
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_imm8!(IMM8);
    static_assert_mantissas_sae!(SAE);
    let va = a.as_f32x4();
    let vb = b.as_f32x4();
    // Zeromask variant: an all-zero vector is supplied in the source position.
    let zeroed = _mm_setzero_ps().as_f32x4();
    transmute(vrndscaless(va, vb, zeroed, k, IMM8, SAE))
}

/// Rounds the lowest double-precision (64-bit) floating-point element of `b` to the number of
/// fraction bits specified by `IMM8` and stores it in the lowest element of the result. The upper
/// element of the result is copied from `a`.
///
/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
///    _MM_FROUND_TO_NEAREST_INT // round to nearest\
///    _MM_FROUND_TO_NEG_INF     // round down\
///    _MM_FROUND_TO_POS_INF     // round up\
///    _MM_FROUND_TO_ZERO        // truncate\
///    _MM_FROUND_CUR_DIRECTION  // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_roundscale_round_sd&expand=4793)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(2, 3)]
pub unsafe fn _mm_roundscale_round_sd<const IMM8: i32, const SAE: i32>(
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_imm8!(IMM8);
    static_assert_mantissas_sae!(SAE);
    let va = a.as_f64x2();
    let vb = b.as_f64x2();
    // Unmasked variant: every mask bit is set and a zero vector fills the source position.
    let zeroed = _mm_setzero_pd().as_f64x2();
    transmute(vrndscalesd(va, vb, zeroed, 0b1111_1111, IMM8, SAE))
}

/// Rounds the lowest double-precision (64-bit) floating-point element of `b` to the number of
/// fraction bits specified by `IMM8` and stores it in the lowest element of the result under
/// writemask `k` (the element is copied from `src` when mask bit 0 is not set). The upper element
/// of the result is copied from `a`.
///
/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
///    _MM_FROUND_TO_NEAREST_INT // round to nearest\
///    _MM_FROUND_TO_NEG_INF     // round down\
///    _MM_FROUND_TO_POS_INF     // round up\
///    _MM_FROUND_TO_ZERO        // truncate\
///    _MM_FROUND_CUR_DIRECTION  // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_roundscale_round_sd&expand=4791)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(4, 5)]
pub unsafe fn _mm_mask_roundscale_round_sd<const IMM8: i32, const SAE: i32>(
    src: __m128d,
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_imm8!(IMM8);
    static_assert_mantissas_sae!(SAE);
    let va = a.as_f64x2();
    let vb = b.as_f64x2();
    // Writemask variant: `src` is forwarded in the source position.
    let fallback = src.as_f64x2();
    transmute(vrndscalesd(va, vb, fallback, k, IMM8, SAE))
}

/// Rounds the lowest double-precision (64-bit) floating-point element of `b` to the number of
/// fraction bits specified by `IMM8` and stores it in the lowest element of the result under
/// zeromask `k` (the element is zeroed out when mask bit 0 is not set). The upper element of the
/// result is copied from `a`.
///
/// Rounding is done according to the imm8\[2:0\] parameter, which can be one of:\
///    _MM_FROUND_TO_NEAREST_INT // round to nearest\
///    _MM_FROUND_TO_NEG_INF     // round down\
///    _MM_FROUND_TO_POS_INF     // round up\
///    _MM_FROUND_TO_ZERO        // truncate\
///    _MM_FROUND_CUR_DIRECTION  // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_roundscale_round_sd&expand=4792)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vrndscalesd, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(3, 4)]
pub unsafe fn _mm_maskz_roundscale_round_sd<const IMM8: i32, const SAE: i32>(
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_imm8!(IMM8);
    static_assert_mantissas_sae!(SAE);
    let va = a.as_f64x2();
    let vb = b.as_f64x2();
    // Zeromask variant: an all-zero vector is supplied in the source position.
    let zeroed = _mm_setzero_pd().as_f64x2();
    transmute(vrndscalesd(va, vb, zeroed, k, IMM8, SAE))
}

/// Scales the packed single-precision (32-bit) floating-point elements in `a` using values from
/// `b` and stores the result in the lowest element of the returned vector. The upper 3 packed
/// elements of the result are copied from `a`.
///
/// The `ROUNDING` parameter (rounding\[3:0\] in Intel's documentation) can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_round_ss&expand=4895)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscalefss, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_scalef_round_ss<const ROUNDING: i32>(a: __m128, b: __m128) -> __m128 {
    static_assert_rounding!(ROUNDING);
    // Unmasked variant: every mask bit is set and a zero vector fills the source position.
    let zeroed = _mm_setzero_ps().as_f32x4();
    transmute(vscalefss(a.as_f32x4(), b.as_f32x4(), zeroed, 0b1111_1111, ROUNDING))
}

/// Scales the packed single-precision (32-bit) floating-point elements in `a` using values from
/// `b` and stores the result in the lowest element of the returned vector under writemask `k`
/// (the element is copied from `src` when mask bit 0 is not set). The upper 3 packed elements of
/// the result are copied from `a`.
///
/// The `ROUNDING` parameter (rounding\[3:0\] in Intel's documentation) can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_round_ss&expand=4893)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscalefss, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_scalef_round_ss<const ROUNDING: i32>(
    src: __m128,
    k: __mmask8,
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    // Writemask variant: `src` is forwarded in the source position.
    let fallback = src.as_f32x4();
    transmute(vscalefss(a.as_f32x4(), b.as_f32x4(), fallback, k, ROUNDING))
}

/// Scales the packed single-precision (32-bit) floating-point elements in `a` using values from
/// `b` and stores the result in the lowest element of the returned vector under zeromask `k` (the
/// element is zeroed out when mask bit 0 is not set). The upper 3 packed elements of the result
/// are copied from `a`.
///
/// The `ROUNDING` parameter (rounding\[3:0\] in Intel's documentation) can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_round_ss&expand=4894)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscalefss, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_scalef_round_ss<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128,
    b: __m128,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    // Zeromask variant: an all-zero vector is supplied in the source position.
    let zeroed = _mm_setzero_ps().as_f32x4();
    transmute(vscalefss(a.as_f32x4(), b.as_f32x4(), zeroed, k, ROUNDING))
}

/// Scale the packed double-precision (64-bit) floating-point elements in `a` using values from
/// `b`, and store the result in the lower element of `dst`. The upper element of `dst` is copied
/// from `a`.
///
/// `ROUNDING` (Intel's `rounding[3:0]`) selects the rounding behavior and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_scalef_round_sd&expand=4892)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscalefsd, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_scalef_round_sd<const ROUNDING: i32>(a: __m128d, b: __m128d) -> __m128d {
    static_assert_rounding!(ROUNDING);
    let zeroed = _mm_setzero_pd().as_f64x2();
    // Every mask bit is set, so this is the unmasked form of the operation.
    let scaled = vscalefsd(a.as_f64x2(), b.as_f64x2(), zeroed, 0xff, ROUNDING);
    transmute(scaled)
}

/// Scale the packed double-precision (64-bit) floating-point elements in a using values from b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\
///
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_scalef_round_sd&expand=4890)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscalefsd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_scalef_round_sd<const ROUNDING: i32>(
    src: __m128d,
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    let a = a.as_f64x2();
    let b = b.as_f64x2();
    let src = src.as_f64x2();
    let r = vscalefsd(a, b, src, k, ROUNDING);
    transmute(r)
}

/// Scale the packed double-precision (64-bit) floating-point elements in `a` using values from
/// `b`, and store the result in the lower element of `dst` using zeromask `k` (the element is
/// zeroed out when mask bit 0 is not set). The upper element of `dst` is copied from `a`.
///
/// `ROUNDING` (Intel's `rounding[3:0]`) selects the rounding behavior and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_scalef_round_sd&expand=4891)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vscalefsd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_scalef_round_sd<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128d,
    b: __m128d,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    // An all-zero vector is the pass-through operand, which gives the zeromask behavior.
    let zeroed = _mm_setzero_pd().as_f64x2();
    let scaled = vscalefsd(a.as_f64x2(), b.as_f64x2(), zeroed, k, ROUNDING);
    transmute(scaled)
}

/// Multiply the lower single-precision (32-bit) floating-point elements in `a` and `b`, and add
/// the intermediate result to the lower element in `c`. The result is stored in the lower element
/// of `dst`, and the upper 3 packed elements of `dst` are copied from `a`.
///
/// `ROUNDING` (Intel's `rounding[3:0]`) selects the rounding behavior and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmadd_round_ss&expand=2573)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd213ss, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_fmadd_round_ss<const ROUNDING: i32>(a: __m128, b: __m128, c: __m128) -> __m128 {
    static_assert_rounding!(ROUNDING);
    // Only lane 0 of each operand takes part in the computation.
    let a0: f32 = simd_extract(a, 0);
    let b0: f32 = simd_extract(b, 0);
    let c0: f32 = simd_extract(c, 0);
    let lane0 = vfmadd132ss(a0, b0, c0, ROUNDING);
    // Write the result into lane 0 of `a`; the remaining lanes of `a` are left untouched.
    transmute(simd_insert(a, 0, lane0))
}

/// Multiply the lower single-precision (32-bit) floating-point elements in `a` and `b`, and add
/// the intermediate result to the lower element in `c`. The result is stored in the lower element
/// of `dst` using writemask `k` (the element is copied from `a` when mask bit 0 is not set), and
/// the upper 3 packed elements of `dst` are copied from `a`.
///
/// `ROUNDING` (Intel's `rounding[3:0]`) selects the rounding behavior and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_round_ss&expand=2574)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd213ss, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_fmadd_round_ss<const ROUNDING: i32>(
    a: __m128,
    k: __mmask8,
    b: __m128,
    c: __m128,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    let a0: f32 = simd_extract(a, 0);
    let lane0 = if (k & 1) == 0 {
        // Mask bit 0 is clear: lane 0 of `a` passes through unchanged.
        a0
    } else {
        let b0: f32 = simd_extract(b, 0);
        let c0: f32 = simd_extract(c, 0);
        vfmadd132ss(a0, b0, c0, ROUNDING)
    };
    transmute(simd_insert(a, 0, lane0))
}

/// Multiply the lower single-precision (32-bit) floating-point elements in `a` and `b`, and add
/// the intermediate result to the lower element in `c`. The result is stored in the lower element
/// of `dst` using zeromask `k` (the element is zeroed out when mask bit 0 is not set), and the
/// upper 3 packed elements of `dst` are copied from `a`.
///
/// `ROUNDING` (Intel's `rounding[3:0]`) selects the rounding behavior and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_round_ss&expand=2576)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd213ss, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_maskz_fmadd_round_ss<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128,
    b: __m128,
    c: __m128,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    let lane0: f32 = if (k & 1) == 0 {
        // Mask bit 0 is clear: lane 0 of the result is zeroed.
        0.
    } else {
        let a0: f32 = simd_extract(a, 0);
        let b0: f32 = simd_extract(b, 0);
        let c0: f32 = simd_extract(c, 0);
        vfmadd132ss(a0, b0, c0, ROUNDING)
    };
    transmute(simd_insert(a, 0, lane0))
}

/// Multiply the lower single-precision (32-bit) floating-point elements in `a` and `b`, and add
/// the intermediate result to the lower element in `c`. The result is stored in the lower element
/// of `dst` using writemask `k` (the element is copied from `c` when mask bit 0 is not set), and
/// the upper 3 packed elements of `dst` are copied from `c`.
///
/// `ROUNDING` (Intel's `rounding[3:0]`) selects the rounding behavior and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_round_ss&expand=2575)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd213ss, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask3_fmadd_round_ss<const ROUNDING: i32>(
    a: __m128,
    b: __m128,
    c: __m128,
    k: __mmask8,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    let c0: f32 = simd_extract(c, 0);
    let lane0 = if (k & 1) == 0 {
        // Mask bit 0 is clear: lane 0 of `c` passes through unchanged.
        c0
    } else {
        let a0: f32 = simd_extract(a, 0);
        let b0: f32 = simd_extract(b, 0);
        vfmadd132ss(a0, b0, c0, ROUNDING)
    };
    // In the `mask3` form the result is built on top of `c`, not `a`.
    transmute(simd_insert(c, 0, lane0))
}

/// Multiply the lower double-precision (64-bit) floating-point elements in `a` and `b`, and add
/// the intermediate result to the lower element in `c`. The result is stored in the lower element
/// of `dst`, and the upper element of `dst` is copied from `a`.
///
/// `ROUNDING` (Intel's `rounding[3:0]`) selects the rounding behavior and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmadd_round_sd&expand=2569)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd213sd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_fmadd_round_sd<const ROUNDING: i32>(
    a: __m128d,
    b: __m128d,
    c: __m128d,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    // Only lane 0 of each operand takes part in the computation.
    let a0: f64 = simd_extract(a, 0);
    let b0: f64 = simd_extract(b, 0);
    let c0: f64 = simd_extract(c, 0);
    let lane0 = vfmadd132sd(a0, b0, c0, ROUNDING);
    // Write the result into lane 0 of `a`; the upper lane of `a` is left untouched.
    transmute(simd_insert(a, 0, lane0))
}

/// Multiply the lower double-precision (64-bit) floating-point elements in `a` and `b`, and add
/// the intermediate result to the lower element in `c`. The result is stored in the lower element
/// of `dst` using writemask `k` (the element is copied from `a` when mask bit 0 is not set), and
/// the upper element of `dst` is copied from `a`.
///
/// `ROUNDING` (Intel's `rounding[3:0]`) selects the rounding behavior and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmadd_round_sd&expand=2570)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd213sd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_fmadd_round_sd<const ROUNDING: i32>(
    a: __m128d,
    k: __mmask8,
    b: __m128d,
    c: __m128d,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    let a0: f64 = simd_extract(a, 0);
    let lane0 = if (k & 1) == 0 {
        // Mask bit 0 is clear: lane 0 of `a` passes through unchanged.
        a0
    } else {
        let b0: f64 = simd_extract(b, 0);
        let c0: f64 = simd_extract(c, 0);
        vfmadd132sd(a0, b0, c0, ROUNDING)
    };
    transmute(simd_insert(a, 0, lane0))
}

/// Multiply the lower double-precision (64-bit) floating-point elements in `a` and `b`, and add
/// the intermediate result to the lower element in `c`. The result is stored in the lower element
/// of `dst` using zeromask `k` (the element is zeroed out when mask bit 0 is not set), and the
/// upper element of `dst` is copied from `a`.
///
/// `ROUNDING` (Intel's `rounding[3:0]`) selects the rounding behavior and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmadd_round_sd&expand=2572)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd213sd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_maskz_fmadd_round_sd<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128d,
    b: __m128d,
    c: __m128d,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    let lane0: f64 = if (k & 1) == 0 {
        // Mask bit 0 is clear: lane 0 of the result is zeroed.
        0.
    } else {
        let a0: f64 = simd_extract(a, 0);
        let b0: f64 = simd_extract(b, 0);
        let c0: f64 = simd_extract(c, 0);
        vfmadd132sd(a0, b0, c0, ROUNDING)
    };
    transmute(simd_insert(a, 0, lane0))
}

/// Multiply the lower double-precision (64-bit) floating-point elements in `a` and `b`, and add
/// the intermediate result to the lower element in `c`. The result is stored in the lower element
/// of `dst` using writemask `k` (the element is copied from `c` when mask bit 0 is not set), and
/// the upper element of `dst` is copied from `c`.
///
/// `ROUNDING` (Intel's `rounding[3:0]`) selects the rounding behavior and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmadd_round_sd&expand=2571)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmadd213sd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask3_fmadd_round_sd<const ROUNDING: i32>(
    a: __m128d,
    b: __m128d,
    c: __m128d,
    k: __mmask8,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    let c0: f64 = simd_extract(c, 0);
    let lane0 = if (k & 1) == 0 {
        // Mask bit 0 is clear: lane 0 of `c` passes through unchanged.
        c0
    } else {
        let a0: f64 = simd_extract(a, 0);
        let b0: f64 = simd_extract(b, 0);
        vfmadd132sd(a0, b0, c0, ROUNDING)
    };
    // In the `mask3` form the result is built on top of `c`, not `a`.
    transmute(simd_insert(c, 0, lane0))
}

/// Multiply the lower single-precision (32-bit) floating-point elements in `a` and `b`, and
/// subtract the lower element in `c` from the intermediate result. The result is stored in the
/// lower element of `dst`, and the upper 3 packed elements of `dst` are copied from `a`.
///
/// `ROUNDING` (Intel's `rounding[3:0]`) selects the rounding behavior and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmsub_round_ss&expand=2659)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmsub213ss, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_fmsub_round_ss<const ROUNDING: i32>(a: __m128, b: __m128, c: __m128) -> __m128 {
    static_assert_rounding!(ROUNDING);
    let a0: f32 = simd_extract(a, 0);
    let b0: f32 = simd_extract(b, 0);
    let c0: f32 = simd_extract(c, 0);
    // The subtraction is expressed as a fused multiply-add with the addend negated.
    let lane0 = vfmadd132ss(a0, b0, -c0, ROUNDING);
    transmute(simd_insert(a, 0, lane0))
}

/// Multiply the lower single-precision (32-bit) floating-point elements in `a` and `b`, and
/// subtract the lower element in `c` from the intermediate result. The result is stored in the
/// lower element of `dst` using writemask `k` (the element is copied from `a` when mask bit 0 is
/// not set), and the upper 3 packed elements of `dst` are copied from `a`.
///
/// `ROUNDING` (Intel's `rounding[3:0]`) selects the rounding behavior and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmsub_round_ss&expand=2660)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmsub213ss, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_fmsub_round_ss<const ROUNDING: i32>(
    a: __m128,
    k: __mmask8,
    b: __m128,
    c: __m128,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    let a0: f32 = simd_extract(a, 0);
    let lane0 = if (k & 1) == 0 {
        // Mask bit 0 is clear: lane 0 of `a` passes through unchanged.
        a0
    } else {
        let b0: f32 = simd_extract(b, 0);
        let c0: f32 = simd_extract(c, 0);
        // The subtraction is expressed as a fused multiply-add with the addend negated.
        vfmadd132ss(a0, b0, -c0, ROUNDING)
    };
    transmute(simd_insert(a, 0, lane0))
}

/// Multiply the lower single-precision (32-bit) floating-point elements in `a` and `b`, and
/// subtract the lower element in `c` from the intermediate result. The result is stored in the
/// lower element of `dst` using zeromask `k` (the element is zeroed out when mask bit 0 is not
/// set), and the upper 3 packed elements of `dst` are copied from `a`.
///
/// `ROUNDING` (Intel's `rounding[3:0]`) selects the rounding behavior and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmsub_round_ss&expand=2662)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmsub213ss, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_maskz_fmsub_round_ss<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128,
    b: __m128,
    c: __m128,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    let lane0: f32 = if (k & 1) == 0 {
        // Mask bit 0 is clear: lane 0 of the result is zeroed.
        0.
    } else {
        let a0: f32 = simd_extract(a, 0);
        let b0: f32 = simd_extract(b, 0);
        let c0: f32 = simd_extract(c, 0);
        // The subtraction is expressed as a fused multiply-add with the addend negated.
        vfmadd132ss(a0, b0, -c0, ROUNDING)
    };
    transmute(simd_insert(a, 0, lane0))
}

/// Multiply the lower single-precision (32-bit) floating-point elements in `a` and `b`, and
/// subtract the lower element in `c` from the intermediate result. The result is stored in the
/// lower element of `dst` using writemask `k` (the element is copied from `c` when mask bit 0 is
/// not set), and the upper 3 packed elements of `dst` are copied from `c`.
///
/// `ROUNDING` (Intel's `rounding[3:0]`) selects the rounding behavior and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmsub_round_ss&expand=2661)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmsub213ss, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask3_fmsub_round_ss<const ROUNDING: i32>(
    a: __m128,
    b: __m128,
    c: __m128,
    k: __mmask8,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    let c0: f32 = simd_extract(c, 0);
    let lane0 = if (k & 1) == 0 {
        // Mask bit 0 is clear: lane 0 of `c` passes through unchanged.
        c0
    } else {
        let a0: f32 = simd_extract(a, 0);
        let b0: f32 = simd_extract(b, 0);
        // The subtraction is expressed as a fused multiply-add with the addend negated.
        vfmadd132ss(a0, b0, -c0, ROUNDING)
    };
    // In the `mask3` form the result is built on top of `c`, not `a`.
    transmute(simd_insert(c, 0, lane0))
}

/// Multiply the lower double-precision (64-bit) floating-point elements in `a` and `b`, and
/// subtract the lower element in `c` from the intermediate result. The result is stored in the
/// lower element of `dst`, and the upper element of `dst` is copied from `a`.
///
/// `ROUNDING` (Intel's `rounding[3:0]`) selects the rounding behavior and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fmsub_round_sd&expand=2655)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmsub213sd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_fmsub_round_sd<const ROUNDING: i32>(
    a: __m128d,
    b: __m128d,
    c: __m128d,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    let a0: f64 = simd_extract(a, 0);
    let b0: f64 = simd_extract(b, 0);
    let c0: f64 = simd_extract(c, 0);
    // The subtraction is expressed as a fused multiply-add with the addend negated.
    let lane0 = vfmadd132sd(a0, b0, -c0, ROUNDING);
    transmute(simd_insert(a, 0, lane0))
}

/// Multiply the lower double-precision (64-bit) floating-point elements in `a` and `b`, and
/// subtract the lower element in `c` from the intermediate result. The result is stored in the
/// lower element of `dst` using writemask `k` (the element is copied from `a` when mask bit 0 is
/// not set), and the upper element of `dst` is copied from `a`.
///
/// `ROUNDING` (Intel's `rounding[3:0]`) selects the rounding behavior and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fmsub_round_sd&expand=2656)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmsub213sd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_fmsub_round_sd<const ROUNDING: i32>(
    a: __m128d,
    k: __mmask8,
    b: __m128d,
    c: __m128d,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    let a0: f64 = simd_extract(a, 0);
    let lane0 = if (k & 1) == 0 {
        // Mask bit 0 is clear: lane 0 of `a` passes through unchanged.
        a0
    } else {
        let b0: f64 = simd_extract(b, 0);
        let c0: f64 = simd_extract(c, 0);
        // The subtraction is expressed as a fused multiply-add with the addend negated.
        vfmadd132sd(a0, b0, -c0, ROUNDING)
    };
    transmute(simd_insert(a, 0, lane0))
}

/// Multiply the lower double-precision (64-bit) floating-point elements in `a` and `b`, and
/// subtract the lower element in `c` from the intermediate result. The result is stored in the
/// lower element of `dst` using zeromask `k` (the element is zeroed out when mask bit 0 is not
/// set), and the upper element of `dst` is copied from `a`.
///
/// `ROUNDING` (Intel's `rounding[3:0]`) selects the rounding behavior and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fmsub_round_sd&expand=2658)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmsub213sd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_maskz_fmsub_round_sd<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128d,
    b: __m128d,
    c: __m128d,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    let lane0: f64 = if (k & 1) == 0 {
        // Mask bit 0 is clear: lane 0 of the result is zeroed.
        0.
    } else {
        let a0: f64 = simd_extract(a, 0);
        let b0: f64 = simd_extract(b, 0);
        let c0: f64 = simd_extract(c, 0);
        // The subtraction is expressed as a fused multiply-add with the addend negated.
        vfmadd132sd(a0, b0, -c0, ROUNDING)
    };
    transmute(simd_insert(a, 0, lane0))
}

/// Multiply the lower double-precision (64-bit) floating-point elements in `a` and `b`, and
/// subtract the lower element in `c` from the intermediate result. The result is stored in the
/// lower element of `dst` using writemask `k` (the element is copied from `c` when mask bit 0 is
/// not set), and the upper element of `dst` is copied from `c`.
///
/// `ROUNDING` (Intel's `rounding[3:0]`) selects the rounding behavior and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fmsub_round_sd&expand=2657)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfmsub213sd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask3_fmsub_round_sd<const ROUNDING: i32>(
    a: __m128d,
    b: __m128d,
    c: __m128d,
    k: __mmask8,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    let c0: f64 = simd_extract(c, 0);
    let lane0 = if (k & 1) == 0 {
        // Mask bit 0 is clear: lane 0 of `c` passes through unchanged.
        c0
    } else {
        let a0: f64 = simd_extract(a, 0);
        let b0: f64 = simd_extract(b, 0);
        // The subtraction is expressed as a fused multiply-add with the addend negated.
        vfmadd132sd(a0, b0, -c0, ROUNDING)
    };
    // In the `mask3` form the result is built on top of `c`, not `a`.
    transmute(simd_insert(c, 0, lane0))
}

/// Multiply the lower single-precision (32-bit) floating-point elements in `a` and `b`, and add
/// the negated intermediate result to the lower element in `c`. The result is stored in the lower
/// element of `dst`, and the upper 3 packed elements of `dst` are copied from `a`.
///
/// `ROUNDING` (Intel's `rounding[3:0]`) selects the rounding behavior and can be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC`: round to nearest, and suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC`: round down, and suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC`: round up, and suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC`: truncate, and suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION`: use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fnmadd_round_ss&expand=2739)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmadd213ss, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_fnmadd_round_ss<const ROUNDING: i32>(a: __m128, b: __m128, c: __m128) -> __m128 {
    static_assert_rounding!(ROUNDING);
    let a0: f32 = simd_extract(a, 0);
    let b0: f32 = simd_extract(b, 0);
    let c0: f32 = simd_extract(c, 0);
    // Negating one factor negates the product, so the multiply-add computes `-(a * b) + c`.
    let lane0 = vfmadd132ss(-a0, b0, c0, ROUNDING);
    // The original (un-negated) `a` supplies the upper lanes of the result.
    transmute(simd_insert(a, 0, lane0))
}

/// Multiplies the lower single-precision (32-bit) floating-point elements of a and b, negates the intermediate product, and adds the lower element of c to it. The result is stored in the lower element of dst using writemask k (the lower element of a is kept when mask bit 0 is not set), and the upper 3 packed elements of dst are copied from a.
///
/// Rounding is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmadd_round_ss&expand=2740)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmadd213ss, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_fnmadd_round_ss<const ROUNDING: i32>(
    a: __m128,
    k: __mmask8,
    b: __m128,
    c: __m128,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    // Lane 0 of `a` is both a multiplicand and the value kept when mask bit 0 is clear.
    let a0: f32 = simd_extract(a, 0);
    let lower = if (k & 1) != 0 {
        let b0: f32 = simd_extract(b, 0);
        let c0: f32 = simd_extract(c, 0);
        // -(a0 * b0) + c0, written as a multiply-add on the negated multiplicand.
        vfmadd132ss(-a0, b0, c0, ROUNDING)
    } else {
        a0
    };
    // Only lane 0 is replaced; the upper lanes stay as they are in `a`.
    transmute(simd_insert(a, 0, lower))
}

/// Multiplies the lower single-precision (32-bit) floating-point elements of a and b, negates the intermediate product, and adds the lower element of c to it. The result is stored in the lower element of dst using zeromask k (the lower element is zeroed out when mask bit 0 is not set), and the upper 3 packed elements of dst are copied from a.
///
/// Rounding is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmadd_round_ss&expand=2742)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmadd213ss, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_maskz_fnmadd_round_ss<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128,
    b: __m128,
    c: __m128,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    let lower: f32 = if (k & 1) != 0 {
        let a0: f32 = simd_extract(a, 0);
        let b0: f32 = simd_extract(b, 0);
        let c0: f32 = simd_extract(c, 0);
        // -(a0 * b0) + c0, written as a multiply-add on the negated multiplicand.
        vfmadd132ss(-a0, b0, c0, ROUNDING)
    } else {
        // Zeromask: lane 0 becomes 0.0 when mask bit 0 is clear.
        0.0
    };
    // Only lane 0 is replaced; the upper lanes stay as they are in `a`.
    transmute(simd_insert(a, 0, lower))
}

/// Multiplies the lower single-precision (32-bit) floating-point elements of a and b, negates the intermediate product, and adds the lower element of c to it. The result is stored in the lower element of dst using writemask k (the lower element of c is kept when mask bit 0 is not set), and the upper 3 packed elements of dst are copied from c.
///
/// Rounding is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmadd_round_ss&expand=2741)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmadd213ss, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask3_fnmadd_round_ss<const ROUNDING: i32>(
    a: __m128,
    b: __m128,
    c: __m128,
    k: __mmask8,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    // Lane 0 of `c` is both the addend and the value kept when mask bit 0 is clear.
    let c0: f32 = simd_extract(c, 0);
    let lower = if (k & 1) != 0 {
        let a0: f32 = simd_extract(a, 0);
        let b0: f32 = simd_extract(b, 0);
        // -(a0 * b0) + c0, written as a multiply-add on the negated multiplicand.
        vfmadd132ss(-a0, b0, c0, ROUNDING)
    } else {
        c0
    };
    // Only lane 0 is replaced; the upper lanes stay as they are in `c`.
    transmute(simd_insert(c, 0, lower))
}

/// Multiplies the lower double-precision (64-bit) floating-point elements of a and b, negates the intermediate product, and adds the lower element of c to it. The result is stored in the lower element of dst, and the upper element of dst is copied from a.
///
/// Rounding is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fnmadd_round_sd&expand=2735)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmadd213sd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_fnmadd_round_sd<const ROUNDING: i32>(
    a: __m128d,
    b: __m128d,
    c: __m128d,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    let a0: f64 = simd_extract(a, 0);
    let b0: f64 = simd_extract(b, 0);
    let c0: f64 = simd_extract(c, 0);
    // -(a0 * b0) + c0, written as a multiply-add on the negated multiplicand.
    let lower = vfmadd132sd(-a0, b0, c0, ROUNDING);
    // Only lane 0 is replaced; the upper lane stays as it is in `a`.
    transmute(simd_insert(a, 0, lower))
}

/// Multiplies the lower double-precision (64-bit) floating-point elements of a and b, negates the intermediate product, and adds the lower element of c to it. The result is stored in the lower element of dst using writemask k (the lower element of a is kept when mask bit 0 is not set), and the upper element of dst is copied from a.
///
/// Rounding is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmadd_round_sd&expand=2736)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmadd213sd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_fnmadd_round_sd<const ROUNDING: i32>(
    a: __m128d,
    k: __mmask8,
    b: __m128d,
    c: __m128d,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    // Lane 0 of `a` is both a multiplicand and the value kept when mask bit 0 is clear.
    let a0: f64 = simd_extract(a, 0);
    let lower = if (k & 1) != 0 {
        let b0: f64 = simd_extract(b, 0);
        let c0: f64 = simd_extract(c, 0);
        // -(a0 * b0) + c0, written as a multiply-add on the negated multiplicand.
        vfmadd132sd(-a0, b0, c0, ROUNDING)
    } else {
        a0
    };
    // Only lane 0 is replaced; the upper lane stays as it is in `a`.
    transmute(simd_insert(a, 0, lower))
}

/// Multiplies the lower double-precision (64-bit) floating-point elements of a and b, negates the intermediate product, and adds the lower element of c to it. The result is stored in the lower element of dst using zeromask k (the lower element is zeroed out when mask bit 0 is not set), and the upper element of dst is copied from a.
///
/// Rounding is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmadd_round_sd&expand=2738)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmadd213sd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_maskz_fnmadd_round_sd<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128d,
    b: __m128d,
    c: __m128d,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    let lower: f64 = if (k & 1) != 0 {
        let a0: f64 = simd_extract(a, 0);
        let b0: f64 = simd_extract(b, 0);
        let c0: f64 = simd_extract(c, 0);
        // -(a0 * b0) + c0, written as a multiply-add on the negated multiplicand.
        vfmadd132sd(-a0, b0, c0, ROUNDING)
    } else {
        // Zeromask: lane 0 becomes 0.0 when mask bit 0 is clear.
        0.0
    };
    // Only lane 0 is replaced; the upper lane stays as it is in `a`.
    transmute(simd_insert(a, 0, lower))
}

/// Multiplies the lower double-precision (64-bit) floating-point elements of a and b, negates the intermediate product, and adds the lower element of c to it. The result is stored in the lower element of dst using writemask k (the lower element of c is kept when mask bit 0 is not set), and the upper element of dst is copied from c.
///
/// Rounding is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmadd_round_sd&expand=2737)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmadd213sd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask3_fnmadd_round_sd<const ROUNDING: i32>(
    a: __m128d,
    b: __m128d,
    c: __m128d,
    k: __mmask8,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    // Lane 0 of `c` is both the addend and the value kept when mask bit 0 is clear.
    let c0: f64 = simd_extract(c, 0);
    let lower = if (k & 1) != 0 {
        let a0: f64 = simd_extract(a, 0);
        let b0: f64 = simd_extract(b, 0);
        // -(a0 * b0) + c0, written as a multiply-add on the negated multiplicand.
        vfmadd132sd(-a0, b0, c0, ROUNDING)
    } else {
        c0
    };
    // Only lane 0 is replaced; the upper lane stays as it is in `c`.
    transmute(simd_insert(c, 0, lower))
}

/// Multiplies the lower single-precision (32-bit) floating-point elements of a and b, negates the intermediate product, and subtracts the lower element of c from it. The result is stored in the lower element of dst, and the upper 3 packed elements of dst are copied from a.
///
/// Rounding is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fnmsub_round_ss&expand=2787)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmsub213ss, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_fnmsub_round_ss<const ROUNDING: i32>(a: __m128, b: __m128, c: __m128) -> __m128 {
    static_assert_rounding!(ROUNDING);
    let a0: f32 = simd_extract(a, 0);
    let b0: f32 = simd_extract(b, 0);
    let c0: f32 = simd_extract(c, 0);
    // -(a0 * b0) - c0, written as a multiply-add with both the multiplicand and addend negated.
    let lower = vfmadd132ss(-a0, b0, -c0, ROUNDING);
    // Only lane 0 is replaced; the upper lanes stay as they are in `a`.
    transmute(simd_insert(a, 0, lower))
}

/// Multiplies the lower single-precision (32-bit) floating-point elements of a and b, negates the intermediate product, and subtracts the lower element of c from it. The result is stored in the lower element of dst using writemask k (the lower element of a is kept when mask bit 0 is not set), and the upper 3 packed elements of dst are copied from a.
///
/// Rounding is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmsub_round_ss&expand=2788)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmsub213ss, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_fnmsub_round_ss<const ROUNDING: i32>(
    a: __m128,
    k: __mmask8,
    b: __m128,
    c: __m128,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    // Lane 0 of `a` (not `c`) is both a multiplicand and the value kept when mask bit 0 is clear.
    let a0: f32 = simd_extract(a, 0);
    let lower = if (k & 1) != 0 {
        let b0: f32 = simd_extract(b, 0);
        let c0: f32 = simd_extract(c, 0);
        // -(a0 * b0) - c0, written as a multiply-add with both the multiplicand and addend negated.
        vfmadd132ss(-a0, b0, -c0, ROUNDING)
    } else {
        a0
    };
    // Only lane 0 is replaced; the upper lanes stay as they are in `a`.
    transmute(simd_insert(a, 0, lower))
}

/// Multiplies the lower single-precision (32-bit) floating-point elements of a and b, negates the intermediate product, and subtracts the lower element of c from it. The result is stored in the lower element of dst using zeromask k (the lower element is zeroed out when mask bit 0 is not set), and the upper 3 packed elements of dst are copied from a.
///
/// Rounding is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmsub_round_ss&expand=2790)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmsub213ss, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_maskz_fnmsub_round_ss<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128,
    b: __m128,
    c: __m128,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    let lower: f32 = if (k & 1) != 0 {
        let a0: f32 = simd_extract(a, 0);
        let b0: f32 = simd_extract(b, 0);
        let c0: f32 = simd_extract(c, 0);
        // -(a0 * b0) - c0, written as a multiply-add with both the multiplicand and addend negated.
        vfmadd132ss(-a0, b0, -c0, ROUNDING)
    } else {
        // Zeromask: lane 0 becomes 0.0 when mask bit 0 is clear.
        0.0
    };
    // Only lane 0 is replaced; the upper lanes stay as they are in `a`.
    transmute(simd_insert(a, 0, lower))
}

/// Multiplies the lower single-precision (32-bit) floating-point elements of a and b, negates the intermediate product, and subtracts the lower element of c from it. The result is stored in the lower element of dst using writemask k (the lower element of c is kept when mask bit 0 is not set), and the upper 3 packed elements of dst are copied from c.
///
/// Rounding is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmsub_round_ss&expand=2789)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmsub213ss, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask3_fnmsub_round_ss<const ROUNDING: i32>(
    a: __m128,
    b: __m128,
    c: __m128,
    k: __mmask8,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    // Lane 0 of `c` is both the subtrahend and the value kept when mask bit 0 is clear.
    let c0: f32 = simd_extract(c, 0);
    let lower = if (k & 1) != 0 {
        let a0: f32 = simd_extract(a, 0);
        let b0: f32 = simd_extract(b, 0);
        // -(a0 * b0) - c0, written as a multiply-add with both the multiplicand and addend negated.
        vfmadd132ss(-a0, b0, -c0, ROUNDING)
    } else {
        c0
    };
    // Only lane 0 is replaced; the upper lanes stay as they are in `c`.
    transmute(simd_insert(c, 0, lower))
}

/// Multiplies the lower double-precision (64-bit) floating-point elements of a and b, negates the intermediate product, and subtracts the lower element of c from it. The result is stored in the lower element of dst, and the upper element of dst is copied from a.
///
/// Rounding is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fnmsub_round_sd&expand=2783)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmsub213sd, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_fnmsub_round_sd<const ROUNDING: i32>(
    a: __m128d,
    b: __m128d,
    c: __m128d,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    let a0: f64 = simd_extract(a, 0);
    let b0: f64 = simd_extract(b, 0);
    let c0: f64 = simd_extract(c, 0);
    // -(a0 * b0) - c0, written as a multiply-add with both the multiplicand and addend negated.
    let lower = vfmadd132sd(-a0, b0, -c0, ROUNDING);
    // Only lane 0 is replaced; the upper lane stays as it is in `a`.
    transmute(simd_insert(a, 0, lower))
}

/// Multiplies the lower double-precision (64-bit) floating-point elements of a and b, negates the intermediate product, and subtracts the lower element of c from it. The result is stored in the lower element of dst using writemask k (the lower element of a is kept when mask bit 0 is not set), and the upper element of dst is copied from a.
///
/// Rounding is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fnmsub_round_sd&expand=2784)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmsub213sd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_fnmsub_round_sd<const ROUNDING: i32>(
    a: __m128d,
    k: __mmask8,
    b: __m128d,
    c: __m128d,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    // Lane 0 of `a` (not `c`) is both a multiplicand and the value kept when mask bit 0 is clear.
    let a0: f64 = simd_extract(a, 0);
    let lower = if (k & 1) != 0 {
        let b0: f64 = simd_extract(b, 0);
        let c0: f64 = simd_extract(c, 0);
        // -(a0 * b0) - c0, written as a multiply-add with both the multiplicand and addend negated.
        vfmadd132sd(-a0, b0, -c0, ROUNDING)
    } else {
        a0
    };
    // Only lane 0 is replaced; the upper lane stays as it is in `a`.
    transmute(simd_insert(a, 0, lower))
}

/// Multiplies the lower double-precision (64-bit) floating-point elements of a and b, negates the intermediate product, and subtracts the lower element of c from it. The result is stored in the lower element of dst using zeromask k (the lower element is zeroed out when mask bit 0 is not set), and the upper element of dst is copied from a.
///
/// Rounding is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fnmsub_round_sd&expand=2786)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmsub213sd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_maskz_fnmsub_round_sd<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128d,
    b: __m128d,
    c: __m128d,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    let lower: f64 = if (k & 1) != 0 {
        let a0: f64 = simd_extract(a, 0);
        let b0: f64 = simd_extract(b, 0);
        let c0: f64 = simd_extract(c, 0);
        // -(a0 * b0) - c0, written as a multiply-add with both the multiplicand and addend negated.
        vfmadd132sd(-a0, b0, -c0, ROUNDING)
    } else {
        // Zeromask: lane 0 becomes 0.0 when mask bit 0 is clear.
        0.0
    };
    // Only lane 0 is replaced; the upper lane stays as it is in `a`.
    transmute(simd_insert(a, 0, lower))
}

/// Multiplies the lower double-precision (64-bit) floating-point elements of a and b, negates the intermediate product, and subtracts the lower element of c from it. The result is stored in the lower element of dst using writemask k (the lower element of c is kept when mask bit 0 is not set), and the upper element of dst is copied from c.
///
/// Rounding is selected by the rounding\[3:0\] parameter (`ROUNDING`), which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask3_fnmsub_round_sd&expand=2785)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfnmsub213sd, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask3_fnmsub_round_sd<const ROUNDING: i32>(
    a: __m128d,
    b: __m128d,
    c: __m128d,
    k: __mmask8,
) -> __m128d {
    static_assert_rounding!(ROUNDING);
    // Lane 0 of `c` is both the subtrahend and the value kept when mask bit 0 is clear.
    let c0: f64 = simd_extract(c, 0);
    let lower = if (k & 1) != 0 {
        let a0: f64 = simd_extract(a, 0);
        let b0: f64 = simd_extract(b, 0);
        // -(a0 * b0) - c0, written as a multiply-add with both the multiplicand and addend negated.
        vfmadd132sd(-a0, b0, -c0, ROUNDING)
    } else {
        c0
    };
    // Only lane 0 is replaced; the upper lane stays as it is in `c`.
    transmute(simd_insert(c, 0, lower))
}

/// Fixes up the lower single-precision (32-bit) floating-point elements of a and b using the lower 32-bit integer in c. The result is stored in the lower element of dst, and the upper 3 packed elements of dst are copied from a. imm8 is used to set the required flags reporting.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fixupimm_ss&expand=2517)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_fixupimm_ss<const IMM8: i32>(a: __m128, b: __m128, c: __m128i) -> __m128 {
    static_assert_imm8!(IMM8);
    let a_vec = a.as_f32x4();
    let b_vec = b.as_f32x4();
    let c_vec = c.as_i32x4();
    // Unmasked form: every mask bit is set, and the current rounding direction is used.
    let fixed = vfixupimmss(a_vec, b_vec, c_vec, IMM8, 0b11111111, _MM_FROUND_CUR_DIRECTION);
    // Only lane 0 of the result is used; the upper lanes stay as they are in `a`.
    let lower: f32 = simd_extract(fixed, 0);
    transmute(simd_insert(a_vec, 0, lower))
}

/// Fixes up the lower single-precision (32-bit) floating-point elements of a and b using the lower 32-bit integer in c. The result is stored in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and the upper 3 packed elements of dst are copied from a. imm8 is used to set the required flags reporting.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fixupimm_ss&expand=2518)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_fixupimm_ss<const IMM8: i32>(
    a: __m128,
    k: __mmask8,
    b: __m128,
    c: __m128i,
) -> __m128 {
    static_assert_imm8!(IMM8);
    let a_vec = a.as_f32x4();
    let b_vec = b.as_f32x4();
    let c_vec = c.as_i32x4();
    // The writemask is handed to the intrinsic as-is; the current rounding direction is used.
    let fixed = vfixupimmss(a_vec, b_vec, c_vec, IMM8, k, _MM_FROUND_CUR_DIRECTION);
    // Only lane 0 of the result is used; the upper lanes stay as they are in `a`.
    let lower: f32 = simd_extract(fixed, 0);
    transmute(simd_insert(a_vec, 0, lower))
}

/// Fixes up the lower single-precision (32-bit) floating-point elements of a and b using the lower 32-bit integer in c. The result is stored in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and the upper 3 packed elements of dst are copied from a. imm8 is used to set the required flags reporting.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fixupimm_ss&expand=2519)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_maskz_fixupimm_ss<const IMM8: i32>(
    k: __mmask8,
    a: __m128,
    b: __m128,
    c: __m128i,
) -> __m128 {
    static_assert_imm8!(IMM8);
    let a_vec = a.as_f32x4();
    let b_vec = b.as_f32x4();
    let c_vec = c.as_i32x4();
    // The zeromask is handed to the `z` intrinsic as-is; the current rounding direction is used.
    let fixed = vfixupimmssz(a_vec, b_vec, c_vec, IMM8, k, _MM_FROUND_CUR_DIRECTION);
    // Only lane 0 of the result is used; the upper lanes stay as they are in `a`.
    let lower: f32 = simd_extract(fixed, 0);
    transmute(simd_insert(a_vec, 0, lower))
}

/// Fixes up the lower double-precision (64-bit) floating-point elements of a and b using the lower 64-bit integer in c. The result is stored in the lower element of dst, and the upper element of dst is copied from a. imm8 is used to set the required flags reporting.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fixupimm_sd&expand=2514)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_fixupimm_sd<const IMM8: i32>(a: __m128d, b: __m128d, c: __m128i) -> __m128d {
    static_assert_imm8!(IMM8);
    let a_vec = a.as_f64x2();
    let b_vec = b.as_f64x2();
    let c_vec = c.as_i64x2();
    // Unmasked form: every mask bit is set, and the current rounding direction is used.
    let fixed = vfixupimmsd(a_vec, b_vec, c_vec, IMM8, 0b11111111, _MM_FROUND_CUR_DIRECTION);
    // Only lane 0 of the result is used; the upper lane stays as it is in `a`.
    let lower: f64 = simd_extract(fixed, 0);
    transmute(simd_insert(a_vec, 0, lower))
}

/// Fixes up the lower double-precision (64-bit) floating-point elements of a and b using the lower 64-bit integer in c. The result is stored in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and the upper element of dst is copied from a. imm8 is used to set the required flags reporting.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fixupimm_sd&expand=2515)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_fixupimm_sd<const IMM8: i32>(
    a: __m128d,
    k: __mmask8,
    b: __m128d,
    c: __m128i,
) -> __m128d {
    static_assert_imm8!(IMM8);
    let a_vec = a.as_f64x2();
    let b_vec = b.as_f64x2();
    let c_vec = c.as_i64x2();
    // The writemask is handed to the intrinsic as-is; the current rounding direction is used.
    let fixed = vfixupimmsd(a_vec, b_vec, c_vec, IMM8, k, _MM_FROUND_CUR_DIRECTION);
    // Only lane 0 of the result is used; the upper lane stays as it is in `a`.
    let lower: f64 = simd_extract(fixed, 0);
    transmute(simd_insert(a_vec, 0, lower))
}

/// Fixes up the lower double-precision (64-bit) floating-point elements of a and b using the lower 64-bit integer in c. The result is stored in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and the upper element of dst is copied from a. imm8 is used to set the required flags reporting.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fixupimm_sd&expand=2516)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_maskz_fixupimm_sd<const IMM8: i32>(
    k: __mmask8,
    a: __m128d,
    b: __m128d,
    c: __m128i,
) -> __m128d {
    static_assert_imm8!(IMM8);
    let a_vec = a.as_f64x2();
    let b_vec = b.as_f64x2();
    let c_vec = c.as_i64x2();
    // The zeromask is handed to the `z` intrinsic as-is; the current rounding direction is used.
    let fixed = vfixupimmsdz(a_vec, b_vec, c_vec, IMM8, k, _MM_FROUND_CUR_DIRECTION);
    // Only lane 0 of the result is used; the upper lane stays as it is in `a`.
    let lower: f64 = simd_extract(fixed, 0);
    transmute(simd_insert(a_vec, 0, lower))
}

/// Applies the scalar single-precision fix-up operation to the low elements of `a` and `b`,
/// driven by the low 32-bit integer of `c`.
///
/// The fixed-up value is stored in the low element of the result and the upper 3 packed
/// elements are copied from `a`. `IMM8` selects the required flags reporting.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fixupimm_round_ss&expand=2511)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(3, 4)]
pub unsafe fn _mm_fixupimm_round_ss<const IMM8: i32, const SAE: i32>(
    a: __m128,
    b: __m128,
    c: __m128i,
) -> __m128 {
    static_assert_imm8!(IMM8);
    static_assert_mantissas_sae!(SAE);
    let base = a.as_f32x4();
    // Unmasked variant: every mask bit is set.
    let fixed = vfixupimmss(base, b.as_f32x4(), c.as_i32x4(), IMM8, 0xff, SAE);
    let low: f32 = simd_extract(fixed, 0);
    transmute(simd_insert(base, 0, low))
}

/// Applies the scalar single-precision fix-up operation to the low elements of `a` and `b`,
/// driven by the low 32-bit integer of `c`, under writemask `k`.
///
/// The low element of the result holds the fixed-up value when bit 0 of `k` is set and is
/// copied from `a` otherwise; the upper 3 packed elements are always copied from `a`.
/// `IMM8` selects the required flags reporting.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fixupimm_round_ss&expand=2512)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(4, 5)]
pub unsafe fn _mm_mask_fixupimm_round_ss<const IMM8: i32, const SAE: i32>(
    a: __m128,
    k: __mmask8,
    b: __m128,
    c: __m128i,
) -> __m128 {
    static_assert_imm8!(IMM8);
    static_assert_mantissas_sae!(SAE);
    let base = a.as_f32x4();
    let fixed = vfixupimmss(base, b.as_f32x4(), c.as_i32x4(), IMM8, k, SAE);
    // Only lane 0 of the intrinsic's output is kept; the upper lanes come from `a`.
    let low: f32 = simd_extract(fixed, 0);
    transmute(simd_insert(base, 0, low))
}

/// Applies the scalar single-precision fix-up operation to the low elements of `a` and `b`,
/// driven by the low 32-bit integer of `c`, under zeromask `k`.
///
/// The low element of the result holds the fixed-up value when bit 0 of `k` is set and is
/// zeroed otherwise; the upper 3 packed elements are always copied from `a`. `IMM8` selects
/// the required flags reporting.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fixupimm_round_ss&expand=2513)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfixupimmss, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(4, 5)]
pub unsafe fn _mm_maskz_fixupimm_round_ss<const IMM8: i32, const SAE: i32>(
    k: __mmask8,
    a: __m128,
    b: __m128,
    c: __m128i,
) -> __m128 {
    static_assert_imm8!(IMM8);
    static_assert_mantissas_sae!(SAE);
    let base = a.as_f32x4();
    let fixed = vfixupimmssz(base, b.as_f32x4(), c.as_i32x4(), IMM8, k, SAE);
    // Only lane 0 of the intrinsic's output is kept; the upper lanes come from `a`.
    let low: f32 = simd_extract(fixed, 0);
    transmute(simd_insert(base, 0, low))
}

/// Applies the scalar double-precision fix-up operation to the low elements of `a` and `b`,
/// driven by the low 64-bit integer of `c`.
///
/// The fixed-up value is stored in the low element of the result and the upper element is
/// copied from `a`. `IMM8` selects the required flags reporting.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_fixupimm_round_sd&expand=2508)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(3, 4)]
pub unsafe fn _mm_fixupimm_round_sd<const IMM8: i32, const SAE: i32>(
    a: __m128d,
    b: __m128d,
    c: __m128i,
) -> __m128d {
    static_assert_imm8!(IMM8);
    static_assert_mantissas_sae!(SAE);
    let base = a.as_f64x2();
    // Unmasked variant: every mask bit is set.
    let fixed = vfixupimmsd(base, b.as_f64x2(), c.as_i64x2(), IMM8, 0xff, SAE);
    let low: f64 = simd_extract(fixed, 0);
    transmute(simd_insert(base, 0, low))
}

/// Applies the scalar double-precision fix-up operation to the low elements of `a` and `b`,
/// driven by the low 64-bit integer of `c`, under writemask `k`.
///
/// The low element of the result holds the fixed-up value when bit 0 of `k` is set and is
/// copied from `a` otherwise; the upper element is always copied from `a`. `IMM8` selects
/// the required flags reporting.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_fixupimm_round_sd&expand=2509)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(4, 5)]
pub unsafe fn _mm_mask_fixupimm_round_sd<const IMM8: i32, const SAE: i32>(
    a: __m128d,
    k: __mmask8,
    b: __m128d,
    c: __m128i,
) -> __m128d {
    static_assert_imm8!(IMM8);
    static_assert_mantissas_sae!(SAE);
    let base = a.as_f64x2();
    let fixed = vfixupimmsd(base, b.as_f64x2(), c.as_i64x2(), IMM8, k, SAE);
    // Only lane 0 of the intrinsic's output is kept; the upper lane comes from `a`.
    let low: f64 = simd_extract(fixed, 0);
    transmute(simd_insert(base, 0, low))
}

/// Applies the scalar double-precision fix-up operation to the low elements of `a` and `b`,
/// driven by the low 64-bit integer of `c`, under zeromask `k`.
///
/// The low element of the result holds the fixed-up value when bit 0 of `k` is set and is
/// zeroed otherwise; the upper element is always copied from `a`. `IMM8` selects the
/// required flags reporting.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_fixupimm_round_sd&expand=2510)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vfixupimmsd, IMM8 = 0, SAE = 8))]
#[rustc_legacy_const_generics(4, 5)]
pub unsafe fn _mm_maskz_fixupimm_round_sd<const IMM8: i32, const SAE: i32>(
    k: __mmask8,
    a: __m128d,
    b: __m128d,
    c: __m128i,
) -> __m128d {
    static_assert_imm8!(IMM8);
    static_assert_mantissas_sae!(SAE);
    let base = a.as_f64x2();
    let fixed = vfixupimmsdz(base, b.as_f64x2(), c.as_i64x2(), IMM8, k, SAE);
    // Only lane 0 of the intrinsic's output is kept; the upper lane comes from `a`.
    let low: f64 = simd_extract(fixed, 0);
    transmute(simd_insert(base, 0, low))
}

/// Widens the low single-precision (32-bit) element of `b` to double precision (64-bit)
/// under writemask `k`.
///
/// The low element of the result holds the converted value when bit 0 of `k` is set and is
/// copied from `src` otherwise; the upper element is copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_cvtss_sd&expand=1896)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2sd))]
pub unsafe fn _mm_mask_cvtss_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128) -> __m128d {
    let upper = a.as_f64x2();
    let narrow = b.as_f32x4();
    let fallback = src.as_f64x2();
    let widened = vcvtss2sd(upper, narrow, fallback, k, _MM_FROUND_CUR_DIRECTION);
    transmute(widened)
}

/// Widens the low single-precision (32-bit) element of `b` to double precision (64-bit)
/// under zeromask `k`.
///
/// The low element of the result holds the converted value when bit 0 of `k` is set and is
/// zeroed otherwise; the upper element is copied from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_cvtss_sd&expand=1897)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2sd))]
pub unsafe fn _mm_maskz_cvtss_sd(k: __mmask8, a: __m128d, b: __m128) -> __m128d {
    let upper = a.as_f64x2();
    let narrow = b.as_f32x4();
    // A zero vector as the fallback operand gives zeromask behaviour.
    let zeros = _mm_setzero_pd().as_f64x2();
    let widened = vcvtss2sd(upper, narrow, zeros, k, _MM_FROUND_CUR_DIRECTION);
    transmute(widened)
}

/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_cvtsd_ss&expand=1797)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2ss))]
pub unsafe fn _mm_mask_cvtsd_ss(src: __m128, k: __mmask8, a: __m128, b: __m128d) -> __m128 {
    transmute(vcvtsd2ss(
        a.as_f32x4(),
        b.as_f64x2(),
        src.as_f32x4(),
        k,
        _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
    ))
}

/// Convert the lower double-precision (64-bit) floating-point element in b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_cvtsd_ss&expand=1798)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2ss))]
pub unsafe fn _mm_maskz_cvtsd_ss(k: __mmask8, a: __m128, b: __m128d) -> __m128 {
    transmute(vcvtsd2ss(
        a.as_f32x4(),
        b.as_f64x2(),
        _mm_setzero_ps().as_f32x4(),
        k,
        _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
    ))
}

/// Widens the low single-precision (32-bit) element of `b` to double precision (64-bit),
/// stores it in the low element of the result, and copies the upper element from `a`.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundss_sd&expand=1371)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2sd, SAE = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_cvt_roundss_sd<const SAE: i32>(a: __m128d, b: __m128) -> __m128d {
    static_assert_sae!(SAE);
    // Unmasked variant: every mask bit is set; a zero vector fills the fallback operand.
    let zeros = _mm_setzero_pd().as_f64x2();
    transmute(vcvtss2sd(a.as_f64x2(), b.as_f32x4(), zeros, 0xff, SAE))
}

/// Widens the low single-precision (32-bit) element of `b` to double precision (64-bit)
/// under writemask `k`.
///
/// The low element of the result holds the converted value when bit 0 of `k` is set and is
/// copied from `src` otherwise; the upper element is copied from `a`.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_cvt_roundss_sd&expand=1372)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2sd, SAE = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_cvt_roundss_sd<const SAE: i32>(
    src: __m128d,
    k: __mmask8,
    a: __m128d,
    b: __m128,
) -> __m128d {
    static_assert_sae!(SAE);
    transmute(vcvtss2sd(
        a.as_f64x2(),
        b.as_f32x4(),
        src.as_f64x2(),
        k,
        SAE,
    ))
}

/// Widens the low single-precision (32-bit) element of `b` to double precision (64-bit)
/// under zeromask `k`.
///
/// The low element of the result holds the converted value when bit 0 of `k` is set and is
/// zeroed otherwise; the upper element is copied from `a`.
///
/// Pass `_MM_FROUND_NO_EXC` as `SAE` to suppress exceptions.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_cvt_roundss_sd&expand=1373)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2sd, SAE = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_cvt_roundss_sd<const SAE: i32>(
    k: __mmask8,
    a: __m128d,
    b: __m128,
) -> __m128d {
    static_assert_sae!(SAE);
    // A zero vector as the fallback operand gives zeromask behaviour.
    let zeros = _mm_setzero_pd().as_f64x2();
    transmute(vcvtss2sd(a.as_f64x2(), b.as_f32x4(), zeros, k, SAE))
}

/// Narrows the low double-precision (64-bit) element of `b` to single precision (32-bit),
/// stores it in the low element of the result, and copies the upper 3 packed elements
/// from `a`.
///
/// `ROUNDING` must be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC` - round to nearest, suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC` - round down, suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC` - round up, suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC` - truncate, suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundsd_ss&expand=1361)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2ss, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_cvt_roundsd_ss<const ROUNDING: i32>(a: __m128, b: __m128d) -> __m128 {
    static_assert_rounding!(ROUNDING);
    // Unmasked variant: every mask bit is set; a zero vector fills the fallback operand.
    let zeros = _mm_setzero_ps().as_f32x4();
    transmute(vcvtsd2ss(a.as_f32x4(), b.as_f64x2(), zeros, 0xff, ROUNDING))
}

/// Narrows the low double-precision (64-bit) element of `b` to single precision (32-bit)
/// under writemask `k`.
///
/// The low element of the result holds the converted value when bit 0 of `k` is set and is
/// copied from `src` otherwise; the upper 3 packed elements are copied from `a`.
///
/// `ROUNDING` must be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC` - round to nearest, suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC` - round down, suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC` - round up, suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC` - truncate, suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mask_cvt_roundsd_ss&expand=1362)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2ss, ROUNDING = 8))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_cvt_roundsd_ss<const ROUNDING: i32>(
    src: __m128,
    k: __mmask8,
    a: __m128,
    b: __m128d,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    transmute(vcvtsd2ss(
        a.as_f32x4(),
        b.as_f64x2(),
        src.as_f32x4(),
        k,
        ROUNDING,
    ))
}

/// Narrows the low double-precision (64-bit) element of `b` to single precision (32-bit)
/// under zeromask `k`.
///
/// The low element of the result holds the converted value when bit 0 of `k` is set and is
/// zeroed otherwise; the upper 3 packed elements are copied from `a`.
///
/// `ROUNDING` must be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC` - round to nearest, suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC` - round down, suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC` - round up, suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC` - truncate, suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_maskz_cvt_roundsd_ss&expand=1363)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2ss, ROUNDING = 8))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_cvt_roundsd_ss<const ROUNDING: i32>(
    k: __mmask8,
    a: __m128,
    b: __m128d,
) -> __m128 {
    static_assert_rounding!(ROUNDING);
    // A zero vector as the fallback operand gives zeromask behaviour.
    let zeros = _mm_setzero_ps().as_f32x4();
    transmute(vcvtsd2ss(a.as_f32x4(), b.as_f64x2(), zeros, k, ROUNDING))
}

/// Converts the low single-precision (32-bit) element of `a` to a 32-bit integer and
/// returns it.
///
/// `ROUNDING` must be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC` - round to nearest, suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC` - round down, suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC` - round up, suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC` - truncate, suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundss_si32&expand=1374)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm_cvt_roundss_si32<const ROUNDING: i32>(a: __m128) -> i32 {
    static_assert_rounding!(ROUNDING);
    transmute(vcvtss2si(a.as_f32x4(), ROUNDING))
}

/// Converts the low single-precision (32-bit) element of `a` to a 32-bit integer and
/// returns it.
///
/// `ROUNDING` must be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC` - round to nearest, suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC` - round down, suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC` - round up, suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC` - truncate, suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundss_i32&expand=1369)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2si, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm_cvt_roundss_i32<const ROUNDING: i32>(a: __m128) -> i32 {
    static_assert_rounding!(ROUNDING);
    transmute(vcvtss2si(a.as_f32x4(), ROUNDING))
}

/// Converts the low single-precision (32-bit) element of `a` to an unsigned 32-bit integer
/// and returns it.
///
/// `ROUNDING` must be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC` - round to nearest, suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC` - round down, suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC` - round up, suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC` - truncate, suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundss_u32&expand=1376)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2usi, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm_cvt_roundss_u32<const ROUNDING: i32>(a: __m128) -> u32 {
    static_assert_rounding!(ROUNDING);
    transmute(vcvtss2usi(a.as_f32x4(), ROUNDING))
}

/// Converts the low single-precision (32-bit) element of `a` to a 32-bit integer and
/// returns it.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtss_i32&expand=1893)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2si))]
pub unsafe fn _mm_cvtss_i32(a: __m128) -> i32 {
    let lanes = a.as_f32x4();
    let converted = vcvtss2si(lanes, _MM_FROUND_CUR_DIRECTION);
    transmute(converted)
}

/// Converts the low single-precision (32-bit) element of `a` to an unsigned 32-bit integer
/// and returns it.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtss_u32&expand=1901)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2usi))]
pub unsafe fn _mm_cvtss_u32(a: __m128) -> u32 {
    let lanes = a.as_f32x4();
    let converted = vcvtss2usi(lanes, _MM_FROUND_CUR_DIRECTION);
    transmute(converted)
}

/// Converts the low double-precision (64-bit) element of `a` to a 32-bit integer and
/// returns it.
///
/// `ROUNDING` must be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC` - round to nearest, suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC` - round down, suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC` - round up, suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC` - truncate, suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundsd_si32&expand=1359)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm_cvt_roundsd_si32<const ROUNDING: i32>(a: __m128d) -> i32 {
    static_assert_rounding!(ROUNDING);
    transmute(vcvtsd2si(a.as_f64x2(), ROUNDING))
}

/// Convert the lower double-precision (64-bit) floating-point element in a to a 32-bit integer, and store the result in dst.\
/// Rounding is done according to the rounding\[3:0\] parameter, which can be one of:\
///    (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and suppress exceptions\
///    (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC)     // round down, and suppress exceptions\
///    (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC)     // round up, and suppress exceptions\
///    (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)        // truncate, and suppress exceptions\
///    _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see _MM_SET_ROUNDING_MODE
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundsd_i32&expand=1357)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2si, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm_cvt_roundsd_i32<const ROUNDING: i32>(a: __m128d) -> i32 {
    static_assert_rounding!(ROUNDING);
    // The input is a vector of doubles; only its low lane is converted.
    let a = a.as_f64x2();
    let r = vcvtsd2si(a, ROUNDING);
    transmute(r)
}

/// Converts the low double-precision (64-bit) element of `a` to an unsigned 32-bit integer
/// and returns it.
///
/// `ROUNDING` must be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC` - round to nearest, suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC` - round down, suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC` - round up, suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC` - truncate, suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundsd_u32&expand=1364)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2usi, ROUNDING = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm_cvt_roundsd_u32<const ROUNDING: i32>(a: __m128d) -> u32 {
    static_assert_rounding!(ROUNDING);
    transmute(vcvtsd2usi(a.as_f64x2(), ROUNDING))
}

/// Converts the low double-precision (64-bit) element of `a` to a 32-bit integer and
/// returns it.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtsd_i32&expand=1791)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2si))]
pub unsafe fn _mm_cvtsd_i32(a: __m128d) -> i32 {
    let lanes = a.as_f64x2();
    let converted = vcvtsd2si(lanes, _MM_FROUND_CUR_DIRECTION);
    transmute(converted)
}

/// Converts the low double-precision (64-bit) element of `a` to an unsigned 32-bit integer
/// and returns it.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtsd_u32&expand=1799)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2usi))]
pub unsafe fn _mm_cvtsd_u32(a: __m128d) -> u32 {
    let lanes = a.as_f64x2();
    let converted = vcvtsd2usi(lanes, _MM_FROUND_CUR_DIRECTION);
    transmute(converted)
}

/// Converts the signed 32-bit integer `b` to single precision (32-bit), stores it in the
/// low element of the result, and copies the upper 3 packed elements from `a`.
///
/// `ROUNDING` must be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC` - round to nearest, suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC` - round down, suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC` - round up, suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC` - truncate, suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundi32_ss&expand=1312)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_cvt_roundi32_ss<const ROUNDING: i32>(a: __m128, b: i32) -> __m128 {
    static_assert_rounding!(ROUNDING);
    transmute(vcvtsi2ss(a.as_f32x4(), b, ROUNDING))
}

/// Converts the signed 32-bit integer `b` to single precision (32-bit), stores it in the
/// low element of the result, and copies the upper 3 packed elements from `a`.
///
/// `ROUNDING` must be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC` - round to nearest, suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC` - round down, suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC` - round up, suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC` - truncate, suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundsi32_ss&expand=1366)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsi2ss, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_cvt_roundsi32_ss<const ROUNDING: i32>(a: __m128, b: i32) -> __m128 {
    static_assert_rounding!(ROUNDING);
    transmute(vcvtsi2ss(a.as_f32x4(), b, ROUNDING))
}

/// Converts the unsigned 32-bit integer `b` to single precision (32-bit), stores it in the
/// low element of the result, and copies the upper 3 packed elements from `a`.
///
/// `ROUNDING` must be one of:
///
/// * `_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC` - round to nearest, suppress exceptions
/// * `_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC` - round down, suppress exceptions
/// * `_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC` - round up, suppress exceptions
/// * `_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC` - truncate, suppress exceptions
/// * `_MM_FROUND_CUR_DIRECTION` - use `MXCSR.RC`; see `_MM_SET_ROUNDING_MODE`
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvt_roundu32_ss&expand=1378)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtusi2ss, ROUNDING = 8))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_cvt_roundu32_ss<const ROUNDING: i32>(a: __m128, b: u32) -> __m128 {
    static_assert_rounding!(ROUNDING);
    transmute(vcvtusi2ss(a.as_f32x4(), b, ROUNDING))
}

/// Converts the signed 32-bit integer `b` to single precision (32-bit), stores it in the
/// low element of the result, and copies the upper 3 packed elements from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvti32_ss&expand=1643)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsi2ss))]
pub unsafe fn _mm_cvti32_ss(a: __m128, b: i32) -> __m128 {
    // A plain numeric cast does the conversion; only lane 0 of `a` is replaced.
    let converted = b as f32;
    transmute(simd_insert(a, 0, converted))
}

/// Converts the signed 32-bit integer `b` to double precision (64-bit), stores it in the
/// low element of the result, and copies the upper element from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvti32_sd&expand=1642)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsi2sd))]
pub unsafe fn _mm_cvti32_sd(a: __m128d, b: i32) -> __m128d {
    // A plain numeric cast does the conversion; only lane 0 of `a` is replaced.
    let converted = b as f64;
    transmute(simd_insert(a, 0, converted))
}

/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtt_roundss_Si32&expand=1936)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2si, SAE = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm_cvtt_roundss_si32<const SAE: i32>(a: __m128) -> i32 {
    static_assert_sae!(SAE);
    let a = a.as_f32x4();
    let r = vcvtss2si(a, SAE);
    transmute(r)
}

/// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\
/// Exceptions can be suppressed by passing _MM_FROUND_NO_EXC in the sae parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtt_roundss_i32&expand=1934)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2si, SAE = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm_cvtt_roundss_i32<const SAE: i32>(a: __m128) -> i32 {
    static_assert_sae!(SAE);
    let a = a.as_f32x4();
    let r = vcvtss2si(a, SAE);
    transmute(r)
}

/// Converts the lower single-precision (32-bit) floating-point element of `a` to an
/// unsigned 32-bit integer with truncation and returns the result.\
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` as the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtt_roundss_u32&expand=1938)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2usi, SAE = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm_cvtt_roundss_u32<const SAE: i32>(a: __m128) -> u32 {
    // Reject unsupported `SAE` values at compile time.
    static_assert_sae!(SAE);
    // NOTE(review): the contract above says "truncation", yet this forwards to `vcvtss2usi`
    // (and `assert_instr` expects that mnemonic) rather than a `vcvtt*` form — confirm the
    // rounding behaviour against Intel's documentation.
    transmute(vcvtss2usi(a.as_f32x4(), SAE))
}

/// Converts the lower single-precision (32-bit) floating-point element of `a` to a 32-bit
/// integer with truncation and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_i32&expand=2022)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2si))]
pub unsafe fn _mm_cvttss_i32(a: __m128) -> i32 {
    let lanes = a.as_f32x4();
    // NOTE(review): the contract above says "truncation", yet this forwards to `vcvtss2si`
    // with `_MM_FROUND_CUR_DIRECTION` (and `assert_instr` expects that mnemonic) rather than
    // a `vcvtt*` form — confirm the rounding behaviour against Intel's documentation.
    let converted = vcvtss2si(lanes, _MM_FROUND_CUR_DIRECTION);
    transmute(converted)
}

/// Converts the lower single-precision (32-bit) floating-point element of `a` to an
/// unsigned 32-bit integer with truncation and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_u32&expand=2026)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtss2usi))]
pub unsafe fn _mm_cvttss_u32(a: __m128) -> u32 {
    let lanes = a.as_f32x4();
    // NOTE(review): the contract above says "truncation", yet this forwards to `vcvtss2usi`
    // with `_MM_FROUND_CUR_DIRECTION` (and `assert_instr` expects that mnemonic) rather than
    // a `vcvtt*` form — confirm the rounding behaviour against Intel's documentation.
    let converted = vcvtss2usi(lanes, _MM_FROUND_CUR_DIRECTION);
    transmute(converted)
}

/// Converts the lower double-precision (64-bit) floating-point element of `a` to a 32-bit
/// integer with truncation and returns the result.\
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` as the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_si32&expand=1930)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2si, SAE = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm_cvtt_roundsd_si32<const SAE: i32>(a: __m128d) -> i32 {
    // Reject unsupported `SAE` values at compile time.
    static_assert_sae!(SAE);
    // NOTE(review): the contract above says "truncation", yet this forwards to `vcvtsd2si`
    // (and `assert_instr` expects that mnemonic) rather than a `vcvtt*` form — confirm the
    // rounding behaviour against Intel's documentation.
    transmute(vcvtsd2si(a.as_f64x2(), SAE))
}

/// Converts the lower double-precision (64-bit) floating-point element of `a` to a 32-bit
/// integer with truncation and returns the result.\
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` as the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_roundsd_i32&expand=1928)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2si, SAE = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm_cvtt_roundsd_i32<const SAE: i32>(a: __m128d) -> i32 {
    // Reject unsupported `SAE` values at compile time.
    static_assert_sae!(SAE);
    // NOTE(review): the contract above says "truncation", yet this forwards to `vcvtsd2si`
    // (and `assert_instr` expects that mnemonic) rather than a `vcvtt*` form — confirm the
    // rounding behaviour against Intel's documentation.
    transmute(vcvtsd2si(a.as_f64x2(), SAE))
}

/// Converts the lower double-precision (64-bit) floating-point element of `a` to an
/// unsigned 32-bit integer with truncation and returns the result.\
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` as the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_cvtt_roundsd_u32&expand=1932)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2usi, SAE = 8))]
#[rustc_legacy_const_generics(1)]
pub unsafe fn _mm_cvtt_roundsd_u32<const SAE: i32>(a: __m128d) -> u32 {
    // Reject unsupported `SAE` values at compile time.
    static_assert_sae!(SAE);
    // NOTE(review): the contract above says "truncation", yet this forwards to `vcvtsd2usi`
    // (and `assert_instr` expects that mnemonic) rather than a `vcvtt*` form — confirm the
    // rounding behaviour against Intel's documentation.
    transmute(vcvtsd2usi(a.as_f64x2(), SAE))
}

/// Converts the lower double-precision (64-bit) floating-point element of `a` to a 32-bit
/// integer with truncation and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_i32&expand=2015)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2si))]
pub unsafe fn _mm_cvttsd_i32(a: __m128d) -> i32 {
    let lanes = a.as_f64x2();
    // NOTE(review): the contract above says "truncation", yet this forwards to `vcvtsd2si`
    // with `_MM_FROUND_CUR_DIRECTION` (and `assert_instr` expects that mnemonic) rather than
    // a `vcvtt*` form — confirm the rounding behaviour against Intel's documentation.
    let converted = vcvtsd2si(lanes, _MM_FROUND_CUR_DIRECTION);
    transmute(converted)
}

/// Converts the lower double-precision (64-bit) floating-point element of `a` to an
/// unsigned 32-bit integer with truncation and returns the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_u32&expand=2020)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtsd2usi))]
pub unsafe fn _mm_cvttsd_u32(a: __m128d) -> u32 {
    let lanes = a.as_f64x2();
    // NOTE(review): the contract above says "truncation", yet this forwards to `vcvtsd2usi`
    // with `_MM_FROUND_CUR_DIRECTION` (and `assert_instr` expects that mnemonic) rather than
    // a `vcvtt*` form — confirm the rounding behaviour against Intel's documentation.
    let converted = vcvtsd2usi(lanes, _MM_FROUND_CUR_DIRECTION);
    transmute(converted)
}

/// Converts the unsigned 32-bit integer `b` to a single-precision (32-bit) floating-point
/// value, places it in the lower element of the result, and copies the upper 3 packed
/// elements from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu32_ss&expand=2032)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtusi2ss))]
pub unsafe fn _mm_cvtu32_ss(a: __m128, b: u32) -> __m128 {
    // Only element 0 is overwritten; every other element of `a` is carried through.
    let converted = simd_insert(a, 0, b as f32);
    transmute(converted)
}

/// Converts the unsigned 32-bit integer `b` to a double-precision (64-bit) floating-point
/// value, places it in the lower element of the result, and copies the upper element
/// from `a`.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtu32_sd&expand=2031)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcvtusi2sd))]
pub unsafe fn _mm_cvtu32_sd(a: __m128d, b: u32) -> __m128d {
    // Only element 0 is overwritten; every other element of `a` is carried through.
    let converted = simd_insert(a, 0, b as f64);
    transmute(converted)
}

/// Compares the lower single-precision (32-bit) floating-point elements of `a` and `b`
/// using the comparison predicate given by `IMM5`, and returns the boolean result (0 or 1).\
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` as the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_round_ss&expand=1175)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp, IMM5 = 5, SAE = 4))] // should be vcomiss
#[rustc_legacy_const_generics(2, 3)]
pub unsafe fn _mm_comi_round_ss<const IMM5: i32, const SAE: i32>(a: __m128, b: __m128) -> i32 {
    // Both immediates are validated at compile time before reaching the intrinsic.
    static_assert_imm5!(IMM5);
    static_assert_mantissas_sae!(SAE);
    transmute(vcomiss(a.as_f32x4(), b.as_f32x4(), IMM5, SAE))
}

/// Compares the lower double-precision (64-bit) floating-point elements of `a` and `b`
/// using the comparison predicate given by `IMM5`, and returns the boolean result (0 or 1).\
/// Exceptions can be suppressed by passing `_MM_FROUND_NO_EXC` as the `SAE` parameter.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comi_round_sd&expand=1174)
#[inline]
#[target_feature(enable = "avx512f")]
#[cfg_attr(test, assert_instr(vcmp, IMM5 = 5, SAE = 4))] // should be vcomisd
#[rustc_legacy_const_generics(2, 3)]
pub unsafe fn _mm_comi_round_sd<const IMM5: i32, const SAE: i32>(a: __m128d, b: __m128d) -> i32 {
    // Both immediates are validated at compile time before reaching the intrinsic.
    static_assert_imm5!(IMM5);
    static_assert_mantissas_sae!(SAE);
    transmute(vcomisd(a.as_f64x2(), b.as_f64x2(), IMM5, SAE))
}

/// Integer comparison predicate: equal.
pub const _MM_CMPINT_EQ: _MM_CMPINT_ENUM = 0x00;
/// Integer comparison predicate: less-than.
pub const _MM_CMPINT_LT: _MM_CMPINT_ENUM = 0x01;
/// Integer comparison predicate: less-than-or-equal.
pub const _MM_CMPINT_LE: _MM_CMPINT_ENUM = 0x02;
/// Integer comparison predicate: always false.
pub const _MM_CMPINT_FALSE: _MM_CMPINT_ENUM = 0x03;
/// Integer comparison predicate: not-equal.
pub const _MM_CMPINT_NE: _MM_CMPINT_ENUM = 0x04;
/// Integer comparison predicate: not less-than.
pub const _MM_CMPINT_NLT: _MM_CMPINT_ENUM = 0x05;
/// Integer comparison predicate: not less-than-or-equal.
pub const _MM_CMPINT_NLE: _MM_CMPINT_ENUM = 0x06;
/// Integer comparison predicate: always true.
pub const _MM_CMPINT_TRUE: _MM_CMPINT_ENUM = 0x07;

/// Mantissa normalization interval: [1, 2).
pub const _MM_MANT_NORM_1_2: _MM_MANTISSA_NORM_ENUM = 0x00;
/// Mantissa normalization interval: [0.5, 2).
pub const _MM_MANT_NORM_P5_2: _MM_MANTISSA_NORM_ENUM = 0x01;
/// Mantissa normalization interval: [0.5, 1).
pub const _MM_MANT_NORM_P5_1: _MM_MANTISSA_NORM_ENUM = 0x02;
/// Mantissa normalization interval: [0.75, 1.5).
pub const _MM_MANT_NORM_P75_1P5: _MM_MANTISSA_NORM_ENUM = 0x03;

/// Mantissa sign control: sign = sign(SRC).
pub const _MM_MANT_SIGN_SRC: _MM_MANTISSA_SIGN_ENUM = 0x00;
/// Mantissa sign control: sign = 0.
pub const _MM_MANT_SIGN_ZERO: _MM_MANTISSA_SIGN_ENUM = 0x01;
/// Mantissa sign control: DEST = NaN if sign(SRC) = 1.
pub const _MM_MANT_SIGN_NAN: _MM_MANTISSA_SIGN_ENUM = 0x02;

// `_MM_PERM_*` values: the four letters of each name are base-4 digits
// (A = 0, B = 1, C = 2, D = 3), most significant first, packed two bits per
// letter into a single byte. For example `_MM_PERM_DCBA` is
// 0b11_10_01_00 = 0xE4. The table below enumerates all 256 combinations in
// ascending order.
// NOTE(review): presumably each letter selects a source element for a
// shuffle/permute immediate — confirm against Intel's `_MM_PERM_ENUM`.
pub const _MM_PERM_AAAA: _MM_PERM_ENUM = 0x00;
pub const _MM_PERM_AAAB: _MM_PERM_ENUM = 0x01;
pub const _MM_PERM_AAAC: _MM_PERM_ENUM = 0x02;
pub const _MM_PERM_AAAD: _MM_PERM_ENUM = 0x03;
pub const _MM_PERM_AABA: _MM_PERM_ENUM = 0x04;
pub const _MM_PERM_AABB: _MM_PERM_ENUM = 0x05;
pub const _MM_PERM_AABC: _MM_PERM_ENUM = 0x06;
pub const _MM_PERM_AABD: _MM_PERM_ENUM = 0x07;
pub const _MM_PERM_AACA: _MM_PERM_ENUM = 0x08;
pub const _MM_PERM_AACB: _MM_PERM_ENUM = 0x09;
pub const _MM_PERM_AACC: _MM_PERM_ENUM = 0x0A;
pub const _MM_PERM_AACD: _MM_PERM_ENUM = 0x0B;
pub const _MM_PERM_AADA: _MM_PERM_ENUM = 0x0C;
pub const _MM_PERM_AADB: _MM_PERM_ENUM = 0x0D;
pub const _MM_PERM_AADC: _MM_PERM_ENUM = 0x0E;
pub const _MM_PERM_AADD: _MM_PERM_ENUM = 0x0F;
pub const _MM_PERM_ABAA: _MM_PERM_ENUM = 0x10;
pub const _MM_PERM_ABAB: _MM_PERM_ENUM = 0x11;
pub const _MM_PERM_ABAC: _MM_PERM_ENUM = 0x12;
pub const _MM_PERM_ABAD: _MM_PERM_ENUM = 0x13;
pub const _MM_PERM_ABBA: _MM_PERM_ENUM = 0x14;
pub const _MM_PERM_ABBB: _MM_PERM_ENUM = 0x15;
pub const _MM_PERM_ABBC: _MM_PERM_ENUM = 0x16;
pub const _MM_PERM_ABBD: _MM_PERM_ENUM = 0x17;
pub const _MM_PERM_ABCA: _MM_PERM_ENUM = 0x18;
pub const _MM_PERM_ABCB: _MM_PERM_ENUM = 0x19;
pub const _MM_PERM_ABCC: _MM_PERM_ENUM = 0x1A;
pub const _MM_PERM_ABCD: _MM_PERM_ENUM = 0x1B;
pub const _MM_PERM_ABDA: _MM_PERM_ENUM = 0x1C;
pub const _MM_PERM_ABDB: _MM_PERM_ENUM = 0x1D;
pub const _MM_PERM_ABDC: _MM_PERM_ENUM = 0x1E;
pub const _MM_PERM_ABDD: _MM_PERM_ENUM = 0x1F;
pub const _MM_PERM_ACAA: _MM_PERM_ENUM = 0x20;
pub const _MM_PERM_ACAB: _MM_PERM_ENUM = 0x21;
pub const _MM_PERM_ACAC: _MM_PERM_ENUM = 0x22;
pub const _MM_PERM_ACAD: _MM_PERM_ENUM = 0x23;
pub const _MM_PERM_ACBA: _MM_PERM_ENUM = 0x24;
pub const _MM_PERM_ACBB: _MM_PERM_ENUM = 0x25;
pub const _MM_PERM_ACBC: _MM_PERM_ENUM = 0x26;
pub const _MM_PERM_ACBD: _MM_PERM_ENUM = 0x27;
pub const _MM_PERM_ACCA: _MM_PERM_ENUM = 0x28;
pub const _MM_PERM_ACCB: _MM_PERM_ENUM = 0x29;
pub const _MM_PERM_ACCC: _MM_PERM_ENUM = 0x2A;
pub const _MM_PERM_ACCD: _MM_PERM_ENUM = 0x2B;
pub const _MM_PERM_ACDA: _MM_PERM_ENUM = 0x2C;
pub const _MM_PERM_ACDB: _MM_PERM_ENUM = 0x2D;
pub const _MM_PERM_ACDC: _MM_PERM_ENUM = 0x2E;
pub const _MM_PERM_ACDD: _MM_PERM_ENUM = 0x2F;
pub const _MM_PERM_ADAA: _MM_PERM_ENUM = 0x30;
pub const _MM_PERM_ADAB: _MM_PERM_ENUM = 0x31;
pub const _MM_PERM_ADAC: _MM_PERM_ENUM = 0x32;
pub const _MM_PERM_ADAD: _MM_PERM_ENUM = 0x33;
pub const _MM_PERM_ADBA: _MM_PERM_ENUM = 0x34;
pub const _MM_PERM_ADBB: _MM_PERM_ENUM = 0x35;
pub const _MM_PERM_ADBC: _MM_PERM_ENUM = 0x36;
pub const _MM_PERM_ADBD: _MM_PERM_ENUM = 0x37;
pub const _MM_PERM_ADCA: _MM_PERM_ENUM = 0x38;
pub const _MM_PERM_ADCB: _MM_PERM_ENUM = 0x39;
pub const _MM_PERM_ADCC: _MM_PERM_ENUM = 0x3A;
pub const _MM_PERM_ADCD: _MM_PERM_ENUM = 0x3B;
pub const _MM_PERM_ADDA: _MM_PERM_ENUM = 0x3C;
pub const _MM_PERM_ADDB: _MM_PERM_ENUM = 0x3D;
pub const _MM_PERM_ADDC: _MM_PERM_ENUM = 0x3E;
pub const _MM_PERM_ADDD: _MM_PERM_ENUM = 0x3F;
pub const _MM_PERM_BAAA: _MM_PERM_ENUM = 0x40;
pub const _MM_PERM_BAAB: _MM_PERM_ENUM = 0x41;
pub const _MM_PERM_BAAC: _MM_PERM_ENUM = 0x42;
pub const _MM_PERM_BAAD: _MM_PERM_ENUM = 0x43;
pub const _MM_PERM_BABA: _MM_PERM_ENUM = 0x44;
pub const _MM_PERM_BABB: _MM_PERM_ENUM = 0x45;
pub const _MM_PERM_BABC: _MM_PERM_ENUM = 0x46;
pub const _MM_PERM_BABD: _MM_PERM_ENUM = 0x47;
pub const _MM_PERM_BACA: _MM_PERM_ENUM = 0x48;
pub const _MM_PERM_BACB: _MM_PERM_ENUM = 0x49;
pub const _MM_PERM_BACC: _MM_PERM_ENUM = 0x4A;
pub const _MM_PERM_BACD: _MM_PERM_ENUM = 0x4B;
pub const _MM_PERM_BADA: _MM_PERM_ENUM = 0x4C;
pub const _MM_PERM_BADB: _MM_PERM_ENUM = 0x4D;
pub const _MM_PERM_BADC: _MM_PERM_ENUM = 0x4E;
pub const _MM_PERM_BADD: _MM_PERM_ENUM = 0x4F;
pub const _MM_PERM_BBAA: _MM_PERM_ENUM = 0x50;
pub const _MM_PERM_BBAB: _MM_PERM_ENUM = 0x51;
pub const _MM_PERM_BBAC: _MM_PERM_ENUM = 0x52;
pub const _MM_PERM_BBAD: _MM_PERM_ENUM = 0x53;
pub const _MM_PERM_BBBA: _MM_PERM_ENUM = 0x54;
pub const _MM_PERM_BBBB: _MM_PERM_ENUM = 0x55;
pub const _MM_PERM_BBBC: _MM_PERM_ENUM = 0x56;
pub const _MM_PERM_BBBD: _MM_PERM_ENUM = 0x57;
pub const _MM_PERM_BBCA: _MM_PERM_ENUM = 0x58;
pub const _MM_PERM_BBCB: _MM_PERM_ENUM = 0x59;
pub const _MM_PERM_BBCC: _MM_PERM_ENUM = 0x5A;
pub const _MM_PERM_BBCD: _MM_PERM_ENUM = 0x5B;
pub const _MM_PERM_BBDA: _MM_PERM_ENUM = 0x5C;
pub const _MM_PERM_BBDB: _MM_PERM_ENUM = 0x5D;
pub const _MM_PERM_BBDC: _MM_PERM_ENUM = 0x5E;
pub const _MM_PERM_BBDD: _MM_PERM_ENUM = 0x5F;
pub const _MM_PERM_BCAA: _MM_PERM_ENUM = 0x60;
pub const _MM_PERM_BCAB: _MM_PERM_ENUM = 0x61;
pub const _MM_PERM_BCAC: _MM_PERM_ENUM = 0x62;
pub const _MM_PERM_BCAD: _MM_PERM_ENUM = 0x63;
pub const _MM_PERM_BCBA: _MM_PERM_ENUM = 0x64;
pub const _MM_PERM_BCBB: _MM_PERM_ENUM = 0x65;
pub const _MM_PERM_BCBC: _MM_PERM_ENUM = 0x66;
pub const _MM_PERM_BCBD: _MM_PERM_ENUM = 0x67;
pub const _MM_PERM_BCCA: _MM_PERM_ENUM = 0x68;
pub const _MM_PERM_BCCB: _MM_PERM_ENUM = 0x69;
pub const _MM_PERM_BCCC: _MM_PERM_ENUM = 0x6A;
pub const _MM_PERM_BCCD: _MM_PERM_ENUM = 0x6B;
pub const _MM_PERM_BCDA: _MM_PERM_ENUM = 0x6C;
pub const _MM_PERM_BCDB: _MM_PERM_ENUM = 0x6D;
pub const _MM_PERM_BCDC: _MM_PERM_ENUM = 0x6E;
pub const _MM_PERM_BCDD: _MM_PERM_ENUM = 0x6F;
pub const _MM_PERM_BDAA: _MM_PERM_ENUM = 0x70;
pub const _MM_PERM_BDAB: _MM_PERM_ENUM = 0x71;
pub const _MM_PERM_BDAC: _MM_PERM_ENUM = 0x72;
pub const _MM_PERM_BDAD: _MM_PERM_ENUM = 0x73;
pub const _MM_PERM_BDBA: _MM_PERM_ENUM = 0x74;
pub const _MM_PERM_BDBB: _MM_PERM_ENUM = 0x75;
pub const _MM_PERM_BDBC: _MM_PERM_ENUM = 0x76;
pub const _MM_PERM_BDBD: _MM_PERM_ENUM = 0x77;
pub const _MM_PERM_BDCA: _MM_PERM_ENUM = 0x78;
pub const _MM_PERM_BDCB: _MM_PERM_ENUM = 0x79;
pub const _MM_PERM_BDCC: _MM_PERM_ENUM = 0x7A;
pub const _MM_PERM_BDCD: _MM_PERM_ENUM = 0x7B;
pub const _MM_PERM_BDDA: _MM_PERM_ENUM = 0x7C;
pub const _MM_PERM_BDDB: _MM_PERM_ENUM = 0x7D;
pub const _MM_PERM_BDDC: _MM_PERM_ENUM = 0x7E;
pub const _MM_PERM_BDDD: _MM_PERM_ENUM = 0x7F;
pub const _MM_PERM_CAAA: _MM_PERM_ENUM = 0x80;
pub const _MM_PERM_CAAB: _MM_PERM_ENUM = 0x81;
pub const _MM_PERM_CAAC: _MM_PERM_ENUM = 0x82;
pub const _MM_PERM_CAAD: _MM_PERM_ENUM = 0x83;
pub const _MM_PERM_CABA: _MM_PERM_ENUM = 0x84;
pub const _MM_PERM_CABB: _MM_PERM_ENUM = 0x85;
pub const _MM_PERM_CABC: _MM_PERM_ENUM = 0x86;
pub const _MM_PERM_CABD: _MM_PERM_ENUM = 0x87;
pub const _MM_PERM_CACA: _MM_PERM_ENUM = 0x88;
pub const _MM_PERM_CACB: _MM_PERM_ENUM = 0x89;
pub const _MM_PERM_CACC: _MM_PERM_ENUM = 0x8A;
pub const _MM_PERM_CACD: _MM_PERM_ENUM = 0x8B;
pub const _MM_PERM_CADA: _MM_PERM_ENUM = 0x8C;
pub const _MM_PERM_CADB: _MM_PERM_ENUM = 0x8D;
pub const _MM_PERM_CADC: _MM_PERM_ENUM = 0x8E;
pub const _MM_PERM_CADD: _MM_PERM_ENUM = 0x8F;
pub const _MM_PERM_CBAA: _MM_PERM_ENUM = 0x90;
pub const _MM_PERM_CBAB: _MM_PERM_ENUM = 0x91;
pub const _MM_PERM_CBAC: _MM_PERM_ENUM = 0x92;
pub const _MM_PERM_CBAD: _MM_PERM_ENUM = 0x93;
pub const _MM_PERM_CBBA: _MM_PERM_ENUM = 0x94;
pub const _MM_PERM_CBBB: _MM_PERM_ENUM = 0x95;
pub const _MM_PERM_CBBC: _MM_PERM_ENUM = 0x96;
pub const _MM_PERM_CBBD: _MM_PERM_ENUM = 0x97;
pub const _MM_PERM_CBCA: _MM_PERM_ENUM = 0x98;
pub const _MM_PERM_CBCB: _MM_PERM_ENUM = 0x99;
pub const _MM_PERM_CBCC: _MM_PERM_ENUM = 0x9A;
pub const _MM_PERM_CBCD: _MM_PERM_ENUM = 0x9B;
pub const _MM_PERM_CBDA: _MM_PERM_ENUM = 0x9C;
pub const _MM_PERM_CBDB: _MM_PERM_ENUM = 0x9D;
pub const _MM_PERM_CBDC: _MM_PERM_ENUM = 0x9E;
pub const _MM_PERM_CBDD: _MM_PERM_ENUM = 0x9F;
pub const _MM_PERM_CCAA: _MM_PERM_ENUM = 0xA0;
pub const _MM_PERM_CCAB: _MM_PERM_ENUM = 0xA1;
pub const _MM_PERM_CCAC: _MM_PERM_ENUM = 0xA2;
pub const _MM_PERM_CCAD: _MM_PERM_ENUM = 0xA3;
pub const _MM_PERM_CCBA: _MM_PERM_ENUM = 0xA4;
pub const _MM_PERM_CCBB: _MM_PERM_ENUM = 0xA5;
pub const _MM_PERM_CCBC: _MM_PERM_ENUM = 0xA6;
pub const _MM_PERM_CCBD: _MM_PERM_ENUM = 0xA7;
pub const _MM_PERM_CCCA: _MM_PERM_ENUM = 0xA8;
pub const _MM_PERM_CCCB: _MM_PERM_ENUM = 0xA9;
pub const _MM_PERM_CCCC: _MM_PERM_ENUM = 0xAA;
pub const _MM_PERM_CCCD: _MM_PERM_ENUM = 0xAB;
pub const _MM_PERM_CCDA: _MM_PERM_ENUM = 0xAC;
pub const _MM_PERM_CCDB: _MM_PERM_ENUM = 0xAD;
pub const _MM_PERM_CCDC: _MM_PERM_ENUM = 0xAE;
pub const _MM_PERM_CCDD: _MM_PERM_ENUM = 0xAF;
pub const _MM_PERM_CDAA: _MM_PERM_ENUM = 0xB0;
pub const _MM_PERM_CDAB: _MM_PERM_ENUM = 0xB1;
pub const _MM_PERM_CDAC: _MM_PERM_ENUM = 0xB2;
pub const _MM_PERM_CDAD: _MM_PERM_ENUM = 0xB3;
pub const _MM_PERM_CDBA: _MM_PERM_ENUM = 0xB4;
pub const _MM_PERM_CDBB: _MM_PERM_ENUM = 0xB5;
pub const _MM_PERM_CDBC: _MM_PERM_ENUM = 0xB6;
pub const _MM_PERM_CDBD: _MM_PERM_ENUM = 0xB7;
pub const _MM_PERM_CDCA: _MM_PERM_ENUM = 0xB8;
pub const _MM_PERM_CDCB: _MM_PERM_ENUM = 0xB9;
pub const _MM_PERM_CDCC: _MM_PERM_ENUM = 0xBA;
pub const _MM_PERM_CDCD: _MM_PERM_ENUM = 0xBB;
pub const _MM_PERM_CDDA: _MM_PERM_ENUM = 0xBC;
pub const _MM_PERM_CDDB: _MM_PERM_ENUM = 0xBD;
pub const _MM_PERM_CDDC: _MM_PERM_ENUM = 0xBE;
pub const _MM_PERM_CDDD: _MM_PERM_ENUM = 0xBF;
pub const _MM_PERM_DAAA: _MM_PERM_ENUM = 0xC0;
pub const _MM_PERM_DAAB: _MM_PERM_ENUM = 0xC1;
pub const _MM_PERM_DAAC: _MM_PERM_ENUM = 0xC2;
pub const _MM_PERM_DAAD: _MM_PERM_ENUM = 0xC3;
pub const _MM_PERM_DABA: _MM_PERM_ENUM = 0xC4;
pub const _MM_PERM_DABB: _MM_PERM_ENUM = 0xC5;
pub const _MM_PERM_DABC: _MM_PERM_ENUM = 0xC6;
pub const _MM_PERM_DABD: _MM_PERM_ENUM = 0xC7;
pub const _MM_PERM_DACA: _MM_PERM_ENUM = 0xC8;
pub const _MM_PERM_DACB: _MM_PERM_ENUM = 0xC9;
pub const _MM_PERM_DACC: _MM_PERM_ENUM = 0xCA;
pub const _MM_PERM_DACD: _MM_PERM_ENUM = 0xCB;
pub const _MM_PERM_DADA: _MM_PERM_ENUM = 0xCC;
pub const _MM_PERM_DADB: _MM_PERM_ENUM = 0xCD;
pub const _MM_PERM_DADC: _MM_PERM_ENUM = 0xCE;
pub const _MM_PERM_DADD: _MM_PERM_ENUM = 0xCF;
pub const _MM_PERM_DBAA: _MM_PERM_ENUM = 0xD0;
pub const _MM_PERM_DBAB: _MM_PERM_ENUM = 0xD1;
pub const _MM_PERM_DBAC: _MM_PERM_ENUM = 0xD2;
pub const _MM_PERM_DBAD: _MM_PERM_ENUM = 0xD3;
pub const _MM_PERM_DBBA: _MM_PERM_ENUM = 0xD4;
pub const _MM_PERM_DBBB: _MM_PERM_ENUM = 0xD5;
pub const _MM_PERM_DBBC: _MM_PERM_ENUM = 0xD6;
pub const _MM_PERM_DBBD: _MM_PERM_ENUM = 0xD7;
pub const _MM_PERM_DBCA: _MM_PERM_ENUM = 0xD8;
pub const _MM_PERM_DBCB: _MM_PERM_ENUM = 0xD9;
pub const _MM_PERM_DBCC: _MM_PERM_ENUM = 0xDA;
pub const _MM_PERM_DBCD: _MM_PERM_ENUM = 0xDB;
pub const _MM_PERM_DBDA: _MM_PERM_ENUM = 0xDC;
pub const _MM_PERM_DBDB: _MM_PERM_ENUM = 0xDD;
pub const _MM_PERM_DBDC: _MM_PERM_ENUM = 0xDE;
pub const _MM_PERM_DBDD: _MM_PERM_ENUM = 0xDF;
pub const _MM_PERM_DCAA: _MM_PERM_ENUM = 0xE0;
pub const _MM_PERM_DCAB: _MM_PERM_ENUM = 0xE1;
pub const _MM_PERM_DCAC: _MM_PERM_ENUM = 0xE2;
pub const _MM_PERM_DCAD: _MM_PERM_ENUM = 0xE3;
pub const _MM_PERM_DCBA: _MM_PERM_ENUM = 0xE4;
pub const _MM_PERM_DCBB: _MM_PERM_ENUM = 0xE5;
pub const _MM_PERM_DCBC: _MM_PERM_ENUM = 0xE6;
pub const _MM_PERM_DCBD: _MM_PERM_ENUM = 0xE7;
pub const _MM_PERM_DCCA: _MM_PERM_ENUM = 0xE8;
pub const _MM_PERM_DCCB: _MM_PERM_ENUM = 0xE9;
pub const _MM_PERM_DCCC: _MM_PERM_ENUM = 0xEA;
pub const _MM_PERM_DCCD: _MM_PERM_ENUM = 0xEB;
pub const _MM_PERM_DCDA: _MM_PERM_ENUM = 0xEC;
pub const _MM_PERM_DCDB: _MM_PERM_ENUM = 0xED;
pub const _MM_PERM_DCDC: _MM_PERM_ENUM = 0xEE;
pub const _MM_PERM_DCDD: _MM_PERM_ENUM = 0xEF;
pub const _MM_PERM_DDAA: _MM_PERM_ENUM = 0xF0;
pub const _MM_PERM_DDAB: _MM_PERM_ENUM = 0xF1;
pub const _MM_PERM_DDAC: _MM_PERM_ENUM = 0xF2;
pub const _MM_PERM_DDAD: _MM_PERM_ENUM = 0xF3;
pub const _MM_PERM_DDBA: _MM_PERM_ENUM = 0xF4;
pub const _MM_PERM_DDBB: _MM_PERM_ENUM = 0xF5;
pub const _MM_PERM_DDBC: _MM_PERM_ENUM = 0xF6;
pub const _MM_PERM_DDBD: _MM_PERM_ENUM = 0xF7;
pub const _MM_PERM_DDCA: _MM_PERM_ENUM = 0xF8;
pub const _MM_PERM_DDCB: _MM_PERM_ENUM = 0xF9;
pub const _MM_PERM_DDCC: _MM_PERM_ENUM = 0xFA;
pub const _MM_PERM_DDCD: _MM_PERM_ENUM = 0xFB;
pub const _MM_PERM_DDDA: _MM_PERM_ENUM = 0xFC;
pub const _MM_PERM_DDDB: _MM_PERM_ENUM = 0xFD;
pub const _MM_PERM_DDDC: _MM_PERM_ENUM = 0xFE;
pub const _MM_PERM_DDDD: _MM_PERM_ENUM = 0xFF;

#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.avx512.pmul.dq.512"]
    fn vpmuldq(a: i32x16, b: i32x16) -> i64x8;
    #[link_name = "llvm.x86.avx512.pmulu.dq.512"]
    fn vpmuludq(a: u32x16, b: u32x16) -> u64x8;

    #[link_name = "llvm.x86.avx512.mask.pmaxs.d.512"]
    fn vpmaxsd(a: i32x16, b: i32x16) -> i32x16;

    #[link_name = "llvm.x86.avx512.mask.pmaxs.q.512"]
    fn vpmaxsq(a: i64x8, b: i64x8) -> i64x8;
    #[link_name = "llvm.x86.avx512.mask.pmaxs.q.256"]
    fn vpmaxsq256(a: i64x4, b: i64x4) -> i64x4;
    #[link_name = "llvm.x86.avx512.mask.pmaxs.q.128"]
    fn vpmaxsq128(a: i64x2, b: i64x2) -> i64x2;

    #[link_name = "llvm.x86.avx512.mask.pmins.d.512"]
    fn vpminsd(a: i32x16, b: i32x16) -> i32x16;

    #[link_name = "llvm.x86.avx512.mask.pmins.q.512"]
    fn vpminsq(a: i64x8, b: i64x8) -> i64x8;
    #[link_name = "llvm.x86.avx512.mask.pmins.q.256"]
    fn vpminsq256(a: i64x4, b: i64x4) -> i64x4;
    #[link_name = "llvm.x86.avx512.mask.pmins.q.128"]
    fn vpminsq128(a: i64x2, b: i64x2) -> i64x2;

    #[link_name = "llvm.x86.avx512.mask.pmaxu.d.512"]
    fn vpmaxud(a: u32x16, b: u32x16) -> u32x16;

    #[link_name = "llvm.x86.avx512.mask.pmaxu.q.512"]
    fn vpmaxuq(a: u64x8, b: u64x8) -> u64x8;
    #[link_name = "llvm.x86.avx512.mask.pmaxu.q.256"]
    fn vpmaxuq256(a: u64x4, b: u64x4) -> u64x4;
    #[link_name = "llvm.x86.avx512.mask.pmaxu.q.128"]
    fn vpmaxuq128(a: u64x2, b: u64x2) -> u64x2;

    #[link_name = "llvm.x86.avx512.mask.pminu.d.512"]
    fn vpminud(a: u32x16, b: u32x16) -> u32x16;

    #[link_name = "llvm.x86.avx512.mask.pminu.q.512"]
    fn vpminuq(a: u64x8, b: u64x8) -> u64x8;
    #[link_name = "llvm.x86.avx512.mask.pminu.q.256"]
    fn vpminuq256(a: u64x4, b: u64x4) -> u64x4;
    #[link_name = "llvm.x86.avx512.mask.pminu.q.128"]
    fn vpminuq128(a: u64x2, b: u64x2) -> u64x2;

    #[link_name = "llvm.x86.avx512.sqrt.ps.512"]
    fn vsqrtps(a: f32x16, rounding: i32) -> f32x16;
    #[link_name = "llvm.x86.avx512.sqrt.pd.512"]
    fn vsqrtpd(a: f64x8, rounding: i32) -> f64x8;

    #[link_name = "llvm.fma.v16f32"]
    fn vfmadd132ps(a: f32x16, b: f32x16, c: f32x16) -> f32x16;
    #[link_name = "llvm.fma.v8f64"]
    fn vfmadd132pd(a: f64x8, b: f64x8, c: f64x8) -> f64x8;

    #[link_name = "llvm.x86.avx512.vfmadd.ps.512"]
    fn vfmadd132psround(a: f32x16, b: f32x16, c: f32x16, rounding: i32) -> f32x16;
    #[link_name = "llvm.x86.avx512.vfmadd.pd.512"]
    fn vfmadd132pdround(a: f64x8, b: f64x8, c: f64x8, rounding: i32) -> f64x8;

    #[link_name = "llvm.x86.avx512.vfmaddsub.ps.512"]
    fn vfmaddsub213ps(a: f32x16, b: f32x16, c: f32x16, d: i32) -> f32x16; //from clang
    #[link_name = "llvm.x86.avx512.vfmaddsub.pd.512"]
    fn vfmaddsub213pd(a: f64x8, b: f64x8, c: f64x8, d: i32) -> f64x8; //from clang

    #[link_name = "llvm.x86.avx512.add.ps.512"]
    fn vaddps(a: f32x16, b: f32x16, rounding: i32) -> f32x16;
    #[link_name = "llvm.x86.avx512.add.pd.512"]
    fn vaddpd(a: f64x8, b: f64x8, rounding: i32) -> f64x8;
    #[link_name = "llvm.x86.avx512.sub.ps.512"]
    fn vsubps(a: f32x16, b: f32x16, rounding: i32) -> f32x16;
    #[link_name = "llvm.x86.avx512.sub.pd.512"]
    fn vsubpd(a: f64x8, b: f64x8, rounding: i32) -> f64x8;
    #[link_name = "llvm.x86.avx512.mul.ps.512"]
    fn vmulps(a: f32x16, b: f32x16, rounding: i32) -> f32x16;
    #[link_name = "llvm.x86.avx512.mul.pd.512"]
    fn vmulpd(a: f64x8, b: f64x8, rounding: i32) -> f64x8;
    #[link_name = "llvm.x86.avx512.div.ps.512"]
    fn vdivps(a: f32x16, b: f32x16, rounding: i32) -> f32x16;
    #[link_name = "llvm.x86.avx512.div.pd.512"]
    fn vdivpd(a: f64x8, b: f64x8, rounding: i32) -> f64x8;

    #[link_name = "llvm.x86.avx512.max.ps.512"]
    fn vmaxps(a: f32x16, b: f32x16, sae: i32) -> f32x16;
    #[link_name = "llvm.x86.avx512.max.pd.512"]
    fn vmaxpd(a: f64x8, b: f64x8, sae: i32) -> f64x8;
    #[link_name = "llvm.x86.avx512.min.ps.512"]
    fn vminps(a: f32x16, b: f32x16, sae: i32) -> f32x16;
    #[link_name = "llvm.x86.avx512.min.pd.512"]
    fn vminpd(a: f64x8, b: f64x8, sae: i32) -> f64x8;

    #[link_name = "llvm.x86.avx512.mask.getexp.ps.512"]
    fn vgetexpps(a: f32x16, src: f32x16, m: u16, sae: i32) -> f32x16;

    #[link_name = "llvm.x86.avx512.mask.getexp.ps.256"]
    fn vgetexpps256(a: f32x8, src: f32x8, m: u8) -> f32x8;
    #[link_name = "llvm.x86.avx512.mask.getexp.ps.128"]
    fn vgetexpps128(a: f32x4, src: f32x4, m: u8) -> f32x4;

    #[link_name = "llvm.x86.avx512.mask.getexp.pd.512"]
    fn vgetexppd(a: f64x8, src: f64x8, m: u8, sae: i32) -> f64x8;
    #[link_name = "llvm.x86.avx512.mask.getexp.pd.256"]
    fn vgetexppd256(a: f64x4, src: f64x4, m: u8) -> f64x4;
    #[link_name = "llvm.x86.avx512.mask.getexp.pd.128"]
    fn vgetexppd128(a: f64x2, src: f64x2, m: u8) -> f64x2;

    #[link_name = "llvm.x86.avx512.mask.rndscale.ps.512"]
    fn vrndscaleps(a: f32x16, imm8: i32, src: f32x16, mask: u16, sae: i32) -> f32x16;
    #[link_name = "llvm.x86.avx512.mask.rndscale.ps.256"]
    fn vrndscaleps256(a: f32x8, imm8: i32, src: f32x8, mask: u8) -> f32x8;
    #[link_name = "llvm.x86.avx512.mask.rndscale.ps.128"]
    fn vrndscaleps128(a: f32x4, imm8: i32, src: f32x4, mask: u8) -> f32x4;

    #[link_name = "llvm.x86.avx512.mask.rndscale.pd.512"]
    fn vrndscalepd(a: f64x8, imm8: i32, src: f64x8, mask: u8, sae: i32) -> f64x8;
    #[link_name = "llvm.x86.avx512.mask.rndscale.pd.256"]
    fn vrndscalepd256(a: f64x4, imm8: i32, src: f64x4, mask: u8) -> f64x4;
    #[link_name = "llvm.x86.avx512.mask.rndscale.pd.128"]
    fn vrndscalepd128(a: f64x2, imm8: i32, src: f64x2, mask: u8) -> f64x2;

    #[link_name = "llvm.x86.avx512.mask.scalef.ps.512"]
    fn vscalefps(a: f32x16, b: f32x16, src: f32x16, mask: u16, rounding: i32) -> f32x16;
    #[link_name = "llvm.x86.avx512.mask.scalef.ps.256"]
    fn vscalefps256(a: f32x8, b: f32x8, src: f32x8, mask: u8) -> f32x8;
    #[link_name = "llvm.x86.avx512.mask.scalef.ps.128"]
    fn vscalefps128(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4;

    #[link_name = "llvm.x86.avx512.mask.scalef.pd.512"]
    fn vscalefpd(a: f64x8, b: f64x8, src: f64x8, mask: u8, rounding: i32) -> f64x8;
    #[link_name = "llvm.x86.avx512.mask.scalef.pd.256"]
    fn vscalefpd256(a: f64x4, b: f64x4, src: f64x4, mask: u8) -> f64x4;
    #[link_name = "llvm.x86.avx512.mask.scalef.pd.128"]
    fn vscalefpd128(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2;

    #[link_name = "llvm.x86.avx512.mask.fixupimm.ps.512"]
    fn vfixupimmps(a: f32x16, b: f32x16, c: i32x16, imm8: i32, mask: u16, sae: i32) -> f32x16;
    #[link_name = "llvm.x86.avx512.mask.fixupimm.ps.256"]
    fn vfixupimmps256(a: f32x8, b: f32x8, c: i32x8, imm8: i32, mask: u8) -> f32x8;
    #[link_name = "llvm.x86.avx512.mask.fixupimm.ps.128"]
    fn vfixupimmps128(a: f32x4, b: f32x4, c: i32x4, imm8: i32, mask: u8) -> f32x4;

    #[link_name = "llvm.x86.avx512.mask.fixupimm.pd.512"]
    fn vfixupimmpd(a: f64x8, b: f64x8, c: i64x8, imm8: i32, mask: u8, sae: i32) -> f64x8;
    #[link_name = "llvm.x86.avx512.mask.fixupimm.pd.256"]
    fn vfixupimmpd256(a: f64x4, b: f64x4, c: i64x4, imm8: i32, mask: u8) -> f64x4;
    #[link_name = "llvm.x86.avx512.mask.fixupimm.pd.128"]
    fn vfixupimmpd128(a: f64x2, b: f64x2, c: i64x2, imm8: i32, mask: u8) -> f64x2;

    #[link_name = "llvm.x86.avx512.maskz.fixupimm.ps.512"]
    fn vfixupimmpsz(a: f32x16, b: f32x16, c: i32x16, imm8: i32, mask: u16, sae: i32) -> f32x16;
    #[link_name = "llvm.x86.avx512.maskz.fixupimm.ps.256"]
    fn vfixupimmpsz256(a: f32x8, b: f32x8, c: i32x8, imm8: i32, mask: u8) -> f32x8;
    #[link_name = "llvm.x86.avx512.maskz.fixupimm.ps.128"]
    fn vfixupimmpsz128(a: f32x4, b: f32x4, c: i32x4, imm8: i32, mask: u8) -> f32x4;

    #[link_name = "llvm.x86.avx512.maskz.fixupimm.pd.512"]
    fn vfixupimmpdz(a: f64x8, b: f64x8, c: i64x8, imm8: i32, mask: u8, sae: i32) -> f64x8;
    #[link_name = "llvm.x86.avx512.maskz.fixupimm.pd.256"]
    fn vfixupimmpdz256(a: f64x4, b: f64x4, c: i64x4, imm8: i32, mask: u8) -> f64x4;
    #[link_name = "llvm.x86.avx512.maskz.fixupimm.pd.128"]
    fn vfixupimmpdz128(a: f64x2, b: f64x2, c: i64x2, imm8: i32, mask: u8) -> f64x2;

    #[link_name = "llvm.x86.avx512.pternlog.d.512"]
    fn vpternlogd(a: i32x16, b: i32x16, c: i32x16, imm8: i32) -> i32x16;
    #[link_name = "llvm.x86.avx512.pternlog.d.256"]
    fn vpternlogd256(a: i32x8, b: i32x8, c: i32x8, imm8: i32) -> i32x8;
    #[link_name = "llvm.x86.avx512.pternlog.d.128"]
    fn vpternlogd128(a: i32x4, b: i32x4, c: i32x4, imm8: i32) -> i32x4;

    #[link_name = "llvm.x86.avx512.pternlog.q.512"]
    fn vpternlogq(a: i64x8, b: i64x8, c: i64x8, imm8: i32) -> i64x8;
    #[link_name = "llvm.x86.avx512.pternlog.q.256"]
    fn vpternlogq256(a: i64x4, b: i64x4, c: i64x4, imm8: i32) -> i64x4;
    #[link_name = "llvm.x86.avx512.pternlog.q.128"]
    fn vpternlogq128(a: i64x2, b: i64x2, c: i64x2, imm8: i32) -> i64x2;

    #[link_name = "llvm.x86.avx512.mask.getmant.ps.512"]
    fn vgetmantps(a: f32x16, mantissas: i32, src: f32x16, m: u16, sae: i32) -> f32x16;
    #[link_name = "llvm.x86.avx512.mask.getmant.ps.256"]
    fn vgetmantps256(a: f32x8, mantissas: i32, src: f32x8, m: u8) -> f32x8;
    #[link_name = "llvm.x86.avx512.mask.getmant.ps.128"]
    fn vgetmantps128(a: f32x4, mantissas: i32, src: f32x4, m: u8) -> f32x4;

    #[link_name = "llvm.x86.avx512.mask.getmant.pd.512"]
    fn vgetmantpd(a: f64x8, mantissas: i32, src: f64x8, m: u8, sae: i32) -> f64x8;
    #[link_name = "llvm.x86.avx512.mask.getmant.pd.256"]
    fn vgetmantpd256(a: f64x4, mantissas: i32, src: f64x4, m: u8) -> f64x4;
    #[link_name = "llvm.x86.avx512.mask.getmant.pd.128"]
    fn vgetmantpd128(a: f64x2, mantissas: i32, src: f64x2, m: u8) -> f64x2;

    #[link_name = "llvm.x86.avx512.rcp14.ps.512"]
    fn vrcp14ps(a: f32x16, src: f32x16, m: u16) -> f32x16;
    #[link_name = "llvm.x86.avx512.rcp14.ps.256"]
    fn vrcp14ps256(a: f32x8, src: f32x8, m: u8) -> f32x8;
    #[link_name = "llvm.x86.avx512.rcp14.ps.128"]
    fn vrcp14ps128(a: f32x4, src: f32x4, m: u8) -> f32x4;

    #[link_name = "llvm.x86.avx512.rcp14.pd.512"]
    fn vrcp14pd(a: f64x8, src: f64x8, m: u8) -> f64x8;
    #[link_name = "llvm.x86.avx512.rcp14.pd.256"]
    fn vrcp14pd256(a: f64x4, src: f64x4, m: u8) -> f64x4;
    #[link_name = "llvm.x86.avx512.rcp14.pd.128"]
    fn vrcp14pd128(a: f64x2, src: f64x2, m: u8) -> f64x2;

    #[link_name = "llvm.x86.avx512.rsqrt14.ps.512"]
    fn vrsqrt14ps(a: f32x16, src: f32x16, m: u16) -> f32x16;
    #[link_name = "llvm.x86.avx512.rsqrt14.ps.256"]
    fn vrsqrt14ps256(a: f32x8, src: f32x8, m: u8) -> f32x8;
    #[link_name = "llvm.x86.avx512.rsqrt14.ps.128"]
    fn vrsqrt14ps128(a: f32x4, src: f32x4, m: u8) -> f32x4;

    #[link_name = "llvm.x86.avx512.rsqrt14.pd.512"]
    fn vrsqrt14pd(a: f64x8, src: f64x8, m: u8) -> f64x8;
    #[link_name = "llvm.x86.avx512.rsqrt14.pd.256"]
    fn vrsqrt14pd256(a: f64x4, src: f64x4, m: u8) -> f64x4;
    #[link_name = "llvm.x86.avx512.rsqrt14.pd.128"]
    fn vrsqrt14pd128(a: f64x2, src: f64x2, m: u8) -> f64x2;

    // f32 -> 32-bit integer conversions (`llvm.x86.avx512.mask.cvtps2dq.512`
    // and `llvm.x86.avx512.mask.cvtps2udq.*`). The signed form returns `i32`
    // lanes, the `u` forms return `u32` lanes; `src` always has the result
    // type. Only the 512-bit bindings carry a trailing `rounding: i32`
    // argument; the 256/128-bit bindings stop at the mask.
    // Only the 512-bit signed binding is declared in this group.
    #[link_name = "llvm.x86.avx512.mask.cvtps2dq.512"]
    fn vcvtps2dq(a: f32x16, src: i32x16, mask: u16, rounding: i32) -> i32x16;

    // Unsigned-result variants at all three widths.
    #[link_name = "llvm.x86.avx512.mask.cvtps2udq.512"]
    fn vcvtps2udq(a: f32x16, src: u32x16, mask: u16, rounding: i32) -> u32x16;
    #[link_name = "llvm.x86.avx512.mask.cvtps2udq.256"]
    fn vcvtps2udq256(a: f32x8, src: u32x8, mask: u8) -> u32x8;
    #[link_name = "llvm.x86.avx512.mask.cvtps2udq.128"]
    fn vcvtps2udq128(a: f32x4, src: u32x4, mask: u8) -> u32x4;

    // 512-bit float <-> double conversions. Both directions pair 8 `f32`
    // lanes with 8 `f64` lanes and take a `src` vector of the result type plus
    // a `u8` mask. Note the differing last argument: the widening form names
    // it `sae`, the narrowing form names it `rounding`.
    #[link_name = "llvm.x86.avx512.mask.cvtps2pd.512"]
    fn vcvtps2pd(a: f32x8, src: f64x8, mask: u8, sae: i32) -> f64x8;
    #[link_name = "llvm.x86.avx512.mask.cvtpd2ps.512"]
    fn vcvtpd2ps(a: f64x8, src: f32x8, mask: u8, rounding: i32) -> f32x8;

    // f64 -> 32-bit integer conversions (`llvm.x86.avx512.mask.cvtpd2dq.512`
    // and `llvm.x86.avx512.mask.cvtpd2udq.*`). Only the 512-bit bindings take
    // a trailing `rounding: i32`.
    #[link_name = "llvm.x86.avx512.mask.cvtpd2dq.512"]
    fn vcvtpd2dq(a: f64x8, src: i32x8, mask: u8, rounding: i32) -> i32x8;

    // Unsigned-result variants. The 128-bit form takes two `f64` lanes but
    // its `src` and result are typed `u32x4`, i.e. four lanes.
    // NOTE(review): the contents of the result lanes beyond the converted
    // inputs are not visible from this declaration; check the intrinsic docs.
    #[link_name = "llvm.x86.avx512.mask.cvtpd2udq.512"]
    fn vcvtpd2udq(a: f64x8, src: u32x8, mask: u8, rounding: i32) -> u32x8;
    #[link_name = "llvm.x86.avx512.mask.cvtpd2udq.256"]
    fn vcvtpd2udq256(a: f64x4, src: u32x4, mask: u8) -> u32x4;
    #[link_name = "llvm.x86.avx512.mask.cvtpd2udq.128"]
    fn vcvtpd2udq128(a: f64x2, src: u32x4, mask: u8) -> u32x4;

    // 512-bit integer -> f32 conversions with an explicit `rounding` argument
    // (`sitofp.round` for `i32` lanes, `uitofp.round` for `u32` lanes). Unlike
    // the neighbouring conversion bindings these take no `src` or mask.
    #[link_name = "llvm.x86.avx512.sitofp.round.v16f32.v16i32"]
    fn vcvtdq2ps(a: i32x16, rounding: i32) -> f32x16;
    #[link_name = "llvm.x86.avx512.uitofp.round.v16f32.v16i32"]
    fn vcvtudq2ps(a: u32x16, rounding: i32) -> f32x16;

    // f32 -> half-precision bindings (`llvm.x86.avx512.mask.vcvtps2ph.*`).
    // The half-precision values are carried in `i16` lanes. Argument order is
    // (`a`, `sae`, `src`, `mask`): the `i32` comes *before* `src`/`mask`,
    // and it is present at every width. The 128-bit form takes four `f32`
    // lanes but its `src`/result type is still `i16x8`.
    #[link_name = "llvm.x86.avx512.mask.vcvtps2ph.512"]
    fn vcvtps2ph(a: f32x16, sae: i32, src: i16x16, mask: u16) -> i16x16;
    #[link_name = "llvm.x86.avx512.mask.vcvtps2ph.256"]
    fn vcvtps2ph256(a: f32x8, sae: i32, src: i16x8, mask: u8) -> i16x8;
    #[link_name = "llvm.x86.avx512.mask.vcvtps2ph.128"]
    fn vcvtps2ph128(a: f32x4, sae: i32, src: i16x8, mask: u8) -> i16x8;

    // Half-precision -> f32, 512-bit only in this group. Here `sae` is the
    // last argument, after `src` and `mask`.
    #[link_name = "llvm.x86.avx512.mask.vcvtph2ps.512"]
    fn vcvtph2ps(a: i16x16, src: f32x16, mask: u16, sae: i32) -> f32x16;

    // Bindings to `llvm.x86.avx512.mask.cvttps2dq.*` (signed `i32` result) and
    // `llvm.x86.avx512.mask.cvttps2udq.*` (unsigned `u32` result), f32 input.
    // `src` has the result type. Only the 512-bit bindings take a fourth
    // `i32` argument, which is named `rounding` here.
    // NOTE(review): the extra `t` in the intrinsic name presumably denotes
    // the truncating conversion; confirm against Intel's documentation.

    // Signed result.
    #[link_name = "llvm.x86.avx512.mask.cvttps2dq.512"]
    fn vcvttps2dq(a: f32x16, src: i32x16, mask: u16, rounding: i32) -> i32x16;
    #[link_name = "llvm.x86.avx512.mask.cvttps2dq.256"]
    fn vcvttps2dq256(a: f32x8, src: i32x8, mask: u8) -> i32x8;
    #[link_name = "llvm.x86.avx512.mask.cvttps2dq.128"]
    fn vcvttps2dq128(a: f32x4, src: i32x4, mask: u8) -> i32x4;

    // Unsigned result.
    #[link_name = "llvm.x86.avx512.mask.cvttps2udq.512"]
    fn vcvttps2udq(a: f32x16, src: u32x16, mask: u16, rounding: i32) -> u32x16;
    #[link_name = "llvm.x86.avx512.mask.cvttps2udq.256"]
    fn vcvttps2udq256(a: f32x8, src: u32x8, mask: u8) -> u32x8;
    #[link_name = "llvm.x86.avx512.mask.cvttps2udq.128"]
    fn vcvttps2udq128(a: f32x4, src: u32x4, mask: u8) -> u32x4;

    // Bindings to `llvm.x86.avx512.mask.cvttpd2dq.*` and
    // `llvm.x86.avx512.mask.cvttpd2udq.*`, f64 input, 32-bit integer result.
    // Only the 512-bit bindings take a fourth `i32` argument (`rounding`).
    // The 256- and 128-bit forms both use four-lane 32-bit vectors for `src`
    // and the result, even though the 128-bit input has only two lanes.

    // Signed result.
    #[link_name = "llvm.x86.avx512.mask.cvttpd2dq.512"]
    fn vcvttpd2dq(a: f64x8, src: i32x8, mask: u8, rounding: i32) -> i32x8;
    #[link_name = "llvm.x86.avx512.mask.cvttpd2dq.256"]
    fn vcvttpd2dq256(a: f64x4, src: i32x4, mask: u8) -> i32x4;
    #[link_name = "llvm.x86.avx512.mask.cvttpd2dq.128"]
    fn vcvttpd2dq128(a: f64x2, src: i32x4, mask: u8) -> i32x4;

    // Unsigned result.
    // NOTE(review): these three declare `src` with signed `i32` lanes while
    // returning unsigned `u32` lanes. Callers must pass `src` accordingly;
    // whether the mismatch is intentional cannot be determined from here.
    #[link_name = "llvm.x86.avx512.mask.cvttpd2udq.512"]
    fn vcvttpd2udq(a: f64x8, src: i32x8, mask: u8, rounding: i32) -> u32x8;
    #[link_name = "llvm.x86.avx512.mask.cvttpd2udq.256"]
    fn vcvttpd2udq256(a: f64x4, src: i32x4, mask: u8) -> u32x4;
    #[link_name = "llvm.x86.avx512.mask.cvttpd2udq.128"]
    fn vcvttpd2udq128(a: f64x2, src: i32x4, mask: u8) -> u32x4;

    // Register-to-register bindings to `llvm.x86.avx512.mask.pmov.*` at 128
    // and 256 bits. The two-letter suffix in the intrinsic name pairs the
    // input and output lane widths as seen in the signatures
    // (d = 32-bit, q = 64-bit input; w = 16-bit, b = 8-bit, d = 32-bit output).
    // Every result (and `src`) is a 128-bit vector type, so it has more lanes
    // than the input provides.
    // NOTE(review): what the surplus result lanes hold is not visible here;
    // confirm against Intel's VPMOV* documentation.

    // 32-bit source lanes.
    #[link_name = "llvm.x86.avx512.mask.pmov.dw.128"]
    fn vpmovdw128(a: i32x4, src: i16x8, mask: u8) -> i16x8;
    #[link_name = "llvm.x86.avx512.mask.pmov.db.256"]
    fn vpmovdb256(a: i32x8, src: i8x16, mask: u8) -> i8x16;
    #[link_name = "llvm.x86.avx512.mask.pmov.db.128"]
    fn vpmovdb128(a: i32x4, src: i8x16, mask: u8) -> i8x16;

    // 64-bit source lanes.
    #[link_name = "llvm.x86.avx512.mask.pmov.qw.256"]
    fn vpmovqw256(a: i64x4, src: i16x8, mask: u8) -> i16x8;
    #[link_name = "llvm.x86.avx512.mask.pmov.qw.128"]
    fn vpmovqw128(a: i64x2, src: i16x8, mask: u8) -> i16x8;
    #[link_name = "llvm.x86.avx512.mask.pmov.qb.256"]
    fn vpmovqb256(a: i64x4, src: i8x16, mask: u8) -> i8x16;
    #[link_name = "llvm.x86.avx512.mask.pmov.qb.128"]
    fn vpmovqb128(a: i64x2, src: i8x16, mask: u8) -> i8x16;
    #[link_name = "llvm.x86.avx512.mask.pmov.qd.128"]
    fn vpmovqd128(a: i64x2, src: i32x4, mask: u8) -> i32x4;

    // Memory-destination bindings for the 32-bit -> 16-bit (`dw`) family:
    // `llvm.x86.avx512.mask.{pmov,pmovs,pmovus}.dw.mem.*`. Each takes a raw
    // `*mut i8` address, the source vector `a` and a mask, and returns
    // nothing. The source is typed with signed `i32` lanes in all three
    // flavours, including `pmovus`.
    // NOTE(review): `pmov`/`pmovs`/`pmovus` presumably mean truncating,
    // signed-saturating and unsigned-saturating narrowing; confirm against
    // Intel's VPMOV* documentation. Alignment requirements of `mem_addr` are
    // not expressed in these signatures.

    // Plain `pmov`.
    #[link_name = "llvm.x86.avx512.mask.pmov.dw.mem.512"]
    fn vpmovdwmem(mem_addr: *mut i8, a: i32x16, mask: u16);
    #[link_name = "llvm.x86.avx512.mask.pmov.dw.mem.256"]
    fn vpmovdwmem256(mem_addr: *mut i8, a: i32x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.pmov.dw.mem.128"]
    fn vpmovdwmem128(mem_addr: *mut i8, a: i32x4, mask: u8);

    // `pmovs`.
    #[link_name = "llvm.x86.avx512.mask.pmovs.dw.mem.512"]
    fn vpmovsdwmem(mem_addr: *mut i8, a: i32x16, mask: u16);
    #[link_name = "llvm.x86.avx512.mask.pmovs.dw.mem.256"]
    fn vpmovsdwmem256(mem_addr: *mut i8, a: i32x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.pmovs.dw.mem.128"]
    fn vpmovsdwmem128(mem_addr: *mut i8, a: i32x4, mask: u8);

    // `pmovus`.
    #[link_name = "llvm.x86.avx512.mask.pmovus.dw.mem.512"]
    fn vpmovusdwmem(mem_addr: *mut i8, a: i32x16, mask: u16);
    #[link_name = "llvm.x86.avx512.mask.pmovus.dw.mem.256"]
    fn vpmovusdwmem256(mem_addr: *mut i8, a: i32x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.pmovus.dw.mem.128"]
    fn vpmovusdwmem128(mem_addr: *mut i8, a: i32x4, mask: u8);

    // Memory-destination bindings for the 32-bit -> 8-bit (`db`) family:
    // `llvm.x86.avx512.mask.{pmov,pmovs,pmovus}.db.mem.*`. Each takes a raw
    // `*mut i8` address, the `i32`-lane source vector `a` and a mask (`u16`
    // for the 16-lane form, `u8` otherwise), and returns nothing.
    // NOTE(review): presumably a masked narrowing store; the exact narrowing
    // rule per flavour should be confirmed against Intel's documentation.

    // Plain `pmov`.
    #[link_name = "llvm.x86.avx512.mask.pmov.db.mem.512"]
    fn vpmovdbmem(mem_addr: *mut i8, a: i32x16, mask: u16);
    #[link_name = "llvm.x86.avx512.mask.pmov.db.mem.256"]
    fn vpmovdbmem256(mem_addr: *mut i8, a: i32x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.pmov.db.mem.128"]
    fn vpmovdbmem128(mem_addr: *mut i8, a: i32x4, mask: u8);

    // `pmovs`.
    #[link_name = "llvm.x86.avx512.mask.pmovs.db.mem.512"]
    fn vpmovsdbmem(mem_addr: *mut i8, a: i32x16, mask: u16);
    #[link_name = "llvm.x86.avx512.mask.pmovs.db.mem.256"]
    fn vpmovsdbmem256(mem_addr: *mut i8, a: i32x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.pmovs.db.mem.128"]
    fn vpmovsdbmem128(mem_addr: *mut i8, a: i32x4, mask: u8);

    // `pmovus`.
    #[link_name = "llvm.x86.avx512.mask.pmovus.db.mem.512"]
    fn vpmovusdbmem(mem_addr: *mut i8, a: i32x16, mask: u16);
    #[link_name = "llvm.x86.avx512.mask.pmovus.db.mem.256"]
    fn vpmovusdbmem256(mem_addr: *mut i8, a: i32x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.pmovus.db.mem.128"]
    fn vpmovusdbmem128(mem_addr: *mut i8, a: i32x4, mask: u8);

    // Memory-destination bindings for the 64-bit -> 16-bit (`qw`) family:
    // `llvm.x86.avx512.mask.{pmov,pmovs,pmovus}.qw.mem.*`. Each takes a raw
    // `*mut i8` address, the `i64`-lane source vector `a` and a `u8` mask, and
    // returns nothing.
    // NOTE(review): presumably a masked narrowing store; the exact narrowing
    // rule per flavour should be confirmed against Intel's documentation.

    // Plain `pmov`.
    #[link_name = "llvm.x86.avx512.mask.pmov.qw.mem.512"]
    fn vpmovqwmem(mem_addr: *mut i8, a: i64x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.pmov.qw.mem.256"]
    fn vpmovqwmem256(mem_addr: *mut i8, a: i64x4, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.pmov.qw.mem.128"]
    fn vpmovqwmem128(mem_addr: *mut i8, a: i64x2, mask: u8);

    // `pmovs`.
    #[link_name = "llvm.x86.avx512.mask.pmovs.qw.mem.512"]
    fn vpmovsqwmem(mem_addr: *mut i8, a: i64x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.pmovs.qw.mem.256"]
    fn vpmovsqwmem256(mem_addr: *mut i8, a: i64x4, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.pmovs.qw.mem.128"]
    fn vpmovsqwmem128(mem_addr: *mut i8, a: i64x2, mask: u8);

    // `pmovus`.
    #[link_name = "llvm.x86.avx512.mask.pmovus.qw.mem.512"]
    fn vpmovusqwmem(mem_addr: *mut i8, a: i64x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.pmovus.qw.mem.256"]
    fn vpmovusqwmem256(mem_addr: *mut i8, a: i64x4, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.pmovus.qw.mem.128"]
    fn vpmovusqwmem128(mem_addr: *mut i8, a: i64x2, mask: u8);

    // Memory-destination bindings for the 64-bit -> 8-bit (`qb`) family:
    // `llvm.x86.avx512.mask.{pmov,pmovs,pmovus}.qb.mem.*`. Each takes a raw
    // `*mut i8` address, the `i64`-lane source vector `a` and a `u8` mask, and
    // returns nothing.
    // NOTE(review): presumably a masked narrowing store; the exact narrowing
    // rule per flavour should be confirmed against Intel's documentation.

    // Plain `pmov`.
    #[link_name = "llvm.x86.avx512.mask.pmov.qb.mem.512"]
    fn vpmovqbmem(mem_addr: *mut i8, a: i64x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.pmov.qb.mem.256"]
    fn vpmovqbmem256(mem_addr: *mut i8, a: i64x4, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.pmov.qb.mem.128"]
    fn vpmovqbmem128(mem_addr: *mut i8, a: i64x2, mask: u8);

    // `pmovs`.
    #[link_name = "llvm.x86.avx512.mask.pmovs.qb.mem.512"]
    fn vpmovsqbmem(mem_addr: *mut i8, a: i64x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.pmovs.qb.mem.256"]
    fn vpmovsqbmem256(mem_addr: *mut i8, a: i64x4, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.pmovs.qb.mem.128"]
    fn vpmovsqbmem128(mem_addr: *mut i8, a: i64x2, mask: u8);

    // `pmovus`.
    #[link_name = "llvm.x86.avx512.mask.pmovus.qb.mem.512"]
    fn vpmovusqbmem(mem_addr: *mut i8, a: i64x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.pmovus.qb.mem.256"]
    fn vpmovusqbmem256(mem_addr: *mut i8, a: i64x4, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.pmovus.qb.mem.128"]
    fn vpmovusqbmem128(mem_addr: *mut i8, a: i64x2, mask: u8);

    // Memory-destination bindings for the 64-bit -> 32-bit (`qd`) family:
    // `llvm.x86.avx512.mask.{pmov,pmovs,pmovus}.qd.mem.*`. Each takes a raw
    // `*mut i8` address, the `i64`-lane source vector `a` and a `u8` mask, and
    // returns nothing.
    // NOTE(review): presumably a masked narrowing store; the exact narrowing
    // rule per flavour should be confirmed against Intel's documentation.

    // Plain `pmov`.
    #[link_name = "llvm.x86.avx512.mask.pmov.qd.mem.512"]
    fn vpmovqdmem(mem_addr: *mut i8, a: i64x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.pmov.qd.mem.256"]
    fn vpmovqdmem256(mem_addr: *mut i8, a: i64x4, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.pmov.qd.mem.128"]
    fn vpmovqdmem128(mem_addr: *mut i8, a: i64x2, mask: u8);

    // `pmovs`.
    #[link_name = "llvm.x86.avx512.mask.pmovs.qd.mem.512"]
    fn vpmovsqdmem(mem_addr: *mut i8, a: i64x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.pmovs.qd.mem.256"]
    fn vpmovsqdmem256(mem_addr: *mut i8, a: i64x4, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.pmovs.qd.mem.128"]
    fn vpmovsqdmem128(mem_addr: *mut i8, a: i64x2, mask: u8);

    // `pmovus`.
    #[link_name = "llvm.x86.avx512.mask.pmovus.qd.mem.512"]
    fn vpmovusqdmem(mem_addr: *mut i8, a: i64x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.pmovus.qd.mem.256"]
    fn vpmovusqdmem256(mem_addr: *mut i8, a: i64x4, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.pmovus.qd.mem.128"]
    fn vpmovusqdmem128(mem_addr: *mut i8, a: i64x2, mask: u8);

    // 512-bit register form of plain `pmov.qb`: eight `i64` lanes in, `i8x16`
    // `src`/result (16 lanes, i.e. more than the eight inputs).
    #[link_name = "llvm.x86.avx512.mask.pmov.qb.512"]
    fn vpmovqb(a: i64x8, src: i8x16, mask: u8) -> i8x16;

    // Register-to-register bindings to `llvm.x86.avx512.mask.pmovs.*` at all
    // three widths. All operands use signed lane types. `src` always has the
    // result type, and the result is never narrower than 128 bits, so the
    // smaller inputs yield results with more lanes than inputs.
    // NOTE(review): `pmovs` presumably denotes signed-saturating narrowing;
    // confirm against Intel's VPMOVS* documentation.

    // 32-bit -> 16-bit.
    #[link_name = "llvm.x86.avx512.mask.pmovs.dw.512"]
    fn vpmovsdw(a: i32x16, src: i16x16, mask: u16) -> i16x16;
    #[link_name = "llvm.x86.avx512.mask.pmovs.dw.256"]
    fn vpmovsdw256(a: i32x8, src: i16x8, mask: u8) -> i16x8;
    #[link_name = "llvm.x86.avx512.mask.pmovs.dw.128"]
    fn vpmovsdw128(a: i32x4, src: i16x8, mask: u8) -> i16x8;

    // 32-bit -> 8-bit.
    #[link_name = "llvm.x86.avx512.mask.pmovs.db.512"]
    fn vpmovsdb(a: i32x16, src: i8x16, mask: u16) -> i8x16;
    #[link_name = "llvm.x86.avx512.mask.pmovs.db.256"]
    fn vpmovsdb256(a: i32x8, src: i8x16, mask: u8) -> i8x16;
    #[link_name = "llvm.x86.avx512.mask.pmovs.db.128"]
    fn vpmovsdb128(a: i32x4, src: i8x16, mask: u8) -> i8x16;

    // 64-bit -> 32-bit.
    #[link_name = "llvm.x86.avx512.mask.pmovs.qd.512"]
    fn vpmovsqd(a: i64x8, src: i32x8, mask: u8) -> i32x8;
    #[link_name = "llvm.x86.avx512.mask.pmovs.qd.256"]
    fn vpmovsqd256(a: i64x4, src: i32x4, mask: u8) -> i32x4;
    #[link_name = "llvm.x86.avx512.mask.pmovs.qd.128"]
    fn vpmovsqd128(a: i64x2, src: i32x4, mask: u8) -> i32x4;

    // 64-bit -> 16-bit.
    #[link_name = "llvm.x86.avx512.mask.pmovs.qw.512"]
    fn vpmovsqw(a: i64x8, src: i16x8, mask: u8) -> i16x8;
    #[link_name = "llvm.x86.avx512.mask.pmovs.qw.256"]
    fn vpmovsqw256(a: i64x4, src: i16x8, mask: u8) -> i16x8;
    #[link_name = "llvm.x86.avx512.mask.pmovs.qw.128"]
    fn vpmovsqw128(a: i64x2, src: i16x8, mask: u8) -> i16x8;

    // 64-bit -> 8-bit.
    #[link_name = "llvm.x86.avx512.mask.pmovs.qb.512"]
    fn vpmovsqb(a: i64x8, src: i8x16, mask: u8) -> i8x16;
    #[link_name = "llvm.x86.avx512.mask.pmovs.qb.256"]
    fn vpmovsqb256(a: i64x4, src: i8x16, mask: u8) -> i8x16;
    #[link_name = "llvm.x86.avx512.mask.pmovs.qb.128"]
    fn vpmovsqb128(a: i64x2, src: i8x16, mask: u8) -> i8x16;

    // Register-to-register bindings to `llvm.x86.avx512.mask.pmovus.*` at all
    // three widths. Unlike the `pmovs` bindings, every operand here uses
    // unsigned lane types. `src` always has the result type, and the result
    // is never narrower than 128 bits.
    // NOTE(review): `pmovus` presumably denotes unsigned-saturating
    // narrowing; confirm against Intel's VPMOVUS* documentation.

    // 32-bit -> 16-bit.
    #[link_name = "llvm.x86.avx512.mask.pmovus.dw.512"]
    fn vpmovusdw(a: u32x16, src: u16x16, mask: u16) -> u16x16;
    #[link_name = "llvm.x86.avx512.mask.pmovus.dw.256"]
    fn vpmovusdw256(a: u32x8, src: u16x8, mask: u8) -> u16x8;
    #[link_name = "llvm.x86.avx512.mask.pmovus.dw.128"]
    fn vpmovusdw128(a: u32x4, src: u16x8, mask: u8) -> u16x8;

    // 32-bit -> 8-bit.
    #[link_name = "llvm.x86.avx512.mask.pmovus.db.512"]
    fn vpmovusdb(a: u32x16, src: u8x16, mask: u16) -> u8x16;
    #[link_name = "llvm.x86.avx512.mask.pmovus.db.256"]
    fn vpmovusdb256(a: u32x8, src: u8x16, mask: u8) -> u8x16;
    #[link_name = "llvm.x86.avx512.mask.pmovus.db.128"]
    fn vpmovusdb128(a: u32x4, src: u8x16, mask: u8) -> u8x16;

    // 64-bit -> 32-bit.
    #[link_name = "llvm.x86.avx512.mask.pmovus.qd.512"]
    fn vpmovusqd(a: u64x8, src: u32x8, mask: u8) -> u32x8;
    #[link_name = "llvm.x86.avx512.mask.pmovus.qd.256"]
    fn vpmovusqd256(a: u64x4, src: u32x4, mask: u8) -> u32x4;
    #[link_name = "llvm.x86.avx512.mask.pmovus.qd.128"]
    fn vpmovusqd128(a: u64x2, src: u32x4, mask: u8) -> u32x4;

    // 64-bit -> 16-bit.
    #[link_name = "llvm.x86.avx512.mask.pmovus.qw.512"]
    fn vpmovusqw(a: u64x8, src: u16x8, mask: u8) -> u16x8;
    #[link_name = "llvm.x86.avx512.mask.pmovus.qw.256"]
    fn vpmovusqw256(a: u64x4, src: u16x8, mask: u8) -> u16x8;
    #[link_name = "llvm.x86.avx512.mask.pmovus.qw.128"]
    fn vpmovusqw128(a: u64x2, src: u16x8, mask: u8) -> u16x8;

    // 64-bit -> 8-bit.
    #[link_name = "llvm.x86.avx512.mask.pmovus.qb.512"]
    fn vpmovusqb(a: u64x8, src: u8x16, mask: u8) -> u8x16;
    #[link_name = "llvm.x86.avx512.mask.pmovus.qb.256"]
    fn vpmovusqb256(a: u64x4, src: u8x16, mask: u8) -> u8x16;
    #[link_name = "llvm.x86.avx512.mask.pmovus.qb.128"]
    fn vpmovusqb128(a: u64x2, src: u8x16, mask: u8) -> u8x16;

    // 512-bit gather bindings (`llvm.x86.avx512.gather.*.512`). Common
    // argument order: (`src`, `slice`, `offsets`, `mask`, `scale`), where
    // `slice` is a raw `*const i8` and the result has the type of `src`.
    // The mask is a *signed* `i8`/`i16` here, unlike the `u8`/`u16` masks used
    // by most other bindings in this block. When the offsets are 64-bit
    // (`q*` names) and the data is 32-bit, only eight data lanes are
    // produced (`f32x8`/`i32x8`).
    // NOTE(review): `scale` is presumably the byte multiplier applied to each
    // offset and `src` the pass-through for unselected lanes; confirm against
    // the calling intrinsics.
    #[link_name = "llvm.x86.avx512.gather.dpd.512"]
    fn vgatherdpd(src: f64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> f64x8;
    #[link_name = "llvm.x86.avx512.gather.dps.512"]
    fn vgatherdps(src: f32x16, slice: *const i8, offsets: i32x16, mask: i16, scale: i32) -> f32x16;
    #[link_name = "llvm.x86.avx512.gather.qpd.512"]
    fn vgatherqpd(src: f64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f64x8;
    #[link_name = "llvm.x86.avx512.gather.qps.512"]
    fn vgatherqps(src: f32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> f32x8;
    // Integer-data gathers (`dpq`/`dpi`/`qpq`/`qpi` intrinsic suffixes).
    #[link_name = "llvm.x86.avx512.gather.dpq.512"]
    fn vpgatherdq(src: i64x8, slice: *const i8, offsets: i32x8, mask: i8, scale: i32) -> i64x8;
    #[link_name = "llvm.x86.avx512.gather.dpi.512"]
    fn vpgatherdd(src: i32x16, slice: *const i8, offsets: i32x16, mask: i16, scale: i32) -> i32x16;
    #[link_name = "llvm.x86.avx512.gather.qpq.512"]
    fn vpgatherqq(src: i64x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i64x8;
    #[link_name = "llvm.x86.avx512.gather.qpi.512"]
    fn vpgatherqd(src: i32x8, slice: *const i8, offsets: i64x8, mask: i8, scale: i32) -> i32x8;

    // 512-bit scatter bindings (`llvm.x86.avx512.scatter.*.512`). Common
    // argument order: (`slice`, `mask`, `offsets`, `src`, `scale`) - note the
    // mask comes second, which differs from the gather bindings' order. The
    // destination is a raw `*mut i8`, the mask is a signed `i8`/`i16`, and
    // nothing is returned.
    // NOTE(review): `scale` is presumably the byte multiplier applied to each
    // offset; confirm against the calling intrinsics.

    // Floating-point data, plus the 64-bit integer `dpq` form.
    #[link_name = "llvm.x86.avx512.scatter.dpd.512"]
    fn vscatterdpd(slice: *mut i8, mask: i8, offsets: i32x8, src: f64x8, scale: i32);
    #[link_name = "llvm.x86.avx512.scatter.dps.512"]
    fn vscatterdps(slice: *mut i8, mask: i16, offsets: i32x16, src: f32x16, scale: i32);
    #[link_name = "llvm.x86.avx512.scatter.qpd.512"]
    fn vscatterqpd(slice: *mut i8, mask: i8, offsets: i64x8, src: f64x8, scale: i32);
    #[link_name = "llvm.x86.avx512.scatter.qps.512"]
    fn vscatterqps(slice: *mut i8, mask: i8, offsets: i64x8, src: f32x8, scale: i32);
    #[link_name = "llvm.x86.avx512.scatter.dpq.512"]
    fn vpscatterdq(slice: *mut i8, mask: i8, offsets: i32x8, src: i64x8, scale: i32);

    // Remaining integer-data scatters.
    #[link_name = "llvm.x86.avx512.scatter.dpi.512"]
    fn vpscatterdd(slice: *mut i8, mask: i16, offsets: i32x16, src: i32x16, scale: i32);
    #[link_name = "llvm.x86.avx512.scatter.qpq.512"]
    fn vpscatterqq(slice: *mut i8, mask: i8, offsets: i64x8, src: i64x8, scale: i32);
    #[link_name = "llvm.x86.avx512.scatter.qpi.512"]
    fn vpscatterqd(slice: *mut i8, mask: i8, offsets: i64x8, src: i32x8, scale: i32);

    // Floating-point compare bindings (`llvm.x86.avx512.mask.cmp.*`). Each
    // takes two operands, an `i32` `op`, and an input mask `m`, and returns a
    // mask of the same signed integer type as `m` (`i16` for 16 lanes, `i8`
    // otherwise).
    // NOTE(review): `op` is presumably the comparison-predicate immediate and
    // `m` the mask the result is restricted to; confirm against the callers.

    // Scalar forms. These take the public `__m128`/`__m128d` types directly
    // rather than the internal `f32x4`/`f64x2` lane types, and carry `sae`.
    #[link_name = "llvm.x86.avx512.mask.cmp.ss"]
    fn vcmpss(a: __m128, b: __m128, op: i32, m: i8, sae: i32) -> i8;
    #[link_name = "llvm.x86.avx512.mask.cmp.sd"]
    fn vcmpsd(a: __m128d, b: __m128d, op: i32, m: i8, sae: i32) -> i8;

    // Packed f32. Only the 512-bit form has the trailing `sae` argument.
    #[link_name = "llvm.x86.avx512.mask.cmp.ps.512"]
    fn vcmpps(a: f32x16, b: f32x16, op: i32, m: i16, sae: i32) -> i16;
    #[link_name = "llvm.x86.avx512.mask.cmp.ps.256"]
    fn vcmpps256(a: f32x8, b: f32x8, op: i32, m: i8) -> i8;
    #[link_name = "llvm.x86.avx512.mask.cmp.ps.128"]
    fn vcmpps128(a: f32x4, b: f32x4, op: i32, m: i8) -> i8;

    // Packed f64. Only the 512-bit form has the trailing `sae` argument.
    #[link_name = "llvm.x86.avx512.mask.cmp.pd.512"]
    fn vcmppd(a: f64x8, b: f64x8, op: i32, m: i8, sae: i32) -> i8;
    #[link_name = "llvm.x86.avx512.mask.cmp.pd.256"]
    fn vcmppd256(a: f64x4, b: f64x4, op: i32, m: i8) -> i8;
    #[link_name = "llvm.x86.avx512.mask.cmp.pd.128"]
    fn vcmppd128(a: f64x2, b: f64x2, op: i32, m: i8) -> i8;

    // Integer compare bindings: `llvm.x86.avx512.mask.ucmp.*` and
    // `llvm.x86.avx512.mask.cmp.*` for 64-bit (`q`) and 32-bit (`d`) lanes.
    // Each takes two operands, an `i32` `op`, and an input mask `m`, and
    // returns a mask of the same signed type as `m`. The `ucmp` bindings are
    // declared with the same signed lane types (`i64x*`/`i32x*`) as the `cmp`
    // bindings; the two differ only in the intrinsic they link to.
    // NOTE(review): `ucmp` presumably compares lanes as unsigned values and
    // `op` presumably selects the predicate; confirm against the callers.

    // 64-bit lanes, `ucmp`.
    #[link_name = "llvm.x86.avx512.mask.ucmp.q.512"]
    fn vpcmpuq(a: i64x8, b: i64x8, op: i32, m: i8) -> i8;
    #[link_name = "llvm.x86.avx512.mask.ucmp.q.256"]
    fn vpcmpuq256(a: i64x4, b: i64x4, op: i32, m: i8) -> i8;
    #[link_name = "llvm.x86.avx512.mask.ucmp.q.128"]
    fn vpcmpuq128(a: i64x2, b: i64x2, op: i32, m: i8) -> i8;

    // 64-bit lanes, `cmp`.
    #[link_name = "llvm.x86.avx512.mask.cmp.q.512"]
    fn vpcmpq(a: i64x8, b: i64x8, op: i32, m: i8) -> i8;
    #[link_name = "llvm.x86.avx512.mask.cmp.q.256"]
    fn vpcmpq256(a: i64x4, b: i64x4, op: i32, m: i8) -> i8;
    #[link_name = "llvm.x86.avx512.mask.cmp.q.128"]
    fn vpcmpq128(a: i64x2, b: i64x2, op: i32, m: i8) -> i8;

    // 32-bit lanes, `ucmp`.
    #[link_name = "llvm.x86.avx512.mask.ucmp.d.512"]
    fn vpcmpud(a: i32x16, b: i32x16, op: i32, m: i16) -> i16;
    #[link_name = "llvm.x86.avx512.mask.ucmp.d.256"]
    fn vpcmpud256(a: i32x8, b: i32x8, op: i32, m: i8) -> i8;
    #[link_name = "llvm.x86.avx512.mask.ucmp.d.128"]
    fn vpcmpud128(a: i32x4, b: i32x4, op: i32, m: i8) -> i8;

    // 32-bit lanes, `cmp`.
    #[link_name = "llvm.x86.avx512.mask.cmp.d.512"]
    fn vpcmpd(a: i32x16, b: i32x16, op: i32, m: i16) -> i16;
    #[link_name = "llvm.x86.avx512.mask.cmp.d.256"]
    fn vpcmpd256(a: i32x8, b: i32x8, op: i32, m: i8) -> i8;
    #[link_name = "llvm.x86.avx512.mask.cmp.d.128"]
    fn vpcmpd128(a: i32x4, b: i32x4, op: i32, m: i8) -> i8;

    // Rotate bindings with a scalar second argument:
    // `llvm.x86.avx512.mask.{prol,pror}.{d,q}.*` at 512/256/128 bits.
    // Beware when reading the signatures: the second parameter is *named*
    // `i8` but its type is `i32`. Although the intrinsic names contain
    // `mask`, these declarations take no `src` or mask argument.
    // NOTE(review): the scalar is presumably the rotate count (an 8-bit
    // immediate at the instruction level); confirm against the callers.

    // `prol`, 32-bit lanes.
    #[link_name = "llvm.x86.avx512.mask.prol.d.512"]
    fn vprold(a: i32x16, i8: i32) -> i32x16;
    #[link_name = "llvm.x86.avx512.mask.prol.d.256"]
    fn vprold256(a: i32x8, i8: i32) -> i32x8;
    #[link_name = "llvm.x86.avx512.mask.prol.d.128"]
    fn vprold128(a: i32x4, i8: i32) -> i32x4;

    // `pror`, 32-bit lanes.
    #[link_name = "llvm.x86.avx512.mask.pror.d.512"]
    fn vprord(a: i32x16, i8: i32) -> i32x16;
    #[link_name = "llvm.x86.avx512.mask.pror.d.256"]
    fn vprord256(a: i32x8, i8: i32) -> i32x8;
    #[link_name = "llvm.x86.avx512.mask.pror.d.128"]
    fn vprord128(a: i32x4, i8: i32) -> i32x4;

    // `prol`, 64-bit lanes.
    #[link_name = "llvm.x86.avx512.mask.prol.q.512"]
    fn vprolq(a: i64x8, i8: i32) -> i64x8;
    #[link_name = "llvm.x86.avx512.mask.prol.q.256"]
    fn vprolq256(a: i64x4, i8: i32) -> i64x4;
    #[link_name = "llvm.x86.avx512.mask.prol.q.128"]
    fn vprolq128(a: i64x2, i8: i32) -> i64x2;

    // `pror`, 64-bit lanes.
    #[link_name = "llvm.x86.avx512.mask.pror.q.512"]
    fn vprorq(a: i64x8, i8: i32) -> i64x8;
    #[link_name = "llvm.x86.avx512.mask.pror.q.256"]
    fn vprorq256(a: i64x4, i8: i32) -> i64x4;
    #[link_name = "llvm.x86.avx512.mask.pror.q.128"]
    fn vprorq128(a: i64x2, i8: i32) -> i64x2;

    // Rotate bindings with a vector second argument:
    // `llvm.x86.avx512.mask.{prolv,prorv}.{d,q}.*` at 512/256/128 bits.
    // `a` and `b` always share the result's vector type. Although the
    // intrinsic names contain `mask`, no `src` or mask argument is declared.
    // NOTE(review): `b` presumably supplies a per-lane rotate count; confirm
    // against the callers.

    // `prolv`, 32-bit lanes.
    #[link_name = "llvm.x86.avx512.mask.prolv.d.512"]
    fn vprolvd(a: i32x16, b: i32x16) -> i32x16;
    #[link_name = "llvm.x86.avx512.mask.prolv.d.256"]
    fn vprolvd256(a: i32x8, b: i32x8) -> i32x8;
    #[link_name = "llvm.x86.avx512.mask.prolv.d.128"]
    fn vprolvd128(a: i32x4, b: i32x4) -> i32x4;

    // `prorv`, 32-bit lanes.
    #[link_name = "llvm.x86.avx512.mask.prorv.d.512"]
    fn vprorvd(a: i32x16, b: i32x16) -> i32x16;
    #[link_name = "llvm.x86.avx512.mask.prorv.d.256"]
    fn vprorvd256(a: i32x8, b: i32x8) -> i32x8;
    #[link_name = "llvm.x86.avx512.mask.prorv.d.128"]
    fn vprorvd128(a: i32x4, b: i32x4) -> i32x4;

    // `prolv`, 64-bit lanes.
    #[link_name = "llvm.x86.avx512.mask.prolv.q.512"]
    fn vprolvq(a: i64x8, b: i64x8) -> i64x8;
    #[link_name = "llvm.x86.avx512.mask.prolv.q.256"]
    fn vprolvq256(a: i64x4, b: i64x4) -> i64x4;
    #[link_name = "llvm.x86.avx512.mask.prolv.q.128"]
    fn vprolvq128(a: i64x2, b: i64x2) -> i64x2;

    // `prorv`, 64-bit lanes.
    #[link_name = "llvm.x86.avx512.mask.prorv.q.512"]
    fn vprorvq(a: i64x8, b: i64x8) -> i64x8;
    #[link_name = "llvm.x86.avx512.mask.prorv.q.256"]
    fn vprorvq256(a: i64x4, b: i64x4) -> i64x4;
    #[link_name = "llvm.x86.avx512.mask.prorv.q.128"]
    fn vprorvq128(a: i64x2, b: i64x2) -> i64x2;

    // 512-bit bindings to `llvm.x86.avx512.{psllv,psrlv}.{d,q}.512`. Both
    // operands and the result share one vector type; no mask is involved.
    // NOTE(review): `b` presumably holds per-lane shift counts (`psllv` left,
    // `psrlv` logical right); confirm against the callers.
    #[link_name = "llvm.x86.avx512.psllv.d.512"]
    fn vpsllvd(a: i32x16, b: i32x16) -> i32x16;
    #[link_name = "llvm.x86.avx512.psrlv.d.512"]
    fn vpsrlvd(a: i32x16, b: i32x16) -> i32x16;
    #[link_name = "llvm.x86.avx512.psllv.q.512"]
    fn vpsllvq(a: i64x8, b: i64x8) -> i64x8;
    #[link_name = "llvm.x86.avx512.psrlv.q.512"]
    fn vpsrlvq(a: i64x8, b: i64x8) -> i64x8;

    // Shift bindings taking a scalar `imm8` (`pslli`/`psrli`, 32- and 64-bit
    // lanes). Two things differ by width:
    //   * the 512-bit forms link to `llvm.x86.avx512.*` and type `imm8` as
    //     `u32`;
    //   * the 256/128-bit forms link to the older `llvm.x86.avx2.*` and
    //     `llvm.x86.sse2.*` intrinsics and type `imm8` as `i32`.
    // Callers therefore need different casts depending on the width.

    // `pslli`, 32-bit lanes.
    #[link_name = "llvm.x86.avx512.pslli.d.512"]
    fn vpsllid(a: i32x16, imm8: u32) -> i32x16;

    #[link_name = "llvm.x86.avx2.pslli.d"]
    fn psllid256(a: i32x8, imm8: i32) -> i32x8;
    #[link_name = "llvm.x86.sse2.pslli.d"]
    fn psllid128(a: i32x4, imm8: i32) -> i32x4;

    // `psrli`, 32-bit lanes.
    #[link_name = "llvm.x86.avx512.psrli.d.512"]
    fn vpsrlid(a: i32x16, imm8: u32) -> i32x16;

    #[link_name = "llvm.x86.avx2.psrli.d"]
    fn psrlid256(a: i32x8, imm8: i32) -> i32x8;
    #[link_name = "llvm.x86.sse2.psrli.d"]
    fn psrlid128(a: i32x4, imm8: i32) -> i32x4;

    // `pslli`, 64-bit lanes.
    #[link_name = "llvm.x86.avx512.pslli.q.512"]
    fn vpslliq(a: i64x8, imm8: u32) -> i64x8;

    #[link_name = "llvm.x86.avx2.pslli.q"]
    fn pslliq256(a: i64x4, imm8: i32) -> i64x4;
    #[link_name = "llvm.x86.sse2.pslli.q"]
    fn pslliq128(a: i64x2, imm8: i32) -> i64x2;

    // `psrli`, 64-bit lanes.
    #[link_name = "llvm.x86.avx512.psrli.q.512"]
    fn vpsrliq(a: i64x8, imm8: u32) -> i64x8;

    #[link_name = "llvm.x86.avx2.psrli.q"]
    fn psrliq256(a: i64x4, imm8: i32) -> i64x4;
    #[link_name = "llvm.x86.sse2.psrli.q"]
    fn psrliq128(a: i64x2, imm8: i32) -> i64x2;

    // Shift bindings whose second argument `count` is always a 128-bit vector
    // (`i32x4` for 32-bit lanes, `i64x2` for 64-bit lanes), regardless of the
    // width of `a`.
    // NOTE(review): presumably a single shift count is read from `count` and
    // applied to every lane of `a`; confirm against Intel's documentation.

    // `psll`/`psrl`, 512-bit.
    #[link_name = "llvm.x86.avx512.psll.d.512"]
    fn vpslld(a: i32x16, count: i32x4) -> i32x16;
    #[link_name = "llvm.x86.avx512.psrl.d.512"]
    fn vpsrld(a: i32x16, count: i32x4) -> i32x16;
    #[link_name = "llvm.x86.avx512.psll.q.512"]
    fn vpsllq(a: i64x8, count: i64x2) -> i64x8;
    #[link_name = "llvm.x86.avx512.psrl.q.512"]
    fn vpsrlq(a: i64x8, count: i64x2) -> i64x8;

    // `psra`, 32-bit lanes, 512-bit only.
    #[link_name = "llvm.x86.avx512.psra.d.512"]
    fn vpsrad(a: i32x16, count: i32x4) -> i32x16;

    // `psra`, 64-bit lanes, declared at all three widths.
    #[link_name = "llvm.x86.avx512.psra.q.512"]
    fn vpsraq(a: i64x8, count: i64x2) -> i64x8;
    #[link_name = "llvm.x86.avx512.psra.q.256"]
    fn vpsraq256(a: i64x4, count: i64x2) -> i64x4;
    #[link_name = "llvm.x86.avx512.psra.q.128"]
    fn vpsraq128(a: i64x2, count: i64x2) -> i64x2;

    // `psrai` bindings taking a scalar `imm8`.
    // For 32-bit lanes only the 512-bit form links to an AVX-512 intrinsic
    // (`imm8: u32`); the 256/128-bit forms reuse `llvm.x86.avx2.psrai.d` and
    // `llvm.x86.sse2.psrai.d` and type `imm8` as `i32`. Note also that the
    // 512-bit 32-bit binding is named `vpsraid512`, with an explicit width
    // suffix, unlike its 64-bit sibling `vpsraiq`.
    #[link_name = "llvm.x86.avx512.psrai.d.512"]
    fn vpsraid512(a: i32x16, imm8: u32) -> i32x16;
    #[link_name = "llvm.x86.avx2.psrai.d"]
    fn psraid256(a: i32x8, imm8: i32) -> i32x8;
    #[link_name = "llvm.x86.sse2.psrai.d"]
    fn psraid128(a: i32x4, imm8: i32) -> i32x4;

    // For 64-bit lanes all three widths link to `llvm.x86.avx512.psrai.q.*`
    // and consistently type `imm8` as `u32`.
    #[link_name = "llvm.x86.avx512.psrai.q.512"]
    fn vpsraiq(a: i64x8, imm8: u32) -> i64x8;
    #[link_name = "llvm.x86.avx512.psrai.q.256"]
    fn vpsraiq256(a: i64x4, imm8: u32) -> i64x4;
    #[link_name = "llvm.x86.avx512.psrai.q.128"]
    fn vpsraiq128(a: i64x2, imm8: u32) -> i64x2;

    // `psrav` bindings: `count` is a full vector of the same type as `a`.
    // NOTE(review): presumably a per-lane arithmetic right shift; confirm
    // against Intel's VPSRAV* documentation.

    // 32-bit lanes, 512-bit only.
    #[link_name = "llvm.x86.avx512.psrav.d.512"]
    fn vpsravd(a: i32x16, count: i32x16) -> i32x16;

    // 64-bit lanes, declared at all three widths.
    #[link_name = "llvm.x86.avx512.psrav.q.512"]
    fn vpsravq(a: i64x8, count: i64x8) -> i64x8;
    #[link_name = "llvm.x86.avx512.psrav.q.256"]
    fn vpsravq256(a: i64x4, count: i64x4) -> i64x4;
    #[link_name = "llvm.x86.avx512.psrav.q.128"]
    fn vpsravq128(a: i64x2, count: i64x2) -> i64x2;

    // Single-source permute bindings. Each takes the data vector `a` and an
    // integer vector with the same lane count (`b` or `idx`) and returns a
    // vector of `a`'s type. None takes a mask.
    // NOTE(review): the integer vector presumably supplies per-lane selection
    // indices; how `vpermilvar` and `permvar` differ in which lanes may be
    // selected is not visible from these signatures.

    // `vpermilvar`, 512-bit floating-point.
    #[link_name = "llvm.x86.avx512.vpermilvar.ps.512"]
    fn vpermilps(a: f32x16, b: i32x16) -> f32x16;
    #[link_name = "llvm.x86.avx512.vpermilvar.pd.512"]
    fn vpermilpd(a: f64x8, b: i64x8) -> f64x8;

    // `permvar.si`: 32-bit integer lanes, 512-bit only here.
    #[link_name = "llvm.x86.avx512.permvar.si.512"]
    fn vpermd(a: i32x16, idx: i32x16) -> i32x16;

    // `permvar.di`: 64-bit integer lanes, 512- and 256-bit.
    #[link_name = "llvm.x86.avx512.permvar.di.512"]
    fn vpermq(a: i64x8, idx: i64x8) -> i64x8;
    #[link_name = "llvm.x86.avx512.permvar.di.256"]
    fn vpermq256(a: i64x4, idx: i64x4) -> i64x4;

    // `permvar.sf`: f32 lanes, 512-bit only here.
    #[link_name = "llvm.x86.avx512.permvar.sf.512"]
    fn vpermps(a: f32x16, idx: i32x16) -> f32x16;

    // `permvar.df`: f64 lanes, 512- and 256-bit.
    #[link_name = "llvm.x86.avx512.permvar.df.512"]
    fn vpermpd(a: f64x8, idx: i64x8) -> f64x8;
    #[link_name = "llvm.x86.avx512.permvar.df.256"]
    fn vpermpd256(a: f64x4, idx: i64x4) -> f64x4;

    // Two-source permute bindings (`llvm.x86.avx512.vpermi2var.*`) at
    // 512/256/128 bits. Argument order is (`a`, `idx`, `b`): the index vector
    // sits *between* the two data vectors. `idx` is an integer vector with the
    // same lane count and lane width as the data. No mask is taken.
    // NOTE(review): `idx` presumably selects each result lane from the
    // concatenation of `a` and `b`; confirm against Intel's documentation.

    // 32-bit integer lanes.
    #[link_name = "llvm.x86.avx512.vpermi2var.d.512"]
    fn vpermi2d(a: i32x16, idx: i32x16, b: i32x16) -> i32x16;
    #[link_name = "llvm.x86.avx512.vpermi2var.d.256"]
    fn vpermi2d256(a: i32x8, idx: i32x8, b: i32x8) -> i32x8;
    #[link_name = "llvm.x86.avx512.vpermi2var.d.128"]
    fn vpermi2d128(a: i32x4, idx: i32x4, b: i32x4) -> i32x4;

    // 64-bit integer lanes.
    #[link_name = "llvm.x86.avx512.vpermi2var.q.512"]
    fn vpermi2q(a: i64x8, idx: i64x8, b: i64x8) -> i64x8;
    #[link_name = "llvm.x86.avx512.vpermi2var.q.256"]
    fn vpermi2q256(a: i64x4, idx: i64x4, b: i64x4) -> i64x4;
    #[link_name = "llvm.x86.avx512.vpermi2var.q.128"]
    fn vpermi2q128(a: i64x2, idx: i64x2, b: i64x2) -> i64x2;

    // f32 lanes.
    #[link_name = "llvm.x86.avx512.vpermi2var.ps.512"]
    fn vpermi2ps(a: f32x16, idx: i32x16, b: f32x16) -> f32x16;
    #[link_name = "llvm.x86.avx512.vpermi2var.ps.256"]
    fn vpermi2ps256(a: f32x8, idx: i32x8, b: f32x8) -> f32x8;
    #[link_name = "llvm.x86.avx512.vpermi2var.ps.128"]
    fn vpermi2ps128(a: f32x4, idx: i32x4, b: f32x4) -> f32x4;

    // f64 lanes.
    #[link_name = "llvm.x86.avx512.vpermi2var.pd.512"]
    fn vpermi2pd(a: f64x8, idx: i64x8, b: f64x8) -> f64x8;
    #[link_name = "llvm.x86.avx512.vpermi2var.pd.256"]
    fn vpermi2pd256(a: f64x4, idx: i64x4, b: f64x4) -> f64x4;
    #[link_name = "llvm.x86.avx512.vpermi2var.pd.128"]
    fn vpermi2pd128(a: f64x2, idx: i64x2, b: f64x2) -> f64x2;

    // Register compress bindings (`llvm.x86.avx512.mask.compress.*`) at
    // 512/256/128 bits. Each takes the data vector `a`, a `src` vector of the
    // same type, and a mask (`u16` for 16 lanes, `u8` otherwise), returning
    // that same vector type.
    // NOTE(review): presumably the mask-selected lanes of `a` are packed
    // contiguously into the low lanes and the rest come from `src`; confirm
    // against Intel's VPCOMPRESS*/VCOMPRESS* documentation.

    // 32-bit integer lanes.
    #[link_name = "llvm.x86.avx512.mask.compress.d.512"]
    fn vpcompressd(a: i32x16, src: i32x16, mask: u16) -> i32x16;
    #[link_name = "llvm.x86.avx512.mask.compress.d.256"]
    fn vpcompressd256(a: i32x8, src: i32x8, mask: u8) -> i32x8;
    #[link_name = "llvm.x86.avx512.mask.compress.d.128"]
    fn vpcompressd128(a: i32x4, src: i32x4, mask: u8) -> i32x4;

    // 64-bit integer lanes.
    #[link_name = "llvm.x86.avx512.mask.compress.q.512"]
    fn vpcompressq(a: i64x8, src: i64x8, mask: u8) -> i64x8;
    #[link_name = "llvm.x86.avx512.mask.compress.q.256"]
    fn vpcompressq256(a: i64x4, src: i64x4, mask: u8) -> i64x4;
    #[link_name = "llvm.x86.avx512.mask.compress.q.128"]
    fn vpcompressq128(a: i64x2, src: i64x2, mask: u8) -> i64x2;

    // f32 lanes.
    #[link_name = "llvm.x86.avx512.mask.compress.ps.512"]
    fn vcompressps(a: f32x16, src: f32x16, mask: u16) -> f32x16;
    #[link_name = "llvm.x86.avx512.mask.compress.ps.256"]
    fn vcompressps256(a: f32x8, src: f32x8, mask: u8) -> f32x8;
    #[link_name = "llvm.x86.avx512.mask.compress.ps.128"]
    fn vcompressps128(a: f32x4, src: f32x4, mask: u8) -> f32x4;

    // f64 lanes.
    #[link_name = "llvm.x86.avx512.mask.compress.pd.512"]
    fn vcompresspd(a: f64x8, src: f64x8, mask: u8) -> f64x8;
    #[link_name = "llvm.x86.avx512.mask.compress.pd.256"]
    fn vcompresspd256(a: f64x4, src: f64x4, mask: u8) -> f64x4;
    #[link_name = "llvm.x86.avx512.mask.compress.pd.128"]
    fn vcompresspd128(a: f64x2, src: f64x2, mask: u8) -> f64x2;

    // Compress-to-memory bindings (`llvm.x86.avx512.mask.compress.store.*`)
    // at 512/256/128 bits. Each takes a raw `*mut i8` destination `mem`, the
    // `data` vector and a mask, and returns nothing. The integer and
    // floating-point flavours share the `vcompressstore` name prefix and are
    // told apart only by the `d`/`q`/`ps`/`pd` suffix.
    // NOTE(review): how many elements are written, and any alignment
    // requirement on `mem`, are not visible from these signatures.

    // 32-bit integer lanes.
    #[link_name = "llvm.x86.avx512.mask.compress.store.d.512"]
    fn vcompressstored(mem: *mut i8, data: i32x16, mask: u16);
    #[link_name = "llvm.x86.avx512.mask.compress.store.d.256"]
    fn vcompressstored256(mem: *mut i8, data: i32x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.compress.store.d.128"]
    fn vcompressstored128(mem: *mut i8, data: i32x4, mask: u8);

    // 64-bit integer lanes.
    #[link_name = "llvm.x86.avx512.mask.compress.store.q.512"]
    fn vcompressstoreq(mem: *mut i8, data: i64x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.compress.store.q.256"]
    fn vcompressstoreq256(mem: *mut i8, data: i64x4, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.compress.store.q.128"]
    fn vcompressstoreq128(mem: *mut i8, data: i64x2, mask: u8);

    // f32 lanes.
    #[link_name = "llvm.x86.avx512.mask.compress.store.ps.512"]
    fn vcompressstoreps(mem: *mut i8, data: f32x16, mask: u16);
    #[link_name = "llvm.x86.avx512.mask.compress.store.ps.256"]
    fn vcompressstoreps256(mem: *mut i8, data: f32x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.compress.store.ps.128"]
    fn vcompressstoreps128(mem: *mut i8, data: f32x4, mask: u8);

    // f64 lanes.
    #[link_name = "llvm.x86.avx512.mask.compress.store.pd.512"]
    fn vcompressstorepd(mem: *mut i8, data: f64x8, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.compress.store.pd.256"]
    fn vcompressstorepd256(mem: *mut i8, data: f64x4, mask: u8);
    #[link_name = "llvm.x86.avx512.mask.compress.store.pd.128"]
    fn vcompressstorepd128(mem: *mut i8, data: f64x2, mask: u8);

    // Register expand bindings (`llvm.x86.avx512.mask.expand.*`) at
    // 512/256/128 bits. Signatures have the same shape as the compress
    // bindings: data vector `a`, a `src` vector of the same type, and a mask
    // (`u16` for 16 lanes, `u8` otherwise).
    // NOTE(review): presumably contiguous low lanes of `a` are distributed to
    // the mask-selected result lanes and the rest come from `src`; confirm
    // against Intel's VPEXPAND*/VEXPAND* documentation.

    // 32-bit integer lanes.
    #[link_name = "llvm.x86.avx512.mask.expand.d.512"]
    fn vpexpandd(a: i32x16, src: i32x16, mask: u16) -> i32x16;
    #[link_name = "llvm.x86.avx512.mask.expand.d.256"]
    fn vpexpandd256(a: i32x8, src: i32x8, mask: u8) -> i32x8;
    #[link_name = "llvm.x86.avx512.mask.expand.d.128"]
    fn vpexpandd128(a: i32x4, src: i32x4, mask: u8) -> i32x4;

    // 64-bit integer lanes.
    #[link_name = "llvm.x86.avx512.mask.expand.q.512"]
    fn vpexpandq(a: i64x8, src: i64x8, mask: u8) -> i64x8;
    #[link_name = "llvm.x86.avx512.mask.expand.q.256"]
    fn vpexpandq256(a: i64x4, src: i64x4, mask: u8) -> i64x4;
    #[link_name = "llvm.x86.avx512.mask.expand.q.128"]
    fn vpexpandq128(a: i64x2, src: i64x2, mask: u8) -> i64x2;

    // f32 lanes.
    #[link_name = "llvm.x86.avx512.mask.expand.ps.512"]
    fn vexpandps(a: f32x16, src: f32x16, mask: u16) -> f32x16;
    #[link_name = "llvm.x86.avx512.mask.expand.ps.256"]
    fn vexpandps256(a: f32x8, src: f32x8, mask: u8) -> f32x8;
    #[link_name = "llvm.x86.avx512.mask.expand.ps.128"]
    fn vexpandps128(a: f32x4, src: f32x4, mask: u8) -> f32x4;

    // f64 lanes.
    #[link_name = "llvm.x86.avx512.mask.expand.pd.512"]
    fn vexpandpd(a: f64x8, src: f64x8, mask: u8) -> f64x8;
    #[link_name = "llvm.x86.avx512.mask.expand.pd.256"]
    fn vexpandpd256(a: f64x4, src: f64x4, mask: u8) -> f64x4;
    #[link_name = "llvm.x86.avx512.mask.expand.pd.128"]
    fn vexpandpd128(a: f64x2, src: f64x2, mask: u8) -> f64x2;

    // Masked scalar (`ss`/`sd`) bindings. All operate on full 128-bit vector
    // types (`f32x4`/`f64x2`) and share the shape
    // (`a`, `b`, `src`, mask, trailing `i32`) -> vector.
    // NOTE(review): presumably only the lowest lane is computed, with `src`
    // and bit 0 of the mask controlling it and the upper lanes taken from
    // `a`; confirm against the calling intrinsics.

    // Arithmetic: the trailing argument is named `rounding` and the LLVM
    // intrinsic names end in `.round`.
    #[link_name = "llvm.x86.avx512.mask.add.ss.round"]
    fn vaddss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
    #[link_name = "llvm.x86.avx512.mask.add.sd.round"]
    fn vaddsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
    #[link_name = "llvm.x86.avx512.mask.sub.ss.round"]
    fn vsubss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
    #[link_name = "llvm.x86.avx512.mask.sub.sd.round"]
    fn vsubsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
    #[link_name = "llvm.x86.avx512.mask.mul.ss.round"]
    fn vmulss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
    #[link_name = "llvm.x86.avx512.mask.mul.sd.round"]
    fn vmulsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
    #[link_name = "llvm.x86.avx512.mask.div.ss.round"]
    fn vdivss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
    #[link_name = "llvm.x86.avx512.mask.div.sd.round"]
    fn vdivsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
    // max/min: still `.round` intrinsics, but the trailing argument is named
    // `sae` rather than `rounding`.
    #[link_name = "llvm.x86.avx512.mask.max.ss.round"]
    fn vmaxss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4;
    #[link_name = "llvm.x86.avx512.mask.max.sd.round"]
    fn vmaxsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2;
    #[link_name = "llvm.x86.avx512.mask.min.ss.round"]
    fn vminss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4;
    #[link_name = "llvm.x86.avx512.mask.min.sd.round"]
    fn vminsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2;
    // sqrt/getexp/getmant: intrinsic names carry no `.round` suffix, yet each
    // still takes a trailing `i32` (`rounding` for sqrt, `sae` otherwise).
    #[link_name = "llvm.x86.avx512.mask.sqrt.ss"]
    fn vsqrtss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
    #[link_name = "llvm.x86.avx512.mask.sqrt.sd"]
    fn vsqrtsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;
    #[link_name = "llvm.x86.avx512.mask.getexp.ss"]
    fn vgetexpss(a: f32x4, b: f32x4, src: f32x4, mask: u8, sae: i32) -> f32x4;
    #[link_name = "llvm.x86.avx512.mask.getexp.sd"]
    fn vgetexpsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, sae: i32) -> f64x2;
    // getmant inserts an extra `mantissas: i32` between `b` and `src`, and
    // names its mask `m` instead of `mask`.
    #[link_name = "llvm.x86.avx512.mask.getmant.ss"]
    fn vgetmantss(a: f32x4, b: f32x4, mantissas: i32, src: f32x4, m: u8, sae: i32) -> f32x4;
    #[link_name = "llvm.x86.avx512.mask.getmant.sd"]
    fn vgetmantsd(a: f64x2, b: f64x2, mantissas: i32, src: f64x2, m: u8, sae: i32) -> f64x2;

    #[link_name = "llvm.x86.avx512.rsqrt14.ss"]
    fn vrsqrt14ss(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4;
    #[link_name = "llvm.x86.avx512.rsqrt14.sd"]
    fn vrsqrt14sd(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2;
    #[link_name = "llvm.x86.avx512.rcp14.ss"]
    fn vrcp14ss(a: f32x4, b: f32x4, src: f32x4, mask: u8) -> f32x4;
    #[link_name = "llvm.x86.avx512.rcp14.sd"]
    fn vrcp14sd(a: f64x2, b: f64x2, src: f64x2, mask: u8) -> f64x2;

    #[link_name = "llvm.x86.avx512.mask.rndscale.ss"]
    fn vrndscaless(a: f32x4, b: f32x4, src: f32x4, mask: u8, imm8: i32, sae: i32) -> f32x4;
    #[link_name = "llvm.x86.avx512.mask.rndscale.sd"]
    fn vrndscalesd(a: f64x2, b: f64x2, src: f64x2, mask: u8, imm8: i32, sae: i32) -> f64x2;
    #[link_name = "llvm.x86.avx512.mask.scalef.ss"]
    fn vscalefss(a: f32x4, b: f32x4, src: f32x4, mask: u8, rounding: i32) -> f32x4;
    #[link_name = "llvm.x86.avx512.mask.scalef.sd"]
    fn vscalefsd(a: f64x2, b: f64x2, src: f64x2, mask: u8, rounding: i32) -> f64x2;

    #[link_name = "llvm.x86.avx512.vfmadd.f32"]
    fn vfmadd132ss(a: f32, b: f32, c: f32, rounding: i32) -> f32;
    #[link_name = "llvm.x86.avx512.vfmadd.f64"]
    fn vfmadd132sd(a: f64, b: f64, c: f64, rounding: i32) -> f64;

    #[link_name = "llvm.x86.avx512.mask.fixupimm.ss"]
    fn vfixupimmss(a: f32x4, b: f32x4, c: i32x4, imm8: i32, mask: u8, sae: i32) -> f32x4;
    #[link_name = "llvm.x86.avx512.mask.fixupimm.sd"]
    fn vfixupimmsd(a: f64x2, b: f64x2, c: i64x2, imm8: i32, mask: u8, sae: i32) -> f64x2;
    #[link_name = "llvm.x86.avx512.maskz.fixupimm.ss"]
    fn vfixupimmssz(a: f32x4, b: f32x4, c: i32x4, imm8: i32, mask: u8, sae: i32) -> f32x4;
    #[link_name = "llvm.x86.avx512.maskz.fixupimm.sd"]
    fn vfixupimmsdz(a: f64x2, b: f64x2, c: i64x2, imm8: i32, mask: u8, sae: i32) -> f64x2;

    #[link_name = "llvm.x86.avx512.mask.cvtss2sd.round"]
    fn vcvtss2sd(a: f64x2, a: f32x4, src: f64x2, mask: u8, sae: i32) -> f64x2;
    #[link_name = "llvm.x86.avx512.mask.cvtsd2ss.round"]
    fn vcvtsd2ss(a: f32x4, b: f64x2, src: f32x4, mask: u8, rounding: i32) -> f32x4;

    #[link_name = "llvm.x86.avx512.vcvtss2si32"]
    fn vcvtss2si(a: f32x4, rounding: i32) -> i32;
    #[link_name = "llvm.x86.avx512.vcvtss2usi32"]
    fn vcvtss2usi(a: f32x4, rounding: i32) -> u32;

    #[link_name = "llvm.x86.avx512.vcvtsd2si32"]
    fn vcvtsd2si(a: f64x2, rounding: i32) -> i32;
    #[link_name = "llvm.x86.avx512.vcvtsd2usi32"]
    fn vcvtsd2usi(a: f64x2, rounding: i32) -> u32;

    #[link_name = "llvm.x86.avx512.cvtsi2ss32"]
    fn vcvtsi2ss(a: f32x4, b: i32, rounding: i32) -> f32x4;
    #[link_name = "llvm.x86.avx512.cvtsi2sd64"]
    fn vcvtsi2sd(a: f64x2, b: i64, rounding: i32) -> f64x2;

    #[link_name = "llvm.x86.avx512.cvtusi2ss"]
    fn vcvtusi2ss(a: f32x4, b: u32, rounding: i32) -> f32x4;
    #[link_name = "llvm.x86.avx512.cvtusi642sd"]
    fn vcvtusi2sd(a: f64x2, b: u64, rounding: i32) -> f64x2;

    #[link_name = "llvm.x86.avx512.vcomi.ss"]
    fn vcomiss(a: f32x4, b: f32x4, imm8: i32, sae: i32) -> i32;
    #[link_name = "llvm.x86.avx512.vcomi.sd"]
    fn vcomisd(a: f64x2, b: f64x2, imm8: i32, sae: i32) -> i32;
}

#[cfg(test)]
mod tests {

    use stdarch_test::simd_test;

    use crate::core_arch::x86::*;
    use crate::hint::black_box;
    use crate::mem::{self};

    // Lane-wise absolute value over all sixteen 32-bit lanes. `i32::MIN` has no
    // positive counterpart, so the expected value for that lane is `i32::MIN`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_abs_epi32() {
        #[rustfmt::skip]
        let input = _mm512_setr_epi32(
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
        );
        #[rustfmt::skip]
        let want = _mm512_setr_epi32(
            0, 1, 1, i32::MAX,
            i32::MIN, 100, 100, 32,
            0, 1, 1, i32::MAX,
            i32::MIN, 100, 100, 32,
        );
        assert_eq_m512i(_mm512_abs_epi32(input), want);
    }

    // Write-masked absolute value: an empty mask returns `src` unchanged, and
    // with the low eight bits set only the first eight lanes are replaced.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_abs_epi32() {
        #[rustfmt::skip]
        let input = _mm512_setr_epi32(
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
        );

        // No mask bit set: every lane is taken from `src` (here `input`).
        assert_eq_m512i(_mm512_mask_abs_epi32(input, 0, input), input);

        // Low half selected, high half passes `src` through.
        #[rustfmt::skip]
        let want = _mm512_setr_epi32(
            0, 1, 1, i32::MAX,
            i32::MIN, 100, 100, 32,
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
        );
        let got = _mm512_mask_abs_epi32(input, 0b00000000_11111111, input);
        assert_eq_m512i(got, want);
    }

    // Zero-masked absolute value: lanes whose mask bit is clear must be zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_abs_epi32() {
        #[rustfmt::skip]
        let input = _mm512_setr_epi32(
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
        );

        // Empty mask: the whole vector is zeroed.
        assert_eq_m512i(_mm512_maskz_abs_epi32(0, input), _mm512_setzero_si512());

        // Low eight lanes computed, high eight lanes zeroed.
        #[rustfmt::skip]
        let want = _mm512_setr_epi32(
            0, 1, 1, i32::MAX,
            i32::MIN, 100, 100, 32,
            0, 0, 0, 0,
            0, 0, 0, 0,
        );
        let got = _mm512_maskz_abs_epi32(0b00000000_11111111, input);
        assert_eq_m512i(got, want);
    }

    // 256-bit write-masked absolute value: the low four lanes are computed, the
    // upper four keep the `src` values (including the untouched `i32::MIN`).
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_abs_epi32() {
        #[rustfmt::skip]
        let input = _mm256_setr_epi32(
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
        );

        // Empty mask leaves `src` as is.
        assert_eq_m256i(_mm256_mask_abs_epi32(input, 0, input), input);

        #[rustfmt::skip]
        let want = _mm256_setr_epi32(
            0, 1, 1, i32::MAX,
            i32::MIN, 100, -100, -32,
        );
        let got = _mm256_mask_abs_epi32(input, 0b00001111, input);
        assert_eq_m256i(got, want);
    }

    // 256-bit zero-masked absolute value: the low four lanes are computed, the
    // upper four are cleared.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_abs_epi32() {
        #[rustfmt::skip]
        let input = _mm256_setr_epi32(
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
        );

        // Empty mask zeroes everything.
        assert_eq_m256i(_mm256_maskz_abs_epi32(0, input), _mm256_setzero_si256());

        #[rustfmt::skip]
        let want = _mm256_setr_epi32(
            0, 1, 1, i32::MAX,
            0, 0, 0, 0,
        );
        let got = _mm256_maskz_abs_epi32(0b00001111, input);
        assert_eq_m256i(got, want);
    }

    // 128-bit write-masked absolute value with an empty mask and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_abs_epi32() {
        let input = _mm_setr_epi32(i32::MIN, 100, -100, -32);

        // Empty mask: result is `src`.
        assert_eq_m128i(_mm_mask_abs_epi32(input, 0, input), input);

        // All four lanes selected; `i32::MIN` stays `i32::MIN`.
        let want = _mm_setr_epi32(i32::MIN, 100, 100, 32);
        let got = _mm_mask_abs_epi32(input, 0b00001111, input);
        assert_eq_m128i(got, want);
    }

    // 128-bit zero-masked absolute value with an empty mask and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_abs_epi32() {
        let input = _mm_setr_epi32(i32::MIN, 100, -100, -32);

        // Empty mask: result is all zeros.
        assert_eq_m128i(_mm_maskz_abs_epi32(0, input), _mm_setzero_si128());

        // All four lanes selected; `i32::MIN` stays `i32::MIN`.
        let want = _mm_setr_epi32(i32::MIN, 100, 100, 32);
        let got = _mm_maskz_abs_epi32(0b00001111, input);
        assert_eq_m128i(got, want);
    }

    // Lane-wise absolute value of sixteen `f32` lanes; `f32::MIN` (the most
    // negative finite value) is expected to become `f32::MAX`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_abs_ps() {
        #[rustfmt::skip]
        let input = _mm512_setr_ps(
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
        );
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0., 1., 1., f32::MAX,
            f32::MAX, 100., 100., 32.,
            0., 1., 1., f32::MAX,
            f32::MAX, 100., 100., 32.,
        );
        assert_eq_m512(_mm512_abs_ps(input), want);
    }

    // Write-masked `f32` absolute value: empty mask returns `src`, low-half
    // mask rewrites only the first eight lanes.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_abs_ps() {
        #[rustfmt::skip]
        let input = _mm512_setr_ps(
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
        );

        // Nothing selected: output equals `src`.
        assert_eq_m512(_mm512_mask_abs_ps(input, 0, input), input);

        // Low eight lanes computed, high eight lanes copied from `src`.
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0., 1., 1., f32::MAX,
            f32::MAX, 100., 100., 32.,
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
        );
        let got = _mm512_mask_abs_ps(input, 0b00000000_11111111, input);
        assert_eq_m512(got, want);
    }

    // Masked move (512-bit, i32): empty mask yields `src`, full mask yields `a`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_mov_epi32() {
        let passthru = _mm512_set1_epi32(1);
        let payload = _mm512_set1_epi32(2);

        let none_selected = _mm512_mask_mov_epi32(passthru, 0, payload);
        assert_eq_m512i(none_selected, passthru);

        let all_selected = _mm512_mask_mov_epi32(passthru, 0b11111111_11111111, payload);
        assert_eq_m512i(all_selected, payload);
    }

    // Zero-masked move (512-bit, i32): empty mask yields zeros, full mask yields `a`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_mov_epi32() {
        let payload = _mm512_set1_epi32(2);

        let none_selected = _mm512_maskz_mov_epi32(0, payload);
        assert_eq_m512i(none_selected, _mm512_setzero_si512());

        let all_selected = _mm512_maskz_mov_epi32(0b11111111_11111111, payload);
        assert_eq_m512i(all_selected, payload);
    }

    // Masked move (256-bit, i32): empty mask yields `src`, full mask yields `a`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_mov_epi32() {
        let passthru = _mm256_set1_epi32(1);
        let payload = _mm256_set1_epi32(2);

        let none_selected = _mm256_mask_mov_epi32(passthru, 0, payload);
        assert_eq_m256i(none_selected, passthru);

        let all_selected = _mm256_mask_mov_epi32(passthru, 0b11111111, payload);
        assert_eq_m256i(all_selected, payload);
    }

    // Zero-masked move (256-bit, i32): empty mask yields zeros, full mask yields `a`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_mov_epi32() {
        let payload = _mm256_set1_epi32(2);

        let none_selected = _mm256_maskz_mov_epi32(0, payload);
        assert_eq_m256i(none_selected, _mm256_setzero_si256());

        let all_selected = _mm256_maskz_mov_epi32(0b11111111, payload);
        assert_eq_m256i(all_selected, payload);
    }

    // Masked move (128-bit, i32): empty mask yields `src`, full mask yields `a`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_mov_epi32() {
        let passthru = _mm_set1_epi32(1);
        let payload = _mm_set1_epi32(2);

        let none_selected = _mm_mask_mov_epi32(passthru, 0, payload);
        assert_eq_m128i(none_selected, passthru);

        let all_selected = _mm_mask_mov_epi32(passthru, 0b00001111, payload);
        assert_eq_m128i(all_selected, payload);
    }

    // Zero-masked move (128-bit, i32): empty mask yields zeros, full mask yields `a`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_mov_epi32() {
        let payload = _mm_set1_epi32(2);

        let none_selected = _mm_maskz_mov_epi32(0, payload);
        assert_eq_m128i(none_selected, _mm_setzero_si128());

        let all_selected = _mm_maskz_mov_epi32(0b00001111, payload);
        assert_eq_m128i(all_selected, payload);
    }

    // Masked move (512-bit, f32): empty mask yields `src`, full mask yields `a`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_mov_ps() {
        let passthru = _mm512_set1_ps(1.);
        let payload = _mm512_set1_ps(2.);

        let none_selected = _mm512_mask_mov_ps(passthru, 0, payload);
        assert_eq_m512(none_selected, passthru);

        let all_selected = _mm512_mask_mov_ps(passthru, 0b11111111_11111111, payload);
        assert_eq_m512(all_selected, payload);
    }

    // Zero-masked move (512-bit, f32): empty mask yields zeros, full mask yields `a`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_mov_ps() {
        let payload = _mm512_set1_ps(2.);

        let none_selected = _mm512_maskz_mov_ps(0, payload);
        assert_eq_m512(none_selected, _mm512_setzero_ps());

        let all_selected = _mm512_maskz_mov_ps(0b11111111_11111111, payload);
        assert_eq_m512(all_selected, payload);
    }

    // Masked move (256-bit, f32): empty mask yields `src`, full mask yields `a`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_mov_ps() {
        let passthru = _mm256_set1_ps(1.);
        let payload = _mm256_set1_ps(2.);

        let none_selected = _mm256_mask_mov_ps(passthru, 0, payload);
        assert_eq_m256(none_selected, passthru);

        let all_selected = _mm256_mask_mov_ps(passthru, 0b11111111, payload);
        assert_eq_m256(all_selected, payload);
    }

    // Zero-masked move (256-bit, f32): empty mask yields zeros, full mask yields `a`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_mov_ps() {
        let payload = _mm256_set1_ps(2.);

        let none_selected = _mm256_maskz_mov_ps(0, payload);
        assert_eq_m256(none_selected, _mm256_setzero_ps());

        let all_selected = _mm256_maskz_mov_ps(0b11111111, payload);
        assert_eq_m256(all_selected, payload);
    }

    // Masked move (128-bit, f32): empty mask yields `src`, full mask yields `a`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_mov_ps() {
        let passthru = _mm_set1_ps(1.);
        let payload = _mm_set1_ps(2.);

        let none_selected = _mm_mask_mov_ps(passthru, 0, payload);
        assert_eq_m128(none_selected, passthru);

        let all_selected = _mm_mask_mov_ps(passthru, 0b00001111, payload);
        assert_eq_m128(all_selected, payload);
    }

    // Zero-masked move (128-bit, f32): empty mask yields zeros, full mask yields `a`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_mov_ps() {
        let payload = _mm_set1_ps(2.);

        let none_selected = _mm_maskz_mov_ps(0, payload);
        assert_eq_m128(none_selected, _mm_setzero_ps());

        let all_selected = _mm_maskz_mov_ps(0b00001111, payload);
        assert_eq_m128(all_selected, payload);
    }

    // Adds 1 to every 32-bit lane; the `i32::MAX` lane is expected to wrap
    // around to `i32::MIN`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_add_epi32() {
        #[rustfmt::skip]
        let lhs = _mm512_setr_epi32(
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
        );
        let ones = _mm512_set1_epi32(1);
        #[rustfmt::skip]
        let want = _mm512_setr_epi32(
            1, 2, 0, i32::MIN,
            i32::MIN + 1, 101, -99, -31,
            1, 2, 0, i32::MIN,
            i32::MIN + 1, 101, -99, -31,
        );
        assert_eq_m512i(_mm512_add_epi32(lhs, ones), want);
    }

    // Write-masked add of 1: empty mask returns `src`; with the low eight bits
    // set only the first eight lanes hold the sums.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_add_epi32() {
        #[rustfmt::skip]
        let lhs = _mm512_setr_epi32(
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
        );
        let ones = _mm512_set1_epi32(1);

        // Nothing selected: `src` (here `lhs`) comes back unchanged.
        assert_eq_m512i(_mm512_mask_add_epi32(lhs, 0, lhs, ones), lhs);

        // Low half summed, high half copied from `src`.
        #[rustfmt::skip]
        let want = _mm512_setr_epi32(
            1, 2, 0, i32::MIN,
            i32::MIN + 1, 101, -99, -31,
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
        );
        let got = _mm512_mask_add_epi32(lhs, 0b00000000_11111111, lhs, ones);
        assert_eq_m512i(got, want);
    }

    // Zero-masked add of 1: unselected lanes must be zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_add_epi32() {
        #[rustfmt::skip]
        let lhs = _mm512_setr_epi32(
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
        );
        let ones = _mm512_set1_epi32(1);

        // Nothing selected: all lanes zero.
        assert_eq_m512i(_mm512_maskz_add_epi32(0, lhs, ones), _mm512_setzero_si512());

        // Low half summed, high half zeroed.
        #[rustfmt::skip]
        let want = _mm512_setr_epi32(
            1, 2, 0, i32::MIN,
            i32::MIN + 1, 101, -99, -31,
            0, 0, 0, 0,
            0, 0, 0, 0,
        );
        let got = _mm512_maskz_add_epi32(0b00000000_11111111, lhs, ones);
        assert_eq_m512i(got, want);
    }

    // 256-bit write-masked add of 1, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_add_epi32() {
        let lhs = _mm256_set_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32);
        let ones = _mm256_set1_epi32(1);

        // Empty mask: `src` (here `lhs`) is returned.
        assert_eq_m256i(_mm256_mask_add_epi32(lhs, 0, lhs, ones), lhs);

        // Full mask: every lane is the wrapping sum.
        let want = _mm256_set_epi32(1, 2, 0, i32::MIN, i32::MIN + 1, 101, -99, -31);
        let got = _mm256_mask_add_epi32(lhs, 0b11111111, lhs, ones);
        assert_eq_m256i(got, want);
    }

    // 256-bit zero-masked add of 1, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_add_epi32() {
        let lhs = _mm256_setr_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32);
        let ones = _mm256_set1_epi32(1);

        // Empty mask: all lanes zero.
        assert_eq_m256i(_mm256_maskz_add_epi32(0, lhs, ones), _mm256_setzero_si256());

        // Full mask: every lane is the wrapping sum.
        let want = _mm256_setr_epi32(1, 2, 0, i32::MIN, i32::MIN + 1, 101, -99, -31);
        let got = _mm256_maskz_add_epi32(0b11111111, lhs, ones);
        assert_eq_m256i(got, want);
    }

    // 128-bit write-masked add of 1, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_add_epi32() {
        let lhs = _mm_set_epi32(1, -1, i32::MAX, i32::MIN);
        let ones = _mm_set1_epi32(1);

        // Empty mask: `src` (here `lhs`) is returned.
        assert_eq_m128i(_mm_mask_add_epi32(lhs, 0, lhs, ones), lhs);

        // Full mask: every lane is the wrapping sum.
        let want = _mm_set_epi32(2, 0, i32::MIN, i32::MIN + 1);
        let got = _mm_mask_add_epi32(lhs, 0b00001111, lhs, ones);
        assert_eq_m128i(got, want);
    }

    // 128-bit zero-masked add of 1, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_add_epi32() {
        let lhs = _mm_setr_epi32(1, -1, i32::MAX, i32::MIN);
        let ones = _mm_set1_epi32(1);

        // Empty mask: all lanes zero.
        assert_eq_m128i(_mm_maskz_add_epi32(0, lhs, ones), _mm_setzero_si128());

        // Full mask: every lane is the wrapping sum.
        let want = _mm_setr_epi32(2, 0, i32::MIN, i32::MIN + 1);
        let got = _mm_maskz_add_epi32(0b00001111, lhs, ones);
        assert_eq_m128i(got, want);
    }

    // Adds 1.0 to every `f32` lane; the `f32::MAX` lane is expected to stay
    // at `f32::MAX`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_add_ps() {
        #[rustfmt::skip]
        let lhs = _mm512_setr_ps(
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
        );
        let ones = _mm512_set1_ps(1.);
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            1., 2., 0., f32::MAX,
            f32::MIN + 1., 101., -99., -31.,
            1., 2., 0., f32::MAX,
            f32::MIN + 1., 101., -99., -31.,
        );
        assert_eq_m512(_mm512_add_ps(lhs, ones), want);
    }

    // Write-masked `f32` add of 1.0: empty mask returns `src`; with the low
    // eight bits set only the first eight lanes hold the sums.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_add_ps() {
        #[rustfmt::skip]
        let lhs = _mm512_setr_ps(
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
        );
        let ones = _mm512_set1_ps(1.);

        // Nothing selected: `src` (here `lhs`) comes back unchanged.
        assert_eq_m512(_mm512_mask_add_ps(lhs, 0, lhs, ones), lhs);

        // Low half summed, high half copied from `src`.
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            1., 2., 0., f32::MAX,
            f32::MIN + 1., 101., -99., -31.,
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
        );
        let got = _mm512_mask_add_ps(lhs, 0b00000000_11111111, lhs, ones);
        assert_eq_m512(got, want);
    }

    // Zero-masked `f32` add of 1.0: unselected lanes must be zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_add_ps() {
        #[rustfmt::skip]
        let lhs = _mm512_setr_ps(
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
        );
        let ones = _mm512_set1_ps(1.);

        // Nothing selected: all lanes zero.
        assert_eq_m512(_mm512_maskz_add_ps(0, lhs, ones), _mm512_setzero_ps());

        // Low half summed, high half zeroed.
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            1., 2., 0., f32::MAX,
            f32::MIN + 1., 101., -99., -31.,
            0., 0., 0., 0.,
            0., 0., 0., 0.,
        );
        let got = _mm512_maskz_add_ps(0b00000000_11111111, lhs, ones);
        assert_eq_m512(got, want);
    }

    // 256-bit write-masked `f32` add of 1.0, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_add_ps() {
        let lhs = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.);
        let ones = _mm256_set1_ps(1.);

        // Empty mask: `src` (here `lhs`) is returned.
        assert_eq_m256(_mm256_mask_add_ps(lhs, 0, lhs, ones), lhs);

        // Full mask: every lane is the sum.
        let want = _mm256_set_ps(1., 2., 0., f32::MAX, f32::MIN + 1., 101., -99., -31.);
        let got = _mm256_mask_add_ps(lhs, 0b11111111, lhs, ones);
        assert_eq_m256(got, want);
    }

    // 256-bit zero-masked `f32` add of 1.0, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_add_ps() {
        let lhs = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.);
        let ones = _mm256_set1_ps(1.);

        // Empty mask: all lanes zero.
        assert_eq_m256(_mm256_maskz_add_ps(0, lhs, ones), _mm256_setzero_ps());

        // Full mask: every lane is the sum.
        let want = _mm256_set_ps(1., 2., 0., f32::MAX, f32::MIN + 1., 101., -99., -31.);
        let got = _mm256_maskz_add_ps(0b11111111, lhs, ones);
        assert_eq_m256(got, want);
    }

    // 128-bit write-masked `f32` add of 1.0, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_add_ps() {
        let lhs = _mm_set_ps(1., -1., f32::MAX, f32::MIN);
        let ones = _mm_set1_ps(1.);

        // Empty mask: `src` (here `lhs`) is returned.
        assert_eq_m128(_mm_mask_add_ps(lhs, 0, lhs, ones), lhs);

        // Full mask: every lane is the sum.
        let want = _mm_set_ps(2., 0., f32::MAX, f32::MIN + 1.);
        let got = _mm_mask_add_ps(lhs, 0b00001111, lhs, ones);
        assert_eq_m128(got, want);
    }

    // 128-bit zero-masked `f32` add of 1.0, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_add_ps() {
        let lhs = _mm_set_ps(1., -1., f32::MAX, f32::MIN);
        let ones = _mm_set1_ps(1.);

        // Empty mask: all lanes zero.
        assert_eq_m128(_mm_maskz_add_ps(0, lhs, ones), _mm_setzero_ps());

        // Full mask: every lane is the sum.
        let want = _mm_set_ps(2., 0., f32::MAX, f32::MIN + 1.);
        let got = _mm_maskz_add_ps(0b00001111, lhs, ones);
        assert_eq_m128(got, want);
    }

    // Subtracts 1 from every 32-bit lane; the `i32::MIN` lane is expected to
    // wrap around to `i32::MAX`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_sub_epi32() {
        #[rustfmt::skip]
        let lhs = _mm512_setr_epi32(
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
        );
        let ones = _mm512_set1_epi32(1);
        #[rustfmt::skip]
        let want = _mm512_setr_epi32(
            -1, 0, -2, i32::MAX - 1,
            i32::MAX, 99, -101, -33,
            -1, 0, -2, i32::MAX - 1,
            i32::MAX, 99, -101, -33,
        );
        assert_eq_m512i(_mm512_sub_epi32(lhs, ones), want);
    }

    // Write-masked subtract of 1: empty mask returns `src`; with the low eight
    // bits set only the first eight lanes hold the differences.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_sub_epi32() {
        #[rustfmt::skip]
        let lhs = _mm512_setr_epi32(
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
        );
        let ones = _mm512_set1_epi32(1);

        // Nothing selected: `src` (here `lhs`) comes back unchanged.
        assert_eq_m512i(_mm512_mask_sub_epi32(lhs, 0, lhs, ones), lhs);

        // Low half subtracted, high half copied from `src`.
        #[rustfmt::skip]
        let want = _mm512_setr_epi32(
            -1, 0, -2, i32::MAX - 1,
            i32::MAX, 99, -101, -33,
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
        );
        let got = _mm512_mask_sub_epi32(lhs, 0b00000000_11111111, lhs, ones);
        assert_eq_m512i(got, want);
    }

    // Zero-masked subtract of 1: unselected lanes must be zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_sub_epi32() {
        #[rustfmt::skip]
        let lhs = _mm512_setr_epi32(
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
        );
        let ones = _mm512_set1_epi32(1);

        // Nothing selected: all lanes zero.
        assert_eq_m512i(_mm512_maskz_sub_epi32(0, lhs, ones), _mm512_setzero_si512());

        // Low half subtracted, high half zeroed.
        #[rustfmt::skip]
        let want = _mm512_setr_epi32(
            -1, 0, -2, i32::MAX - 1,
            i32::MAX, 99, -101, -33,
            0, 0, 0, 0,
            0, 0, 0, 0,
        );
        let got = _mm512_maskz_sub_epi32(0b00000000_11111111, lhs, ones);
        assert_eq_m512i(got, want);
    }

    // 256-bit write-masked subtract of 1, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_sub_epi32() {
        let lhs = _mm256_set_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32);
        let ones = _mm256_set1_epi32(1);

        // Empty mask: `src` (here `lhs`) is returned.
        assert_eq_m256i(_mm256_mask_sub_epi32(lhs, 0, lhs, ones), lhs);

        // Full mask: every lane is the wrapping difference.
        let want = _mm256_set_epi32(-1, 0, -2, i32::MAX - 1, i32::MAX, 99, -101, -33);
        let got = _mm256_mask_sub_epi32(lhs, 0b11111111, lhs, ones);
        assert_eq_m256i(got, want);
    }

    // 256-bit zero-masked subtract of 1, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_sub_epi32() {
        let lhs = _mm256_set_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32);
        let ones = _mm256_set1_epi32(1);

        // Empty mask: all lanes zero.
        assert_eq_m256i(_mm256_maskz_sub_epi32(0, lhs, ones), _mm256_setzero_si256());

        // Full mask: every lane is the wrapping difference.
        let want = _mm256_set_epi32(-1, 0, -2, i32::MAX - 1, i32::MAX, 99, -101, -33);
        let got = _mm256_maskz_sub_epi32(0b11111111, lhs, ones);
        assert_eq_m256i(got, want);
    }

    // 128-bit write-masked subtract of 1, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_sub_epi32() {
        let lhs = _mm_set_epi32(1, -1, i32::MAX, i32::MIN);
        let ones = _mm_set1_epi32(1);

        // Empty mask: `src` (here `lhs`) is returned.
        assert_eq_m128i(_mm_mask_sub_epi32(lhs, 0, lhs, ones), lhs);

        // Full mask: every lane is the wrapping difference.
        let want = _mm_set_epi32(0, -2, i32::MAX - 1, i32::MAX);
        let got = _mm_mask_sub_epi32(lhs, 0b00001111, lhs, ones);
        assert_eq_m128i(got, want);
    }

    // 128-bit zero-masked subtract of 1, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_sub_epi32() {
        let lhs = _mm_set_epi32(1, -1, i32::MAX, i32::MIN);
        let ones = _mm_set1_epi32(1);

        // Empty mask: all lanes zero.
        assert_eq_m128i(_mm_maskz_sub_epi32(0, lhs, ones), _mm_setzero_si128());

        // Full mask: every lane is the wrapping difference.
        let want = _mm_set_epi32(0, -2, i32::MAX - 1, i32::MAX);
        let got = _mm_maskz_sub_epi32(0b00001111, lhs, ones);
        assert_eq_m128i(got, want);
    }

    // Subtracts 1.0 from every `f32` lane; the `f32::MIN` lane is expected to
    // stay at `f32::MIN`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_sub_ps() {
        #[rustfmt::skip]
        let lhs = _mm512_setr_ps(
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
        );
        let ones = _mm512_set1_ps(1.);
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            -1., 0., -2., f32::MAX - 1.,
            f32::MIN, 99., -101., -33.,
            -1., 0., -2., f32::MAX - 1.,
            f32::MIN, 99., -101., -33.,
        );
        assert_eq_m512(_mm512_sub_ps(lhs, ones), want);
    }

    // Write-masked `f32` subtract of 1.0: empty mask returns `src`; with the
    // low eight bits set only the first eight lanes hold the differences.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_sub_ps() {
        #[rustfmt::skip]
        let lhs = _mm512_setr_ps(
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
        );
        let ones = _mm512_set1_ps(1.);

        // Nothing selected: `src` (here `lhs`) comes back unchanged.
        assert_eq_m512(_mm512_mask_sub_ps(lhs, 0, lhs, ones), lhs);

        // Low half subtracted, high half copied from `src`.
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            -1., 0., -2., f32::MAX - 1.,
            f32::MIN, 99., -101., -33.,
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
        );
        let got = _mm512_mask_sub_ps(lhs, 0b00000000_11111111, lhs, ones);
        assert_eq_m512(got, want);
    }

    // Zero-masked `f32` subtract of 1.0: unselected lanes must be zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_sub_ps() {
        #[rustfmt::skip]
        let lhs = _mm512_setr_ps(
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
        );
        let ones = _mm512_set1_ps(1.);

        // Nothing selected: all lanes zero.
        assert_eq_m512(_mm512_maskz_sub_ps(0, lhs, ones), _mm512_setzero_ps());

        // Low half subtracted, high half zeroed.
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            -1., 0., -2., f32::MAX - 1.,
            f32::MIN, 99., -101., -33.,
            0., 0., 0., 0.,
            0., 0., 0., 0.,
        );
        let got = _mm512_maskz_sub_ps(0b00000000_11111111, lhs, ones);
        assert_eq_m512(got, want);
    }

    // 256-bit write-masked `f32` subtract of 1.0, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_sub_ps() {
        let lhs = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.);
        let ones = _mm256_set1_ps(1.);

        // Empty mask: `src` (here `lhs`) is returned.
        assert_eq_m256(_mm256_mask_sub_ps(lhs, 0, lhs, ones), lhs);

        // Full mask: every lane is the difference.
        let want = _mm256_set_ps(-1., 0., -2., f32::MAX - 1., f32::MIN, 99., -101., -33.);
        let got = _mm256_mask_sub_ps(lhs, 0b11111111, lhs, ones);
        assert_eq_m256(got, want);
    }

    // 256-bit zero-masked `f32` subtract of 1.0, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_sub_ps() {
        let lhs = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.);
        let ones = _mm256_set1_ps(1.);

        // Empty mask: all lanes zero.
        assert_eq_m256(_mm256_maskz_sub_ps(0, lhs, ones), _mm256_setzero_ps());

        // Full mask: every lane is the difference.
        let want = _mm256_set_ps(-1., 0., -2., f32::MAX - 1., f32::MIN, 99., -101., -33.);
        let got = _mm256_maskz_sub_ps(0b11111111, lhs, ones);
        assert_eq_m256(got, want);
    }

    // 128-bit write-masked `f32` subtract of 1.0, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_sub_ps() {
        let lhs = _mm_set_ps(1., -1., f32::MAX, f32::MIN);
        let ones = _mm_set1_ps(1.);

        // Empty mask: `src` (here `lhs`) is returned.
        assert_eq_m128(_mm_mask_sub_ps(lhs, 0, lhs, ones), lhs);

        // Full mask: every lane is the difference.
        let want = _mm_set_ps(0., -2., f32::MAX - 1., f32::MIN);
        let got = _mm_mask_sub_ps(lhs, 0b00001111, lhs, ones);
        assert_eq_m128(got, want);
    }

    // 128-bit zero-masked `f32` subtract of 1.0, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_sub_ps() {
        let lhs = _mm_set_ps(1., -1., f32::MAX, f32::MIN);
        let ones = _mm_set1_ps(1.);

        // Empty mask: all lanes zero.
        assert_eq_m128(_mm_maskz_sub_ps(0, lhs, ones), _mm_setzero_ps());

        // Full mask: every lane is the difference.
        let want = _mm_set_ps(0., -2., f32::MAX - 1., f32::MIN);
        let got = _mm_maskz_sub_ps(0b00001111, lhs, ones);
        assert_eq_m128(got, want);
    }

    // Low 32 bits of each lane multiplied by 2: `i32::MAX * 2` truncates to -2
    // and `i32::MIN * 2` truncates to 0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mullo_epi32() {
        #[rustfmt::skip]
        let lhs = _mm512_setr_epi32(
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
        );
        let twos = _mm512_set1_epi32(2);
        #[rustfmt::skip]
        let want = _mm512_setr_epi32(
            0, 2, -2, -2,
            0, 200, -200, -64,
            0, 2, -2, -2,
            0, 200, -200, -64,
        );
        assert_eq_m512i(_mm512_mullo_epi32(lhs, twos), want);
    }

    // Write-masked low multiply by 2: empty mask returns `src`; with the low
    // eight bits set only the first eight lanes hold the products.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_mullo_epi32() {
        #[rustfmt::skip]
        let lhs = _mm512_setr_epi32(
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
        );
        let twos = _mm512_set1_epi32(2);

        // Nothing selected: `src` (here `lhs`) comes back unchanged.
        assert_eq_m512i(_mm512_mask_mullo_epi32(lhs, 0, lhs, twos), lhs);

        // Low half multiplied, high half copied from `src`.
        #[rustfmt::skip]
        let want = _mm512_setr_epi32(
            0, 2, -2, -2,
            0, 200, -200, -64,
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
        );
        let got = _mm512_mask_mullo_epi32(lhs, 0b00000000_11111111, lhs, twos);
        assert_eq_m512i(got, want);
    }

    // Zero-masked low multiply by 2: unselected lanes must be zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_mullo_epi32() {
        #[rustfmt::skip]
        let lhs = _mm512_setr_epi32(
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
            0, 1, -1, i32::MAX,
            i32::MIN, 100, -100, -32,
        );
        let twos = _mm512_set1_epi32(2);

        // Nothing selected: all lanes zero.
        assert_eq_m512i(_mm512_maskz_mullo_epi32(0, lhs, twos), _mm512_setzero_si512());

        // Low half multiplied, high half zeroed.
        #[rustfmt::skip]
        let want = _mm512_setr_epi32(
            0, 2, -2, -2,
            0, 200, -200, -64,
            0, 0, 0, 0,
            0, 0, 0, 0,
        );
        let got = _mm512_maskz_mullo_epi32(0b00000000_11111111, lhs, twos);
        assert_eq_m512i(got, want);
    }

    // 256-bit write-masked low multiply by 2, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_mullo_epi32() {
        let lhs = _mm256_set_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32);
        let twos = _mm256_set1_epi32(2);

        // Empty mask: `src` (here `lhs`) is returned.
        assert_eq_m256i(_mm256_mask_mullo_epi32(lhs, 0, lhs, twos), lhs);

        // Full mask: every lane is the truncated product.
        let want = _mm256_set_epi32(0, 2, -2, -2, 0, 200, -200, -64);
        let got = _mm256_mask_mullo_epi32(lhs, 0b11111111, lhs, twos);
        assert_eq_m256i(got, want);
    }

    // 256-bit zero-masked low multiply by 2, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_mullo_epi32() {
        let lhs = _mm256_set_epi32(0, 1, -1, i32::MAX, i32::MIN, 100, -100, -32);
        let twos = _mm256_set1_epi32(2);

        // Empty mask: all lanes zero.
        assert_eq_m256i(_mm256_maskz_mullo_epi32(0, lhs, twos), _mm256_setzero_si256());

        // Full mask: every lane is the truncated product.
        let want = _mm256_set_epi32(0, 2, -2, -2, 0, 200, -200, -64);
        let got = _mm256_maskz_mullo_epi32(0b11111111, lhs, twos);
        assert_eq_m256i(got, want);
    }

    // 128-bit write-masked low multiply by 2, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_mullo_epi32() {
        let lhs = _mm_set_epi32(1, -1, i32::MAX, i32::MIN);
        let twos = _mm_set1_epi32(2);

        // Empty mask: `src` (here `lhs`) is returned.
        assert_eq_m128i(_mm_mask_mullo_epi32(lhs, 0, lhs, twos), lhs);

        // Full mask: every lane is the truncated product.
        let want = _mm_set_epi32(2, -2, -2, 0);
        let got = _mm_mask_mullo_epi32(lhs, 0b00001111, lhs, twos);
        assert_eq_m128i(got, want);
    }

    // 128-bit zero-masked low multiply by 2, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_mullo_epi32() {
        let lhs = _mm_set_epi32(1, -1, i32::MAX, i32::MIN);
        let twos = _mm_set1_epi32(2);

        // Empty mask: all lanes zero.
        assert_eq_m128i(_mm_maskz_mullo_epi32(0, lhs, twos), _mm_setzero_si128());

        // Full mask: every lane is the truncated product.
        let want = _mm_set_epi32(2, -2, -2, 0);
        let got = _mm_maskz_mullo_epi32(0b00001111, lhs, twos);
        assert_eq_m128i(got, want);
    }

    // Multiplies every `f32` lane by 2.0; the `f32::MAX` / `f32::MIN` lanes are
    // expected to overflow to positive / negative infinity.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mul_ps() {
        #[rustfmt::skip]
        let lhs = _mm512_setr_ps(
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
        );
        let twos = _mm512_set1_ps(2.);
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0., 2., -2., f32::INFINITY,
            f32::NEG_INFINITY, 200., -200., -64.,
            0., 2., -2., f32::INFINITY,
            f32::NEG_INFINITY, 200., -200., -64.,
        );
        assert_eq_m512(_mm512_mul_ps(lhs, twos), want);
    }

    // `_mm512_mask_mul_ps`: an empty mask must hand back `src`; with the low eight mask bits
    // set, only the first eight lanes are doubled.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_mul_ps() {
        #[rustfmt::skip]
        let values = _mm512_setr_ps(
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
        );
        let twos = _mm512_set1_ps(2.);

        // Mask 0: the result must equal the `src` argument (`values`).
        let kept = _mm512_mask_mul_ps(values, 0, values, twos);
        assert_eq_m512(kept, values);

        // Lanes 0..=7 hold products, lanes 8..=15 are copied from `src`.
        let got = _mm512_mask_mul_ps(values, 0b00000000_11111111, values, twos);
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0., 2., -2., f32::INFINITY,
            f32::NEG_INFINITY, 200., -200., -64.,
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
        );
        assert_eq_m512(got, want);
    }

    // `_mm512_maskz_mul_ps`: an empty mask zeroes the result; with the low eight mask bits
    // set, only the first eight lanes hold products.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_mul_ps() {
        #[rustfmt::skip]
        let values = _mm512_setr_ps(
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
            0., 1., -1., f32::MAX,
            f32::MIN, 100., -100., -32.,
        );
        let twos = _mm512_set1_ps(2.);

        // Mask 0: every lane must be zero.
        let zeroed = _mm512_maskz_mul_ps(0, values, twos);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        // Lanes 0..=7 hold products, lanes 8..=15 are zeroed.
        let got = _mm512_maskz_mul_ps(0b00000000_11111111, values, twos);
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0., 2., -2., f32::INFINITY,
            f32::NEG_INFINITY, 200., -200., -64.,
            0., 0., 0., 0.,
            0., 0., 0., 0.,
        );
        assert_eq_m512(got, want);
    }

    // `_mm256_mask_mul_ps`: an empty mask must hand back `src`; a full mask doubles every
    // lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_mul_ps() {
        let values = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.);
        let twos = _mm256_set1_ps(2.);

        // Mask 0: the result must equal the `src` argument (`values`).
        let kept = _mm256_mask_mul_ps(values, 0, values, twos);
        assert_eq_m256(kept, values);

        // All eight mask bits set; `f32::MAX`/`f32::MIN` overflow to the infinities.
        let got = _mm256_mask_mul_ps(values, 0b11111111, values, twos);
        #[rustfmt::skip]
        let want = _mm256_set_ps(
            0., 2., -2., f32::INFINITY,
            f32::NEG_INFINITY, 200., -200., -64.,
        );
        assert_eq_m256(got, want);
    }

    // `_mm256_maskz_mul_ps`: an empty mask zeroes the result; a full mask doubles every
    // lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_mul_ps() {
        let values = _mm256_set_ps(0., 1., -1., f32::MAX, f32::MIN, 100., -100., -32.);
        let twos = _mm256_set1_ps(2.);

        // Mask 0: every lane must be zero.
        let zeroed = _mm256_maskz_mul_ps(0, values, twos);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        // All eight mask bits set; `f32::MAX`/`f32::MIN` overflow to the infinities.
        let got = _mm256_maskz_mul_ps(0b11111111, values, twos);
        #[rustfmt::skip]
        let want = _mm256_set_ps(
            0., 2., -2., f32::INFINITY,
            f32::NEG_INFINITY, 200., -200., -64.,
        );
        assert_eq_m256(got, want);
    }

    // `_mm_mask_mul_ps`: an empty mask must hand back `src`; a four-bit mask doubles every
    // lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_mul_ps() {
        let values = _mm_set_ps(1., -1., f32::MAX, f32::MIN);
        let twos = _mm_set1_ps(2.);

        // Mask 0: the result must equal the `src` argument (`values`).
        let kept = _mm_mask_mul_ps(values, 0, values, twos);
        assert_eq_m128(kept, values);

        // Low four mask bits set; `f32::MAX`/`f32::MIN` overflow to the infinities.
        let got = _mm_mask_mul_ps(values, 0b00001111, values, twos);
        let want = _mm_set_ps(2., -2., f32::INFINITY, f32::NEG_INFINITY);
        assert_eq_m128(got, want);
    }

    // `_mm_maskz_mul_ps`: an empty mask zeroes the result; a four-bit mask doubles every
    // lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_mul_ps() {
        let values = _mm_set_ps(1., -1., f32::MAX, f32::MIN);
        let twos = _mm_set1_ps(2.);

        // Mask 0: every lane must be zero.
        let zeroed = _mm_maskz_mul_ps(0, values, twos);
        assert_eq_m128(zeroed, _mm_setzero_ps());

        // Low four mask bits set; `f32::MAX`/`f32::MIN` overflow to the infinities.
        let got = _mm_maskz_mul_ps(0b00001111, values, twos);
        let want = _mm_set_ps(2., -2., f32::INFINITY, f32::NEG_INFINITY);
        assert_eq_m128(got, want);
    }

    // `_mm512_div_ps`: lane-wise quotients, including a non-zero value divided by zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_div_ps() {
        let a = _mm512_setr_ps(
            0., 1., -1., -2., 100., 100., -100., -32., 0., 1., -1., 1000., -131., 100., -100., -32.,
        );
        // Lanes 5 and 12 are the only zero divisors.
        let b = _mm512_setr_ps(
            2., 2., 2., 2., 2., 0., 2., 2., 2., 2., 2., 2., 0., 2., 2., 2.,
        );
        let r = _mm512_div_ps(a, b);
        #[rustfmt::skip]
        let e = _mm512_setr_ps(
            0., 0.5, -0.5, -1.,
            50., f32::INFINITY, -50., -16.,
            0., 0.5, -0.5, 500.,
            f32::NEG_INFINITY, 50., -50., -16.,
        );
        // No lane is 0/0 (which would be NaN): 100/0 is +inf and -131/0 is -inf, so an
        // exact comparison is valid here.
        assert_eq_m512(r, e);
    }

    // `_mm512_mask_div_ps`: an empty mask must hand back `src`; with the low eight mask bits
    // set, only the first eight lanes are divided.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_div_ps() {
        let numer = _mm512_setr_ps(
            0., 1., -1., -2., 100., 100., -100., -32., 0., 1., -1., 1000., -131., 100., -100., -32.,
        );
        let denom = _mm512_setr_ps(
            2., 2., 2., 2., 2., 0., 2., 2., 2., 2., 2., 2., 0., 2., 2., 2.,
        );

        // Mask 0: the result must equal the `src` argument (`numer`).
        let kept = _mm512_mask_div_ps(numer, 0, numer, denom);
        assert_eq_m512(kept, numer);

        // Lanes 0..=7 hold quotients (100/0 gives +inf), lanes 8..=15 are copied from `src`.
        let got = _mm512_mask_div_ps(numer, 0b00000000_11111111, numer, denom);
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0., 0.5, -0.5, -1.,
            50., f32::INFINITY, -50., -16.,
            0., 1., -1., 1000.,
            -131., 100., -100., -32.,
        );
        assert_eq_m512(got, want);
    }

    // `_mm512_maskz_div_ps`: an empty mask zeroes the result; with the low eight mask bits
    // set, only the first eight lanes hold quotients.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_div_ps() {
        let numer = _mm512_setr_ps(
            0., 1., -1., -2., 100., 100., -100., -32., 0., 1., -1., 1000., -131., 100., -100., -32.,
        );
        let denom = _mm512_setr_ps(
            2., 2., 2., 2., 2., 0., 2., 2., 2., 2., 2., 2., 0., 2., 2., 2.,
        );

        // Mask 0: every lane must be zero.
        let zeroed = _mm512_maskz_div_ps(0, numer, denom);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        // Lanes 0..=7 hold quotients (100/0 gives +inf), lanes 8..=15 are zeroed.
        let got = _mm512_maskz_div_ps(0b00000000_11111111, numer, denom);
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0., 0.5, -0.5, -1.,
            50., f32::INFINITY, -50., -16.,
            0., 0., 0., 0.,
            0., 0., 0., 0.,
        );
        assert_eq_m512(got, want);
    }

    // `_mm256_mask_div_ps`: an empty mask must hand back `src`; a full mask divides every
    // lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_div_ps() {
        let numer = _mm256_set_ps(0., 1., -1., -2., 100., 100., -100., -32.);
        let denom = _mm256_set_ps(2., 2., 2., 2., 2., 0., 2., 2.);

        // Mask 0: the result must equal the `src` argument (`numer`).
        let kept = _mm256_mask_div_ps(numer, 0, numer, denom);
        assert_eq_m256(kept, numer);

        // All eight mask bits set; the 100/0 lane is expected to be +inf.
        let got = _mm256_mask_div_ps(numer, 0b11111111, numer, denom);
        let want = _mm256_set_ps(0., 0.5, -0.5, -1., 50., f32::INFINITY, -50., -16.);
        assert_eq_m256(got, want);
    }

    // `_mm256_maskz_div_ps`: an empty mask zeroes the result; a full mask divides every
    // lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_div_ps() {
        let numer = _mm256_set_ps(0., 1., -1., -2., 100., 100., -100., -32.);
        let denom = _mm256_set_ps(2., 2., 2., 2., 2., 0., 2., 2.);

        // Mask 0: every lane must be zero.
        let zeroed = _mm256_maskz_div_ps(0, numer, denom);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        // All eight mask bits set; the 100/0 lane is expected to be +inf.
        let got = _mm256_maskz_div_ps(0b11111111, numer, denom);
        let want = _mm256_set_ps(0., 0.5, -0.5, -1., 50., f32::INFINITY, -50., -16.);
        assert_eq_m256(got, want);
    }

    // `_mm_mask_div_ps`: an empty mask must hand back `src`; a four-bit mask divides every
    // lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_div_ps() {
        let numer = _mm_set_ps(100., 100., -100., -32.);
        let denom = _mm_set_ps(2., 0., 2., 2.);

        // Mask 0: the result must equal the `src` argument (`numer`).
        let kept = _mm_mask_div_ps(numer, 0, numer, denom);
        assert_eq_m128(kept, numer);

        // Low four mask bits set; the 100/0 lane is expected to be +inf.
        let got = _mm_mask_div_ps(numer, 0b00001111, numer, denom);
        let want = _mm_set_ps(50., f32::INFINITY, -50., -16.);
        assert_eq_m128(got, want);
    }

    // `_mm_maskz_div_ps`: an empty mask zeroes the result; a four-bit mask divides every
    // lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_div_ps() {
        let numer = _mm_set_ps(100., 100., -100., -32.);
        let denom = _mm_set_ps(2., 0., 2., 2.);

        // Mask 0: every lane must be zero.
        let zeroed = _mm_maskz_div_ps(0, numer, denom);
        assert_eq_m128(zeroed, _mm_setzero_ps());

        // Low four mask bits set; the 100/0 lane is expected to be +inf.
        let got = _mm_maskz_div_ps(0b00001111, numer, denom);
        let want = _mm_set_ps(50., f32::INFINITY, -50., -16.);
        assert_eq_m128(got, want);
    }

    // `_mm512_max_epi32`: lane-wise maximum of an ascending and a descending sequence.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_max_epi32() {
        let lhs = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let rhs = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
        let got = _mm512_max_epi32(lhs, rhs);
        // `rhs` wins in lanes 0..=7, `lhs` wins in lanes 8..=15.
        let want = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m512i(got, want);
    }

    // `_mm512_mask_max_epi32`: an empty mask must hand back `src`; with the low eight mask
    // bits set, only the first eight lanes take the maximum.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_max_epi32() {
        let lhs = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let rhs = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

        // Mask 0: the result must equal the `src` argument (`lhs`).
        let kept = _mm512_mask_max_epi32(lhs, 0, lhs, rhs);
        assert_eq_m512i(kept, lhs);

        // Lanes 0..=7 hold the maximum, lanes 8..=15 are copied from `src`.
        let got = _mm512_mask_max_epi32(lhs, 0b00000000_11111111, lhs, rhs);
        let want = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m512i(got, want);
    }

    // `_mm512_maskz_max_epi32`: an empty mask zeroes the result; with the low eight mask
    // bits set, only the first eight lanes hold the maximum.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_max_epi32() {
        let lhs = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let rhs = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

        // Mask 0: every lane must be zero.
        let zeroed = _mm512_maskz_max_epi32(0, lhs, rhs);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());

        // Lanes 0..=7 hold the maximum, lanes 8..=15 are zeroed.
        let got = _mm512_maskz_max_epi32(0b00000000_11111111, lhs, rhs);
        let want = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m512i(got, want);
    }

    // `_mm256_mask_max_epi32`: an empty mask must hand back `src`; a full mask takes the
    // maximum in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_max_epi32() {
        let lhs = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let rhs = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);

        // Mask 0: the result must equal the `src` argument (`lhs`).
        let kept = _mm256_mask_max_epi32(lhs, 0, lhs, rhs);
        assert_eq_m256i(kept, lhs);

        // All eight mask bits set.
        let got = _mm256_mask_max_epi32(lhs, 0b11111111, lhs, rhs);
        let want = _mm256_set_epi32(7, 6, 5, 4, 4, 5, 6, 7);
        assert_eq_m256i(got, want);
    }

    // `_mm256_maskz_max_epi32`: an empty mask zeroes the result; a full mask takes the
    // maximum in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_max_epi32() {
        let lhs = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let rhs = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);

        // Mask 0: every lane must be zero.
        let zeroed = _mm256_maskz_max_epi32(0, lhs, rhs);
        assert_eq_m256i(zeroed, _mm256_setzero_si256());

        // All eight mask bits set.
        let got = _mm256_maskz_max_epi32(0b11111111, lhs, rhs);
        let want = _mm256_set_epi32(7, 6, 5, 4, 4, 5, 6, 7);
        assert_eq_m256i(got, want);
    }

    // `_mm_mask_max_epi32`: an empty mask must hand back `src`; a four-bit mask takes the
    // maximum in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_max_epi32() {
        let lhs = _mm_set_epi32(0, 1, 2, 3);
        let rhs = _mm_set_epi32(3, 2, 1, 0);

        // Mask 0: the result must equal the `src` argument (`lhs`).
        let kept = _mm_mask_max_epi32(lhs, 0, lhs, rhs);
        assert_eq_m128i(kept, lhs);

        // Low four mask bits set.
        let got = _mm_mask_max_epi32(lhs, 0b00001111, lhs, rhs);
        let want = _mm_set_epi32(3, 2, 2, 3);
        assert_eq_m128i(got, want);
    }

    // `_mm_maskz_max_epi32`: an empty mask zeroes the result; a four-bit mask takes the
    // maximum in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_max_epi32() {
        let lhs = _mm_set_epi32(0, 1, 2, 3);
        let rhs = _mm_set_epi32(3, 2, 1, 0);

        // Mask 0: every lane must be zero.
        let zeroed = _mm_maskz_max_epi32(0, lhs, rhs);
        assert_eq_m128i(zeroed, _mm_setzero_si128());

        // Low four mask bits set.
        let got = _mm_maskz_max_epi32(0b00001111, lhs, rhs);
        let want = _mm_set_epi32(3, 2, 2, 3);
        assert_eq_m128i(got, want);
    }

    // `_mm512_max_ps`: lane-wise maximum of an ascending and a descending sequence.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_max_ps() {
        let lhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let rhs = _mm512_setr_ps(
            15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
        );
        let got = _mm512_max_ps(lhs, rhs);
        // `rhs` wins in lanes 0..=7, `lhs` wins in lanes 8..=15.
        let want = _mm512_setr_ps(
            15., 14., 13., 12., 11., 10., 9., 8., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        assert_eq_m512(got, want);
    }

    // `_mm512_mask_max_ps`: an empty mask must hand back `src`; with the low eight mask bits
    // set, only the first eight lanes take the maximum.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_max_ps() {
        let lhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let rhs = _mm512_setr_ps(
            15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
        );

        // Mask 0: the result must equal the `src` argument (`lhs`).
        let kept = _mm512_mask_max_ps(lhs, 0, lhs, rhs);
        assert_eq_m512(kept, lhs);

        // Lanes 0..=7 hold the maximum, lanes 8..=15 are copied from `src`.
        let got = _mm512_mask_max_ps(lhs, 0b00000000_11111111, lhs, rhs);
        let want = _mm512_setr_ps(
            15., 14., 13., 12., 11., 10., 9., 8., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        assert_eq_m512(got, want);
    }

    // `_mm512_maskz_max_ps`: an empty mask zeroes the result; with the low eight mask bits
    // set, only the first eight lanes hold the maximum.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_max_ps() {
        let lhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let rhs = _mm512_setr_ps(
            15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
        );

        // Mask 0: every lane must be zero.
        let zeroed = _mm512_maskz_max_ps(0, lhs, rhs);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        // Lanes 0..=7 hold the maximum, lanes 8..=15 are zeroed.
        let got = _mm512_maskz_max_ps(0b00000000_11111111, lhs, rhs);
        let want = _mm512_setr_ps(
            15., 14., 13., 12., 11., 10., 9., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        assert_eq_m512(got, want);
    }

    // `_mm256_mask_max_ps`: an empty mask must hand back `src`; a full mask takes the
    // maximum in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_max_ps() {
        let lhs = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let rhs = _mm256_set_ps(7., 6., 5., 4., 3., 2., 1., 0.);

        // Mask 0: the result must equal the `src` argument (`lhs`).
        let kept = _mm256_mask_max_ps(lhs, 0, lhs, rhs);
        assert_eq_m256(kept, lhs);

        // All eight mask bits set.
        let got = _mm256_mask_max_ps(lhs, 0b11111111, lhs, rhs);
        let want = _mm256_set_ps(7., 6., 5., 4., 4., 5., 6., 7.);
        assert_eq_m256(got, want);
    }

    // `_mm256_maskz_max_ps`: an empty mask zeroes the result; a full mask takes the maximum
    // in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_max_ps() {
        let lhs = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let rhs = _mm256_set_ps(7., 6., 5., 4., 3., 2., 1., 0.);

        // Mask 0: every lane must be zero.
        let zeroed = _mm256_maskz_max_ps(0, lhs, rhs);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        // All eight mask bits set.
        let got = _mm256_maskz_max_ps(0b11111111, lhs, rhs);
        let want = _mm256_set_ps(7., 6., 5., 4., 4., 5., 6., 7.);
        assert_eq_m256(got, want);
    }

    // `_mm_mask_max_ps`: an empty mask must hand back `src`; a four-bit mask takes the
    // maximum in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_max_ps() {
        let lhs = _mm_set_ps(0., 1., 2., 3.);
        let rhs = _mm_set_ps(3., 2., 1., 0.);

        // Mask 0: the result must equal the `src` argument (`lhs`).
        let kept = _mm_mask_max_ps(lhs, 0, lhs, rhs);
        assert_eq_m128(kept, lhs);

        // Low four mask bits set.
        let got = _mm_mask_max_ps(lhs, 0b00001111, lhs, rhs);
        let want = _mm_set_ps(3., 2., 2., 3.);
        assert_eq_m128(got, want);
    }

    // `_mm_maskz_max_ps`: an empty mask zeroes the result; a four-bit mask takes the maximum
    // in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_max_ps() {
        let a = _mm_set_ps(0., 1., 2., 3.);
        let b = _mm_set_ps(3., 2., 1., 0.);

        // Mask 0: every lane must be zero.
        let r = _mm_maskz_max_ps(0, a, b);
        assert_eq_m128(r, _mm_setzero_ps());

        // Low four mask bits set. This must call the zero-masking intrinsic under test; the
        // previous code called `_mm_mask_max_ps` here, leaving this path untested.
        let r = _mm_maskz_max_ps(0b00001111, a, b);
        let e = _mm_set_ps(3., 2., 2., 3.);
        assert_eq_m128(r, e);
    }

    // `_mm512_max_epu32`: lane-wise maximum of an ascending and a descending sequence.
    // NOTE(review): all inputs are small non-negative values, so this cannot tell an
    // unsigned comparison from a signed one.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_max_epu32() {
        let lhs = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let rhs = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
        let got = _mm512_max_epu32(lhs, rhs);
        // `rhs` wins in lanes 0..=7, `lhs` wins in lanes 8..=15.
        let want = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m512i(got, want);
    }

    // `_mm512_mask_max_epu32`: an empty mask must hand back `src`; with the low eight mask
    // bits set, only the first eight lanes take the maximum.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_max_epu32() {
        let lhs = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let rhs = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

        // Mask 0: the result must equal the `src` argument (`lhs`).
        let kept = _mm512_mask_max_epu32(lhs, 0, lhs, rhs);
        assert_eq_m512i(kept, lhs);

        // Lanes 0..=7 hold the maximum, lanes 8..=15 are copied from `src`.
        let got = _mm512_mask_max_epu32(lhs, 0b00000000_11111111, lhs, rhs);
        let want = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m512i(got, want);
    }

    // `_mm512_maskz_max_epu32`: an empty mask zeroes the result; with the low eight mask
    // bits set, only the first eight lanes hold the maximum.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_max_epu32() {
        let lhs = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let rhs = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

        // Mask 0: every lane must be zero.
        let zeroed = _mm512_maskz_max_epu32(0, lhs, rhs);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());

        // Lanes 0..=7 hold the maximum, lanes 8..=15 are zeroed.
        let got = _mm512_maskz_max_epu32(0b00000000_11111111, lhs, rhs);
        let want = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m512i(got, want);
    }

    // `_mm256_mask_max_epu32`: an empty mask must hand back `src`; a full mask takes the
    // maximum in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_max_epu32() {
        let lhs = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let rhs = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);

        // Mask 0: the result must equal the `src` argument (`lhs`).
        let kept = _mm256_mask_max_epu32(lhs, 0, lhs, rhs);
        assert_eq_m256i(kept, lhs);

        // All eight mask bits set.
        let got = _mm256_mask_max_epu32(lhs, 0b11111111, lhs, rhs);
        let want = _mm256_set_epi32(7, 6, 5, 4, 4, 5, 6, 7);
        assert_eq_m256i(got, want);
    }

    // `_mm256_maskz_max_epu32`: an empty mask zeroes the result; a full mask takes the
    // maximum in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_max_epu32() {
        let lhs = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let rhs = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);

        // Mask 0: every lane must be zero.
        let zeroed = _mm256_maskz_max_epu32(0, lhs, rhs);
        assert_eq_m256i(zeroed, _mm256_setzero_si256());

        // All eight mask bits set.
        let got = _mm256_maskz_max_epu32(0b11111111, lhs, rhs);
        let want = _mm256_set_epi32(7, 6, 5, 4, 4, 5, 6, 7);
        assert_eq_m256i(got, want);
    }

    // `_mm_mask_max_epu32`: an empty mask must hand back `src`; a four-bit mask takes the
    // maximum in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_max_epu32() {
        let lhs = _mm_set_epi32(0, 1, 2, 3);
        let rhs = _mm_set_epi32(3, 2, 1, 0);

        // Mask 0: the result must equal the `src` argument (`lhs`).
        let kept = _mm_mask_max_epu32(lhs, 0, lhs, rhs);
        assert_eq_m128i(kept, lhs);

        // Low four mask bits set.
        let got = _mm_mask_max_epu32(lhs, 0b00001111, lhs, rhs);
        let want = _mm_set_epi32(3, 2, 2, 3);
        assert_eq_m128i(got, want);
    }

    // `_mm_maskz_max_epu32`: an empty mask zeroes the result; a four-bit mask takes the
    // maximum in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_max_epu32() {
        let lhs = _mm_set_epi32(0, 1, 2, 3);
        let rhs = _mm_set_epi32(3, 2, 1, 0);

        // Mask 0: every lane must be zero.
        let zeroed = _mm_maskz_max_epu32(0, lhs, rhs);
        assert_eq_m128i(zeroed, _mm_setzero_si128());

        // Low four mask bits set.
        let got = _mm_maskz_max_epu32(0b00001111, lhs, rhs);
        let want = _mm_set_epi32(3, 2, 2, 3);
        assert_eq_m128i(got, want);
    }

    // `_mm512_min_epi32`: lane-wise minimum of an ascending and a descending sequence.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_min_epi32() {
        let lhs = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let rhs = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
        let got = _mm512_min_epi32(lhs, rhs);
        // `lhs` wins in lanes 0..=7, `rhs` wins in lanes 8..=15.
        let want = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0);
        assert_eq_m512i(got, want);
    }

    // `_mm512_mask_min_epi32`: an empty mask must hand back `src`; a partial mask must take
    // the minimum only in the selected lanes.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_min_epi32() {
        let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

        // Mask 0: the result must equal the `src` argument (`a`).
        let r = _mm512_mask_min_epi32(a, 0, a, b);
        assert_eq_m512i(r, a);

        // Low eight lanes selected. There `a` is already the minimum, so the expected vector
        // happens to equal `src`; on its own this cannot detect a no-op implementation.
        let r = _mm512_mask_min_epi32(a, 0b00000000_11111111, a, b);
        let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m512i(r, e);

        // High eight lanes selected. There `b` is the minimum, so the result must differ
        // from `src`, proving the operation is actually applied.
        let r = _mm512_mask_min_epi32(a, 0b11111111_00000000, a, b);
        let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0);
        assert_eq_m512i(r, e);
    }

    // `_mm512_maskz_min_epi32`: an empty mask zeroes the result; with the low eight mask
    // bits set, only the first eight lanes hold the minimum.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_min_epi32() {
        let lhs = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let rhs = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

        // Mask 0: every lane must be zero.
        let zeroed = _mm512_maskz_min_epi32(0, lhs, rhs);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());

        // Lanes 0..=7 hold the minimum, lanes 8..=15 are zeroed.
        let got = _mm512_maskz_min_epi32(0b00000000_11111111, lhs, rhs);
        let want = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m512i(got, want);
    }

    // `_mm256_mask_min_epi32`: an empty mask must hand back `src`; a full mask takes the
    // minimum in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_min_epi32() {
        let lhs = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let rhs = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);

        // Mask 0: the result must equal the `src` argument (`lhs`).
        let kept = _mm256_mask_min_epi32(lhs, 0, lhs, rhs);
        assert_eq_m256i(kept, lhs);

        // All eight mask bits set.
        let got = _mm256_mask_min_epi32(lhs, 0b11111111, lhs, rhs);
        let want = _mm256_set_epi32(0, 1, 2, 3, 3, 2, 1, 0);
        assert_eq_m256i(got, want);
    }

    // `_mm256_maskz_min_epi32`: an empty mask zeroes the result; a full mask takes the
    // minimum in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_min_epi32() {
        let lhs = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let rhs = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);

        // Mask 0: every lane must be zero.
        let zeroed = _mm256_maskz_min_epi32(0, lhs, rhs);
        assert_eq_m256i(zeroed, _mm256_setzero_si256());

        // All eight mask bits set.
        let got = _mm256_maskz_min_epi32(0b11111111, lhs, rhs);
        let want = _mm256_set_epi32(0, 1, 2, 3, 3, 2, 1, 0);
        assert_eq_m256i(got, want);
    }

    // `_mm_mask_min_epi32`: an empty mask must hand back `src`; a four-bit mask takes the
    // minimum in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_min_epi32() {
        let lhs = _mm_set_epi32(0, 1, 2, 3);
        let rhs = _mm_set_epi32(3, 2, 1, 0);

        // Mask 0: the result must equal the `src` argument (`lhs`).
        let kept = _mm_mask_min_epi32(lhs, 0, lhs, rhs);
        assert_eq_m128i(kept, lhs);

        // Low four mask bits set.
        let got = _mm_mask_min_epi32(lhs, 0b00001111, lhs, rhs);
        let want = _mm_set_epi32(0, 1, 1, 0);
        assert_eq_m128i(got, want);
    }

    // `_mm_maskz_min_epi32`: an empty mask zeroes the result; a four-bit mask takes the
    // minimum in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_min_epi32() {
        let lhs = _mm_set_epi32(0, 1, 2, 3);
        let rhs = _mm_set_epi32(3, 2, 1, 0);

        // Mask 0: every lane must be zero.
        let zeroed = _mm_maskz_min_epi32(0, lhs, rhs);
        assert_eq_m128i(zeroed, _mm_setzero_si128());

        // Low four mask bits set.
        let got = _mm_maskz_min_epi32(0b00001111, lhs, rhs);
        let want = _mm_set_epi32(0, 1, 1, 0);
        assert_eq_m128i(got, want);
    }

    // `_mm512_min_ps`: lane-wise minimum of an ascending and a descending sequence.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_min_ps() {
        let lhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let rhs = _mm512_setr_ps(
            15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
        );
        let got = _mm512_min_ps(lhs, rhs);
        // `lhs` wins in lanes 0..=7, `rhs` wins in lanes 8..=15.
        let want = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 7., 6., 5., 4., 3., 2., 1., 0.,
        );
        assert_eq_m512(got, want);
    }

    // `_mm512_mask_min_ps`: an empty mask must hand back `src`; a partial mask must take the
    // minimum only in the selected lanes.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_min_ps() {
        let a = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let b = _mm512_setr_ps(
            15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
        );

        // Mask 0: the result must equal the `src` argument (`a`).
        let r = _mm512_mask_min_ps(a, 0, a, b);
        assert_eq_m512(r, a);

        // Low eight lanes selected. There `a` is already the minimum, so the expected vector
        // happens to equal `src`; on its own this cannot detect a no-op implementation.
        let r = _mm512_mask_min_ps(a, 0b00000000_11111111, a, b);
        let e = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        assert_eq_m512(r, e);

        // High eight lanes selected. There `b` is the minimum, so the result must differ
        // from `src`, proving the operation is actually applied.
        let r = _mm512_mask_min_ps(a, 0b11111111_00000000, a, b);
        let e = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 7., 6., 5., 4., 3., 2., 1., 0.,
        );
        assert_eq_m512(r, e);
    }

    // `_mm512_maskz_min_ps`: an empty mask zeroes the result; with the low eight mask bits
    // set, only the first eight lanes hold the minimum.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_min_ps() {
        let lhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let rhs = _mm512_setr_ps(
            15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
        );

        // Mask 0: every lane must be zero.
        let zeroed = _mm512_maskz_min_ps(0, lhs, rhs);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        // Lanes 0..=7 hold the minimum, lanes 8..=15 are zeroed.
        let got = _mm512_maskz_min_ps(0b00000000_11111111, lhs, rhs);
        let want = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        assert_eq_m512(got, want);
    }

    // `_mm256_mask_min_ps`: an empty mask must hand back `src`; a full mask takes the
    // minimum in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_min_ps() {
        let lhs = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let rhs = _mm256_set_ps(7., 6., 5., 4., 3., 2., 1., 0.);

        // Mask 0: the result must equal the `src` argument (`lhs`).
        let kept = _mm256_mask_min_ps(lhs, 0, lhs, rhs);
        assert_eq_m256(kept, lhs);

        // All eight mask bits set.
        let got = _mm256_mask_min_ps(lhs, 0b11111111, lhs, rhs);
        let want = _mm256_set_ps(0., 1., 2., 3., 3., 2., 1., 0.);
        assert_eq_m256(got, want);
    }

    // `_mm256_maskz_min_ps`: an empty mask zeroes the result; a full mask takes the minimum
    // in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_min_ps() {
        let lhs = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let rhs = _mm256_set_ps(7., 6., 5., 4., 3., 2., 1., 0.);

        // Mask 0: every lane must be zero.
        let zeroed = _mm256_maskz_min_ps(0, lhs, rhs);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        // All eight mask bits set.
        let got = _mm256_maskz_min_ps(0b11111111, lhs, rhs);
        let want = _mm256_set_ps(0., 1., 2., 3., 3., 2., 1., 0.);
        assert_eq_m256(got, want);
    }

    // `_mm_mask_min_ps`: an empty mask must hand back `src`; a four-bit mask takes the
    // minimum in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_min_ps() {
        let lhs = _mm_set_ps(0., 1., 2., 3.);
        let rhs = _mm_set_ps(3., 2., 1., 0.);

        // Mask 0: the result must equal the `src` argument (`lhs`).
        let kept = _mm_mask_min_ps(lhs, 0, lhs, rhs);
        assert_eq_m128(kept, lhs);

        // Low four mask bits set.
        let got = _mm_mask_min_ps(lhs, 0b00001111, lhs, rhs);
        let want = _mm_set_ps(0., 1., 1., 0.);
        assert_eq_m128(got, want);
    }

    // `_mm_maskz_min_ps`: an empty mask zeroes the result; a four-bit mask takes the minimum
    // in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_min_ps() {
        let lhs = _mm_set_ps(0., 1., 2., 3.);
        let rhs = _mm_set_ps(3., 2., 1., 0.);

        // Mask 0: every lane must be zero.
        let zeroed = _mm_maskz_min_ps(0, lhs, rhs);
        assert_eq_m128(zeroed, _mm_setzero_ps());

        // Low four mask bits set.
        let got = _mm_maskz_min_ps(0b00001111, lhs, rhs);
        let want = _mm_set_ps(0., 1., 1., 0.);
        assert_eq_m128(got, want);
    }

    // `_mm512_min_epu32`: lane-wise minimum of an ascending and a descending sequence.
    // NOTE(review): all inputs are small non-negative values, so this cannot tell an
    // unsigned comparison from a signed one.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_min_epu32() {
        let lhs = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let rhs = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
        let got = _mm512_min_epu32(lhs, rhs);
        // `lhs` wins in lanes 0..=7, `rhs` wins in lanes 8..=15.
        let want = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0);
        assert_eq_m512i(got, want);
    }

    // `_mm512_mask_min_epu32`: an empty mask must hand back `src`; a partial mask must take
    // the minimum only in the selected lanes.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_min_epu32() {
        let a = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let b = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

        // Mask 0: the result must equal the `src` argument (`a`).
        let r = _mm512_mask_min_epu32(a, 0, a, b);
        assert_eq_m512i(r, a);

        // Low eight lanes selected. There `a` is already the minimum, so the expected vector
        // happens to equal `src`; on its own this cannot detect a no-op implementation.
        let r = _mm512_mask_min_epu32(a, 0b00000000_11111111, a, b);
        let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m512i(r, e);

        // High eight lanes selected. There `b` is the minimum, so the result must differ
        // from `src`, proving the operation is actually applied.
        let r = _mm512_mask_min_epu32(a, 0b11111111_00000000, a, b);
        let e = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 7, 6, 5, 4, 3, 2, 1, 0);
        assert_eq_m512i(r, e);
    }

    // `_mm512_maskz_min_epu32`: an empty mask zeroes the result; with the low eight mask
    // bits set, only the first eight lanes hold the minimum.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_min_epu32() {
        let lhs = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let rhs = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

        // Mask 0: every lane must be zero.
        let zeroed = _mm512_maskz_min_epu32(0, lhs, rhs);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());

        // Lanes 0..=7 hold the minimum, lanes 8..=15 are zeroed.
        let got = _mm512_maskz_min_epu32(0b00000000_11111111, lhs, rhs);
        let want = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m512i(got, want);
    }

    // `_mm256_mask_min_epu32`: an empty mask must hand back `src`; a full mask takes the
    // minimum in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_min_epu32() {
        let lhs = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let rhs = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);

        // Mask 0: the result must equal the `src` argument (`lhs`).
        let kept = _mm256_mask_min_epu32(lhs, 0, lhs, rhs);
        assert_eq_m256i(kept, lhs);

        // All eight mask bits set.
        let got = _mm256_mask_min_epu32(lhs, 0b11111111, lhs, rhs);
        let want = _mm256_set_epi32(0, 1, 2, 3, 3, 2, 1, 0);
        assert_eq_m256i(got, want);
    }

    // `_mm256_maskz_min_epu32`: an empty mask zeroes the result; a full mask takes the
    // minimum in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_min_epu32() {
        let lhs = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let rhs = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);

        // Mask 0: every lane must be zero.
        let zeroed = _mm256_maskz_min_epu32(0, lhs, rhs);
        assert_eq_m256i(zeroed, _mm256_setzero_si256());

        // All eight mask bits set.
        let got = _mm256_maskz_min_epu32(0b11111111, lhs, rhs);
        let want = _mm256_set_epi32(0, 1, 2, 3, 3, 2, 1, 0);
        assert_eq_m256i(got, want);
    }

    // `_mm_mask_min_epu32`: an empty mask must hand back `src`; a four-bit mask takes the
    // minimum in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_min_epu32() {
        let lhs = _mm_set_epi32(0, 1, 2, 3);
        let rhs = _mm_set_epi32(3, 2, 1, 0);

        // Mask 0: the result must equal the `src` argument (`lhs`).
        let kept = _mm_mask_min_epu32(lhs, 0, lhs, rhs);
        assert_eq_m128i(kept, lhs);

        // Low four mask bits set.
        let got = _mm_mask_min_epu32(lhs, 0b00001111, lhs, rhs);
        let want = _mm_set_epi32(0, 1, 1, 0);
        assert_eq_m128i(got, want);
    }

    // `_mm_maskz_min_epu32`: an empty mask zeroes the result; a four-bit mask takes the
    // minimum in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_min_epu32() {
        let lhs = _mm_set_epi32(0, 1, 2, 3);
        let rhs = _mm_set_epi32(3, 2, 1, 0);

        // Mask 0: every lane must be zero.
        let zeroed = _mm_maskz_min_epu32(0, lhs, rhs);
        assert_eq_m128i(zeroed, _mm_setzero_si128());

        // Low four mask bits set.
        let got = _mm_maskz_min_epu32(0b00001111, lhs, rhs);
        let want = _mm_set_epi32(0, 1, 1, 0);
        assert_eq_m128i(got, want);
    }

    // `_mm512_sqrt_ps`: the square roots of the perfect squares 0², 1², …, 15².
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_sqrt_ps() {
        let squares = _mm512_setr_ps(
            0., 1., 4., 9., 16., 25., 36., 49., 64., 81., 100., 121., 144., 169., 196., 225.,
        );
        let got = _mm512_sqrt_ps(squares);
        let want = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        assert_eq_m512(got, want);
    }

    // `_mm512_mask_sqrt_ps`: an empty mask must hand back `src`; with the low eight mask
    // bits set, only the first eight lanes are replaced by their square roots.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_sqrt_ps() {
        let squares = _mm512_setr_ps(
            0., 1., 4., 9., 16., 25., 36., 49., 64., 81., 100., 121., 144., 169., 196., 225.,
        );

        // Mask 0: the result must equal the `src` argument (`squares`).
        let kept = _mm512_mask_sqrt_ps(squares, 0, squares);
        assert_eq_m512(kept, squares);

        // Lanes 0..=7 hold roots, lanes 8..=15 are copied from `src`.
        let got = _mm512_mask_sqrt_ps(squares, 0b00000000_11111111, squares);
        let want = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 64., 81., 100., 121., 144., 169., 196., 225.,
        );
        assert_eq_m512(got, want);
    }

    // `_mm512_maskz_sqrt_ps`: an empty mask zeroes the result; with the low eight mask bits
    // set, only the first eight lanes hold square roots.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_sqrt_ps() {
        let squares = _mm512_setr_ps(
            0., 1., 4., 9., 16., 25., 36., 49., 64., 81., 100., 121., 144., 169., 196., 225.,
        );

        // Mask 0: every lane must be zero.
        let zeroed = _mm512_maskz_sqrt_ps(0, squares);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        // Lanes 0..=7 hold roots, lanes 8..=15 are zeroed.
        let got = _mm512_maskz_sqrt_ps(0b00000000_11111111, squares);
        let want = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        assert_eq_m512(got, want);
    }

    // `_mm256_mask_sqrt_ps`: an empty mask must hand back `src`; a full mask takes the
    // square root of every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_sqrt_ps() {
        let squares = _mm256_set_ps(0., 1., 4., 9., 16., 25., 36., 49.);

        // Mask 0: the result must equal the `src` argument (`squares`).
        let kept = _mm256_mask_sqrt_ps(squares, 0, squares);
        assert_eq_m256(kept, squares);

        // All eight mask bits set.
        let got = _mm256_mask_sqrt_ps(squares, 0b11111111, squares);
        let want = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        assert_eq_m256(got, want);
    }

    // `_mm256_maskz_sqrt_ps`: an empty mask zeroes the result; a full mask takes the square
    // root of every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_sqrt_ps() {
        let squares = _mm256_set_ps(0., 1., 4., 9., 16., 25., 36., 49.);

        // Mask 0: every lane must be zero.
        let zeroed = _mm256_maskz_sqrt_ps(0, squares);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        // All eight mask bits set.
        let got = _mm256_maskz_sqrt_ps(0b11111111, squares);
        let want = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        assert_eq_m256(got, want);
    }

    // `_mm_mask_sqrt_ps`: an empty mask must hand back `src`; a four-bit mask takes the
    // square root of every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_sqrt_ps() {
        let squares = _mm_set_ps(0., 1., 4., 9.);

        // Mask 0: the result must equal the `src` argument (`squares`).
        let kept = _mm_mask_sqrt_ps(squares, 0, squares);
        assert_eq_m128(kept, squares);

        // Low four mask bits set.
        let got = _mm_mask_sqrt_ps(squares, 0b00001111, squares);
        let want = _mm_set_ps(0., 1., 2., 3.);
        assert_eq_m128(got, want);
    }

    // `_mm_maskz_sqrt_ps`: an empty mask zeroes the result; a four-bit mask takes the square
    // root of every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_sqrt_ps() {
        let squares = _mm_set_ps(0., 1., 4., 9.);

        // Mask 0: every lane must be zero.
        let zeroed = _mm_maskz_sqrt_ps(0, squares);
        assert_eq_m128(zeroed, _mm_setzero_ps());

        // Low four mask bits set.
        let got = _mm_maskz_sqrt_ps(0b00001111, squares);
        let want = _mm_set_ps(0., 1., 2., 3.);
        assert_eq_m128(got, want);
    }

    // `_mm512_fmadd_ps`: with a multiplier of 1 and an addend of 1, each lane of the result
    // is expected to be the matching ramp value plus one.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_fmadd_ps() {
        let ones = _mm512_set1_ps(1.);
        let ramp = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let addend = _mm512_set1_ps(1.);
        let got = _mm512_fmadd_ps(ones, ramp, addend);
        let want = _mm512_setr_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        assert_eq_m512(got, want);
    }

    // `_mm512_mask_fmadd_ps`: an empty mask must hand back the first operand; with the low
    // eight mask bits set, only the first eight lanes hold `ones * ramp + addend`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_fmadd_ps() {
        let ones = _mm512_set1_ps(1.);
        let ramp = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let addend = _mm512_set1_ps(1.);

        // Mask 0: the result must equal the first operand (`ones`).
        let kept = _mm512_mask_fmadd_ps(ones, 0, ramp, addend);
        assert_eq_m512(kept, ones);

        // Lanes 0..=7 are computed, lanes 8..=15 keep the first operand's value (1.0).
        let got = _mm512_mask_fmadd_ps(ones, 0b00000000_11111111, ramp, addend);
        let want = _mm512_setr_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 1., 1., 1., 1., 1., 1., 1., 1.,
        );
        assert_eq_m512(got, want);
    }

    // `_mm512_maskz_fmadd_ps`: an empty mask zeroes the result; with the low eight mask bits
    // set, only the first eight lanes hold `ones * ramp + addend`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_fmadd_ps() {
        let ones = _mm512_set1_ps(1.);
        let ramp = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let addend = _mm512_set1_ps(1.);

        // Mask 0: every lane must be zero.
        let zeroed = _mm512_maskz_fmadd_ps(0, ones, ramp, addend);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        // Lanes 0..=7 are computed, lanes 8..=15 are zeroed.
        let got = _mm512_maskz_fmadd_ps(0b00000000_11111111, ones, ramp, addend);
        let want = _mm512_setr_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        assert_eq_m512(got, want);
    }

    // `_mm512_mask3_fmadd_ps`: an empty mask must return the third operand untouched; with the
    // low eight mask bits set, the first eight `setr` lanes are expected to be `rhs + 2`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask3_fmadd_ps() {
        let lhs = _mm512_set1_ps(1.);
        let rhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let term = _mm512_set1_ps(2.);

        let untouched = _mm512_mask3_fmadd_ps(lhs, rhs, term, 0);
        assert_eq_m512(untouched, term);

        let expected = _mm512_setr_ps(
            2., 3., 4., 5., 6., 7., 8., 9., 2., 2., 2., 2., 2., 2., 2., 2.,
        );
        let actual = _mm512_mask3_fmadd_ps(lhs, rhs, term, 0b00000000_11111111);
        assert_eq_m512(actual, expected);
    }

    // `_mm256_mask_fmadd_ps`: an empty mask must return the first operand untouched; with all
    // eight mask bits set, every lane is expected to be `rhs + 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_fmadd_ps() {
        let lhs = _mm256_set1_ps(1.);
        let rhs = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let term = _mm256_set1_ps(1.);

        let untouched = _mm256_mask_fmadd_ps(lhs, 0, rhs, term);
        assert_eq_m256(untouched, lhs);

        let expected = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let actual = _mm256_mask_fmadd_ps(lhs, 0b11111111, rhs, term);
        assert_eq_m256(actual, expected);
    }

    // `_mm256_maskz_fmadd_ps`: an empty mask must give all zeros; with all eight mask bits set,
    // every lane is expected to be `rhs + 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_fmadd_ps() {
        let lhs = _mm256_set1_ps(1.);
        let rhs = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let term = _mm256_set1_ps(1.);

        let zeroed = _mm256_maskz_fmadd_ps(0, lhs, rhs, term);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        let expected = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let actual = _mm256_maskz_fmadd_ps(0b11111111, lhs, rhs, term);
        assert_eq_m256(actual, expected);
    }

    // `_mm256_mask3_fmadd_ps`: an empty mask must return the third operand untouched; with all
    // eight mask bits set, every lane is expected to be `rhs + 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask3_fmadd_ps() {
        let lhs = _mm256_set1_ps(1.);
        let rhs = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let term = _mm256_set1_ps(1.);

        let untouched = _mm256_mask3_fmadd_ps(lhs, rhs, term, 0);
        assert_eq_m256(untouched, term);

        let expected = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let actual = _mm256_mask3_fmadd_ps(lhs, rhs, term, 0b11111111);
        assert_eq_m256(actual, expected);
    }

    // `_mm_mask_fmadd_ps`: an empty mask must return the first operand untouched; with the low
    // four mask bits set, every lane is expected to be `rhs + 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_fmadd_ps() {
        let lhs = _mm_set1_ps(1.);
        let rhs = _mm_set_ps(0., 1., 2., 3.);
        let term = _mm_set1_ps(1.);

        let untouched = _mm_mask_fmadd_ps(lhs, 0, rhs, term);
        assert_eq_m128(untouched, lhs);

        let expected = _mm_set_ps(1., 2., 3., 4.);
        let actual = _mm_mask_fmadd_ps(lhs, 0b00001111, rhs, term);
        assert_eq_m128(actual, expected);
    }

    // `_mm_maskz_fmadd_ps`: an empty mask must give all zeros; with the low four mask bits set,
    // every lane is expected to be `rhs + 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_fmadd_ps() {
        let lhs = _mm_set1_ps(1.);
        let rhs = _mm_set_ps(0., 1., 2., 3.);
        let term = _mm_set1_ps(1.);

        let zeroed = _mm_maskz_fmadd_ps(0, lhs, rhs, term);
        assert_eq_m128(zeroed, _mm_setzero_ps());

        let expected = _mm_set_ps(1., 2., 3., 4.);
        let actual = _mm_maskz_fmadd_ps(0b00001111, lhs, rhs, term);
        assert_eq_m128(actual, expected);
    }

    // `_mm_mask3_fmadd_ps`: an empty mask must return the third operand untouched; with the low
    // four mask bits set, every lane is expected to be `rhs + 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask3_fmadd_ps() {
        let lhs = _mm_set1_ps(1.);
        let rhs = _mm_set_ps(0., 1., 2., 3.);
        let term = _mm_set1_ps(1.);

        let untouched = _mm_mask3_fmadd_ps(lhs, rhs, term, 0);
        assert_eq_m128(untouched, term);

        let expected = _mm_set_ps(1., 2., 3., 4.);
        let actual = _mm_mask3_fmadd_ps(lhs, rhs, term, 0b00001111);
        assert_eq_m128(actual, expected);
    }

    // `_mm512_fmsub_ps` with the first and third operands all ones: every lane is expected to
    // be `rhs - 1`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_fmsub_ps() {
        let lhs = _mm512_setr_ps(
            1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        );
        let rhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let term = _mm512_setr_ps(
            1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        );

        let expected = _mm512_setr_ps(
            -1., 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14.,
        );
        let actual = _mm512_fmsub_ps(lhs, rhs, term);
        assert_eq_m512(actual, expected);
    }

    // `_mm512_mask_fmsub_ps`: an empty mask must return the first operand untouched; with the
    // low eight mask bits set, only the first eight `setr` lanes are expected to be `rhs - 1`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_fmsub_ps() {
        let lhs = _mm512_set1_ps(1.);
        let rhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let term = _mm512_set1_ps(1.);

        let untouched = _mm512_mask_fmsub_ps(lhs, 0, rhs, term);
        assert_eq_m512(untouched, lhs);

        let expected = _mm512_setr_ps(
            -1., 0., 1., 2., 3., 4., 5., 6., 1., 1., 1., 1., 1., 1., 1., 1.,
        );
        let actual = _mm512_mask_fmsub_ps(lhs, 0b00000000_11111111, rhs, term);
        assert_eq_m512(actual, expected);
    }

    // `_mm512_maskz_fmsub_ps`: an empty mask must give all zeros; with the low eight mask bits
    // set, the first eight `setr` lanes are expected to be `rhs - 1` and the rest zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_fmsub_ps() {
        let lhs = _mm512_set1_ps(1.);
        let rhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let term = _mm512_set1_ps(1.);

        let zeroed = _mm512_maskz_fmsub_ps(0, lhs, rhs, term);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        let expected = _mm512_setr_ps(
            -1., 0., 1., 2., 3., 4., 5., 6., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        let actual = _mm512_maskz_fmsub_ps(0b00000000_11111111, lhs, rhs, term);
        assert_eq_m512(actual, expected);
    }

    // `_mm512_mask3_fmsub_ps`: an empty mask must return the third operand untouched; with the
    // low eight mask bits set, the first eight `setr` lanes are expected to be `rhs - 1` while
    // the remaining lanes keep the third operand's value of 2.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask3_fmsub_ps() {
        let lhs = _mm512_set1_ps(1.);
        let rhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let term = _mm512_setr_ps(
            1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2.,
        );

        let untouched = _mm512_mask3_fmsub_ps(lhs, rhs, term, 0);
        assert_eq_m512(untouched, term);

        let expected = _mm512_setr_ps(
            -1., 0., 1., 2., 3., 4., 5., 6., 2., 2., 2., 2., 2., 2., 2., 2.,
        );
        let actual = _mm512_mask3_fmsub_ps(lhs, rhs, term, 0b00000000_11111111);
        assert_eq_m512(actual, expected);
    }

    // `_mm256_mask_fmsub_ps`: an empty mask must return the first operand untouched; with all
    // eight mask bits set, every lane is expected to be `rhs - 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_fmsub_ps() {
        let lhs = _mm256_set1_ps(1.);
        let rhs = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let term = _mm256_set1_ps(1.);

        let untouched = _mm256_mask_fmsub_ps(lhs, 0, rhs, term);
        assert_eq_m256(untouched, lhs);

        let expected = _mm256_set_ps(-1., 0., 1., 2., 3., 4., 5., 6.);
        let actual = _mm256_mask_fmsub_ps(lhs, 0b11111111, rhs, term);
        assert_eq_m256(actual, expected);
    }

    // `_mm256_maskz_fmsub_ps`: an empty mask must give all zeros; with all eight mask bits set,
    // every lane is expected to be `rhs - 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_fmsub_ps() {
        let lhs = _mm256_set1_ps(1.);
        let rhs = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let term = _mm256_set1_ps(1.);

        let zeroed = _mm256_maskz_fmsub_ps(0, lhs, rhs, term);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        let expected = _mm256_set_ps(-1., 0., 1., 2., 3., 4., 5., 6.);
        let actual = _mm256_maskz_fmsub_ps(0b11111111, lhs, rhs, term);
        assert_eq_m256(actual, expected);
    }

    // `_mm256_mask3_fmsub_ps`: an empty mask must return the third operand untouched; with all
    // eight mask bits set, every lane is expected to be `rhs - 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask3_fmsub_ps() {
        let lhs = _mm256_set1_ps(1.);
        let rhs = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let term = _mm256_set1_ps(1.);

        let untouched = _mm256_mask3_fmsub_ps(lhs, rhs, term, 0);
        assert_eq_m256(untouched, term);

        let expected = _mm256_set_ps(-1., 0., 1., 2., 3., 4., 5., 6.);
        let actual = _mm256_mask3_fmsub_ps(lhs, rhs, term, 0b11111111);
        assert_eq_m256(actual, expected);
    }

    // `_mm_mask_fmsub_ps`: an empty mask must return the first operand untouched; with the low
    // four mask bits set, every lane is expected to be `rhs - 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_fmsub_ps() {
        let lhs = _mm_set1_ps(1.);
        let rhs = _mm_set_ps(0., 1., 2., 3.);
        let term = _mm_set1_ps(1.);

        let untouched = _mm_mask_fmsub_ps(lhs, 0, rhs, term);
        assert_eq_m128(untouched, lhs);

        let expected = _mm_set_ps(-1., 0., 1., 2.);
        let actual = _mm_mask_fmsub_ps(lhs, 0b00001111, rhs, term);
        assert_eq_m128(actual, expected);
    }

    // `_mm_maskz_fmsub_ps`: an empty mask must give all zeros; with the low four mask bits set,
    // every lane is expected to be `rhs - 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_fmsub_ps() {
        let lhs = _mm_set1_ps(1.);
        let rhs = _mm_set_ps(0., 1., 2., 3.);
        let term = _mm_set1_ps(1.);

        let zeroed = _mm_maskz_fmsub_ps(0, lhs, rhs, term);
        assert_eq_m128(zeroed, _mm_setzero_ps());

        let expected = _mm_set_ps(-1., 0., 1., 2.);
        let actual = _mm_maskz_fmsub_ps(0b00001111, lhs, rhs, term);
        assert_eq_m128(actual, expected);
    }

    // `_mm_mask3_fmsub_ps`: an empty mask must return the third operand untouched; with the low
    // four mask bits set, every lane is expected to be `rhs - 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask3_fmsub_ps() {
        let lhs = _mm_set1_ps(1.);
        let rhs = _mm_set_ps(0., 1., 2., 3.);
        let term = _mm_set1_ps(1.);

        let untouched = _mm_mask3_fmsub_ps(lhs, rhs, term, 0);
        assert_eq_m128(untouched, term);

        let expected = _mm_set_ps(-1., 0., 1., 2.);
        let actual = _mm_mask3_fmsub_ps(lhs, rhs, term, 0b00001111);
        assert_eq_m128(actual, expected);
    }

    // `_mm512_fmaddsub_ps` with the first and third operands all ones: in `setr` order the
    // expected lanes alternate between `rhs - 1` and `rhs + 1`, starting with the subtraction.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_fmaddsub_ps() {
        let lhs = _mm512_set1_ps(1.);
        let rhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let term = _mm512_set1_ps(1.);

        let expected = _mm512_setr_ps(
            -1., 2., 1., 4., 3., 6., 5., 8., 7., 10., 9., 12., 11., 14., 13., 16.,
        );
        let actual = _mm512_fmaddsub_ps(lhs, rhs, term);
        assert_eq_m512(actual, expected);
    }

    // `_mm512_mask_fmaddsub_ps`: an empty mask must return the first operand untouched; with
    // the low eight mask bits set, only the first eight `setr` lanes are expected to alternate
    // between `rhs - 1` and `rhs + 1`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_fmaddsub_ps() {
        let lhs = _mm512_set1_ps(1.);
        let rhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let term = _mm512_set1_ps(1.);

        let untouched = _mm512_mask_fmaddsub_ps(lhs, 0, rhs, term);
        assert_eq_m512(untouched, lhs);

        let expected = _mm512_setr_ps(
            -1., 2., 1., 4., 3., 6., 5., 8., 1., 1., 1., 1., 1., 1., 1., 1.,
        );
        let actual = _mm512_mask_fmaddsub_ps(lhs, 0b00000000_11111111, rhs, term);
        assert_eq_m512(actual, expected);
    }

    // `_mm512_maskz_fmaddsub_ps`: an empty mask must give all zeros; with the low eight mask
    // bits set, the first eight `setr` lanes are expected to alternate between `rhs - 1` and
    // `rhs + 1` and the rest to be zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_fmaddsub_ps() {
        let lhs = _mm512_set1_ps(1.);
        let rhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let term = _mm512_set1_ps(1.);

        let zeroed = _mm512_maskz_fmaddsub_ps(0, lhs, rhs, term);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        let expected = _mm512_setr_ps(
            -1., 2., 1., 4., 3., 6., 5., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        let actual = _mm512_maskz_fmaddsub_ps(0b00000000_11111111, lhs, rhs, term);
        assert_eq_m512(actual, expected);
    }

    // `_mm512_mask3_fmaddsub_ps`: an empty mask must return the third operand untouched; with
    // the low eight mask bits set, the first eight `setr` lanes are expected to alternate
    // between `rhs - 1` and `rhs + 1` while the remaining lanes keep the third operand's 2.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask3_fmaddsub_ps() {
        let lhs = _mm512_set1_ps(1.);
        let rhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let term = _mm512_setr_ps(
            1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2.,
        );

        let untouched = _mm512_mask3_fmaddsub_ps(lhs, rhs, term, 0);
        assert_eq_m512(untouched, term);

        let expected = _mm512_setr_ps(
            -1., 2., 1., 4., 3., 6., 5., 8., 2., 2., 2., 2., 2., 2., 2., 2.,
        );
        let actual = _mm512_mask3_fmaddsub_ps(lhs, rhs, term, 0b00000000_11111111);
        assert_eq_m512(actual, expected);
    }

    // `_mm256_mask_fmaddsub_ps`: an empty mask must return the first operand untouched; with
    // all eight mask bits set, the expected values (in `set_ps` argument order) alternate
    // between `rhs + 1` and `rhs - 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_fmaddsub_ps() {
        let lhs = _mm256_set1_ps(1.);
        let rhs = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let term = _mm256_set1_ps(1.);

        let untouched = _mm256_mask_fmaddsub_ps(lhs, 0, rhs, term);
        assert_eq_m256(untouched, lhs);

        let expected = _mm256_set_ps(1., 0., 3., 2., 5., 4., 7., 6.);
        let actual = _mm256_mask_fmaddsub_ps(lhs, 0b11111111, rhs, term);
        assert_eq_m256(actual, expected);
    }

    // `_mm256_maskz_fmaddsub_ps`: an empty mask must give all zeros; with all eight mask bits
    // set, the expected values (in `set_ps` argument order) alternate between `rhs + 1` and
    // `rhs - 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_fmaddsub_ps() {
        let lhs = _mm256_set1_ps(1.);
        let rhs = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let term = _mm256_set1_ps(1.);

        let zeroed = _mm256_maskz_fmaddsub_ps(0, lhs, rhs, term);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        let expected = _mm256_set_ps(1., 0., 3., 2., 5., 4., 7., 6.);
        let actual = _mm256_maskz_fmaddsub_ps(0b11111111, lhs, rhs, term);
        assert_eq_m256(actual, expected);
    }

    // `_mm256_mask3_fmaddsub_ps`: an empty mask must return the third operand untouched; with
    // all eight mask bits set, the expected values (in `set_ps` argument order) alternate
    // between `rhs + 1` and `rhs - 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask3_fmaddsub_ps() {
        let lhs = _mm256_set1_ps(1.);
        let rhs = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let term = _mm256_set1_ps(1.);

        let untouched = _mm256_mask3_fmaddsub_ps(lhs, rhs, term, 0);
        assert_eq_m256(untouched, term);

        let expected = _mm256_set_ps(1., 0., 3., 2., 5., 4., 7., 6.);
        let actual = _mm256_mask3_fmaddsub_ps(lhs, rhs, term, 0b11111111);
        assert_eq_m256(actual, expected);
    }

    // `_mm_mask_fmaddsub_ps`: an empty mask must return the first operand untouched; with the
    // low four mask bits set, the expected values (in `set_ps` argument order) alternate
    // between `rhs + 1` and `rhs - 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_fmaddsub_ps() {
        let lhs = _mm_set1_ps(1.);
        let rhs = _mm_set_ps(0., 1., 2., 3.);
        let term = _mm_set1_ps(1.);

        let untouched = _mm_mask_fmaddsub_ps(lhs, 0, rhs, term);
        assert_eq_m128(untouched, lhs);

        let expected = _mm_set_ps(1., 0., 3., 2.);
        let actual = _mm_mask_fmaddsub_ps(lhs, 0b00001111, rhs, term);
        assert_eq_m128(actual, expected);
    }

    // `_mm_maskz_fmaddsub_ps`: an empty mask must give all zeros; with the low four mask bits
    // set, the expected values (in `set_ps` argument order) alternate between `rhs + 1` and
    // `rhs - 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_fmaddsub_ps() {
        let lhs = _mm_set1_ps(1.);
        let rhs = _mm_set_ps(0., 1., 2., 3.);
        let term = _mm_set1_ps(1.);

        let zeroed = _mm_maskz_fmaddsub_ps(0, lhs, rhs, term);
        assert_eq_m128(zeroed, _mm_setzero_ps());

        let expected = _mm_set_ps(1., 0., 3., 2.);
        let actual = _mm_maskz_fmaddsub_ps(0b00001111, lhs, rhs, term);
        assert_eq_m128(actual, expected);
    }

    // `_mm_mask3_fmaddsub_ps`: an empty mask must return the third operand untouched; with the
    // low four mask bits set, the expected values (in `set_ps` argument order) alternate
    // between `rhs + 1` and `rhs - 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask3_fmaddsub_ps() {
        let lhs = _mm_set1_ps(1.);
        let rhs = _mm_set_ps(0., 1., 2., 3.);
        let term = _mm_set1_ps(1.);

        let untouched = _mm_mask3_fmaddsub_ps(lhs, rhs, term, 0);
        assert_eq_m128(untouched, term);

        let expected = _mm_set_ps(1., 0., 3., 2.);
        let actual = _mm_mask3_fmaddsub_ps(lhs, rhs, term, 0b00001111);
        assert_eq_m128(actual, expected);
    }

    // `_mm512_fmsubadd_ps` with the first and third operands all ones: in `setr` order the
    // expected lanes alternate between `rhs + 1` and `rhs - 1`, starting with the addition.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_fmsubadd_ps() {
        let lhs = _mm512_setr_ps(
            1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        );
        let rhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let term = _mm512_setr_ps(
            1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        );

        let expected = _mm512_setr_ps(
            1., 0., 3., 2., 5., 4., 7., 6., 9., 8., 11., 10., 13., 12., 15., 14.,
        );
        let actual = _mm512_fmsubadd_ps(lhs, rhs, term);
        assert_eq_m512(actual, expected);
    }

    // `_mm512_mask_fmsubadd_ps`: an empty mask must return the first operand untouched; with
    // the low eight mask bits set, only the first eight `setr` lanes are expected to alternate
    // between `rhs + 1` and `rhs - 1`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_fmsubadd_ps() {
        let lhs = _mm512_set1_ps(1.);
        let rhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let term = _mm512_set1_ps(1.);

        let untouched = _mm512_mask_fmsubadd_ps(lhs, 0, rhs, term);
        assert_eq_m512(untouched, lhs);

        let expected = _mm512_setr_ps(
            1., 0., 3., 2., 5., 4., 7., 6., 1., 1., 1., 1., 1., 1., 1., 1.,
        );
        let actual = _mm512_mask_fmsubadd_ps(lhs, 0b00000000_11111111, rhs, term);
        assert_eq_m512(actual, expected);
    }

    // `_mm512_maskz_fmsubadd_ps`: an empty mask must give all zeros; with the low eight mask
    // bits set, the first eight `setr` lanes are expected to alternate between `rhs + 1` and
    // `rhs - 1` and the rest to be zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_fmsubadd_ps() {
        let lhs = _mm512_set1_ps(1.);
        let rhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let term = _mm512_set1_ps(1.);

        let zeroed = _mm512_maskz_fmsubadd_ps(0, lhs, rhs, term);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        let expected = _mm512_setr_ps(
            1., 0., 3., 2., 5., 4., 7., 6., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        let actual = _mm512_maskz_fmsubadd_ps(0b00000000_11111111, lhs, rhs, term);
        assert_eq_m512(actual, expected);
    }

    // `_mm512_mask3_fmsubadd_ps`: an empty mask must return the third operand untouched; with
    // the low eight mask bits set, the first eight `setr` lanes are expected to alternate
    // between `rhs + 1` and `rhs - 1` while the remaining lanes keep the third operand's 2.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask3_fmsubadd_ps() {
        let lhs = _mm512_set1_ps(1.);
        let rhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let term = _mm512_setr_ps(
            1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2.,
        );

        let untouched = _mm512_mask3_fmsubadd_ps(lhs, rhs, term, 0);
        assert_eq_m512(untouched, term);

        let expected = _mm512_setr_ps(
            1., 0., 3., 2., 5., 4., 7., 6., 2., 2., 2., 2., 2., 2., 2., 2.,
        );
        let actual = _mm512_mask3_fmsubadd_ps(lhs, rhs, term, 0b00000000_11111111);
        assert_eq_m512(actual, expected);
    }

    // `_mm256_mask_fmsubadd_ps`: an empty mask must return the first operand untouched; with
    // all eight mask bits set, the expected values (in `set_ps` argument order) alternate
    // between `rhs - 1` and `rhs + 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_fmsubadd_ps() {
        let lhs = _mm256_set1_ps(1.);
        let rhs = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let term = _mm256_set1_ps(1.);

        let untouched = _mm256_mask_fmsubadd_ps(lhs, 0, rhs, term);
        assert_eq_m256(untouched, lhs);

        let expected = _mm256_set_ps(-1., 2., 1., 4., 3., 6., 5., 8.);
        let actual = _mm256_mask_fmsubadd_ps(lhs, 0b11111111, rhs, term);
        assert_eq_m256(actual, expected);
    }

    // `_mm256_maskz_fmsubadd_ps`: an empty mask must give all zeros; with all eight mask bits
    // set, the expected values (in `set_ps` argument order) alternate between `rhs - 1` and
    // `rhs + 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_fmsubadd_ps() {
        let lhs = _mm256_set1_ps(1.);
        let rhs = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let term = _mm256_set1_ps(1.);

        let zeroed = _mm256_maskz_fmsubadd_ps(0, lhs, rhs, term);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        let expected = _mm256_set_ps(-1., 2., 1., 4., 3., 6., 5., 8.);
        let actual = _mm256_maskz_fmsubadd_ps(0b11111111, lhs, rhs, term);
        assert_eq_m256(actual, expected);
    }

    // `_mm256_mask3_fmsubadd_ps`: an empty mask must return the third operand untouched; with
    // all eight mask bits set, the expected values (in `set_ps` argument order) alternate
    // between `rhs - 1` and `rhs + 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask3_fmsubadd_ps() {
        let lhs = _mm256_set1_ps(1.);
        let rhs = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let term = _mm256_set1_ps(1.);

        let untouched = _mm256_mask3_fmsubadd_ps(lhs, rhs, term, 0);
        assert_eq_m256(untouched, term);

        let expected = _mm256_set_ps(-1., 2., 1., 4., 3., 6., 5., 8.);
        let actual = _mm256_mask3_fmsubadd_ps(lhs, rhs, term, 0b11111111);
        assert_eq_m256(actual, expected);
    }

    // `_mm_mask_fmsubadd_ps`: an empty mask must return the first operand untouched; with the
    // low four mask bits set, the expected values (in `set_ps` argument order) alternate
    // between `rhs - 1` and `rhs + 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_fmsubadd_ps() {
        let lhs = _mm_set1_ps(1.);
        let rhs = _mm_set_ps(0., 1., 2., 3.);
        let term = _mm_set1_ps(1.);

        let untouched = _mm_mask_fmsubadd_ps(lhs, 0, rhs, term);
        assert_eq_m128(untouched, lhs);

        let expected = _mm_set_ps(-1., 2., 1., 4.);
        let actual = _mm_mask_fmsubadd_ps(lhs, 0b00001111, rhs, term);
        assert_eq_m128(actual, expected);
    }

    // `_mm_maskz_fmsubadd_ps`: an empty mask must give all zeros; with the low four mask bits
    // set, the expected values (in `set_ps` argument order) alternate between `rhs - 1` and
    // `rhs + 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_fmsubadd_ps() {
        let lhs = _mm_set1_ps(1.);
        let rhs = _mm_set_ps(0., 1., 2., 3.);
        let term = _mm_set1_ps(1.);

        let zeroed = _mm_maskz_fmsubadd_ps(0, lhs, rhs, term);
        assert_eq_m128(zeroed, _mm_setzero_ps());

        let expected = _mm_set_ps(-1., 2., 1., 4.);
        let actual = _mm_maskz_fmsubadd_ps(0b00001111, lhs, rhs, term);
        assert_eq_m128(actual, expected);
    }

    // `_mm_mask3_fmsubadd_ps`: an empty mask must return the third operand untouched; with the
    // low four mask bits set, the expected values (in `set_ps` argument order) alternate
    // between `rhs - 1` and `rhs + 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask3_fmsubadd_ps() {
        let lhs = _mm_set1_ps(1.);
        let rhs = _mm_set_ps(0., 1., 2., 3.);
        let term = _mm_set1_ps(1.);

        let untouched = _mm_mask3_fmsubadd_ps(lhs, rhs, term, 0);
        assert_eq_m128(untouched, term);

        let expected = _mm_set_ps(-1., 2., 1., 4.);
        let actual = _mm_mask3_fmsubadd_ps(lhs, rhs, term, 0b00001111);
        assert_eq_m128(actual, expected);
    }

    // `_mm512_fnmadd_ps` with the first and third operands all ones: every lane is expected to
    // be `1 - rhs`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_fnmadd_ps() {
        let lhs = _mm512_set1_ps(1.);
        let rhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let term = _mm512_set1_ps(1.);

        let expected = _mm512_setr_ps(
            1., 0., -1., -2., -3., -4., -5., -6., -7., -8., -9., -10., -11., -12., -13., -14.,
        );
        let actual = _mm512_fnmadd_ps(lhs, rhs, term);
        assert_eq_m512(actual, expected);
    }

    // `_mm512_mask_fnmadd_ps`: an empty mask must return the first operand untouched; with the
    // low eight mask bits set, only the first eight `setr` lanes are expected to be `1 - rhs`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_fnmadd_ps() {
        let lhs = _mm512_set1_ps(1.);
        let rhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let term = _mm512_set1_ps(1.);

        let untouched = _mm512_mask_fnmadd_ps(lhs, 0, rhs, term);
        assert_eq_m512(untouched, lhs);

        let expected = _mm512_setr_ps(
            1., 0., -1., -2., -3., -4., -5., -6., 1., 1., 1., 1., 1., 1., 1., 1.,
        );
        let actual = _mm512_mask_fnmadd_ps(lhs, 0b00000000_11111111, rhs, term);
        assert_eq_m512(actual, expected);
    }

    // `_mm512_maskz_fnmadd_ps`: an empty mask must give all zeros; with the low eight mask bits
    // set, the first eight `setr` lanes are expected to be `1 - rhs` and the rest zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_fnmadd_ps() {
        let lhs = _mm512_set1_ps(1.);
        let rhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let term = _mm512_set1_ps(1.);

        let zeroed = _mm512_maskz_fnmadd_ps(0, lhs, rhs, term);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        let expected = _mm512_setr_ps(
            1., 0., -1., -2., -3., -4., -5., -6., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        let actual = _mm512_maskz_fnmadd_ps(0b00000000_11111111, lhs, rhs, term);
        assert_eq_m512(actual, expected);
    }

    // `_mm512_mask3_fnmadd_ps`: an empty mask must return the third operand untouched; with the
    // low eight mask bits set, the first eight `setr` lanes are expected to be `1 - rhs` while
    // the remaining lanes keep the third operand's value of 2.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask3_fnmadd_ps() {
        let lhs = _mm512_set1_ps(1.);
        let rhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let term = _mm512_setr_ps(
            1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2.,
        );

        let untouched = _mm512_mask3_fnmadd_ps(lhs, rhs, term, 0);
        assert_eq_m512(untouched, term);

        let expected = _mm512_setr_ps(
            1., 0., -1., -2., -3., -4., -5., -6., 2., 2., 2., 2., 2., 2., 2., 2.,
        );
        let actual = _mm512_mask3_fnmadd_ps(lhs, rhs, term, 0b00000000_11111111);
        assert_eq_m512(actual, expected);
    }

    // `_mm256_mask_fnmadd_ps`: an empty mask must return the first operand untouched; with all
    // eight mask bits set, every lane is expected to be `1 - rhs`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_fnmadd_ps() {
        let lhs = _mm256_set1_ps(1.);
        let rhs = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let term = _mm256_set1_ps(1.);

        let untouched = _mm256_mask_fnmadd_ps(lhs, 0, rhs, term);
        assert_eq_m256(untouched, lhs);

        let expected = _mm256_set_ps(1., 0., -1., -2., -3., -4., -5., -6.);
        let actual = _mm256_mask_fnmadd_ps(lhs, 0b11111111, rhs, term);
        assert_eq_m256(actual, expected);
    }

    // `_mm256_maskz_fnmadd_ps`: an empty mask must give all zeros; with all eight mask bits
    // set, every lane is expected to be `1 - rhs`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_fnmadd_ps() {
        let lhs = _mm256_set1_ps(1.);
        let rhs = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let term = _mm256_set1_ps(1.);

        let zeroed = _mm256_maskz_fnmadd_ps(0, lhs, rhs, term);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        let expected = _mm256_set_ps(1., 0., -1., -2., -3., -4., -5., -6.);
        let actual = _mm256_maskz_fnmadd_ps(0b11111111, lhs, rhs, term);
        assert_eq_m256(actual, expected);
    }

    // `_mm256_mask3_fnmadd_ps`: an empty mask must return the third operand untouched; with all
    // eight mask bits set, every lane is expected to be `1 - rhs`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask3_fnmadd_ps() {
        let lhs = _mm256_set1_ps(1.);
        let rhs = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let term = _mm256_set1_ps(1.);

        let untouched = _mm256_mask3_fnmadd_ps(lhs, rhs, term, 0);
        assert_eq_m256(untouched, term);

        let expected = _mm256_set_ps(1., 0., -1., -2., -3., -4., -5., -6.);
        let actual = _mm256_mask3_fnmadd_ps(lhs, rhs, term, 0b11111111);
        assert_eq_m256(actual, expected);
    }

    // `_mm_mask_fnmadd_ps`: an empty mask must return the first operand untouched; with the low
    // four mask bits set, every lane is expected to be `1 - rhs`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_fnmadd_ps() {
        let lhs = _mm_set1_ps(1.);
        let rhs = _mm_set_ps(0., 1., 2., 3.);
        let term = _mm_set1_ps(1.);

        let untouched = _mm_mask_fnmadd_ps(lhs, 0, rhs, term);
        assert_eq_m128(untouched, lhs);

        let expected = _mm_set_ps(1., 0., -1., -2.);
        let actual = _mm_mask_fnmadd_ps(lhs, 0b00001111, rhs, term);
        assert_eq_m128(actual, expected);
    }

    // `_mm_maskz_fnmadd_ps`: an empty mask must give all zeros; with the low four mask bits
    // set, every lane is expected to be `1 - rhs`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_fnmadd_ps() {
        let lhs = _mm_set1_ps(1.);
        let rhs = _mm_set_ps(0., 1., 2., 3.);
        let term = _mm_set1_ps(1.);

        let zeroed = _mm_maskz_fnmadd_ps(0, lhs, rhs, term);
        assert_eq_m128(zeroed, _mm_setzero_ps());

        let expected = _mm_set_ps(1., 0., -1., -2.);
        let actual = _mm_maskz_fnmadd_ps(0b00001111, lhs, rhs, term);
        assert_eq_m128(actual, expected);
    }

    // `_mm_mask3_fnmadd_ps`: an empty mask must return the third operand untouched; with the
    // low four mask bits set, every lane is expected to be `1 - rhs`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask3_fnmadd_ps() {
        let lhs = _mm_set1_ps(1.);
        let rhs = _mm_set_ps(0., 1., 2., 3.);
        let term = _mm_set1_ps(1.);

        let untouched = _mm_mask3_fnmadd_ps(lhs, rhs, term, 0);
        assert_eq_m128(untouched, term);

        let expected = _mm_set_ps(1., 0., -1., -2.);
        let actual = _mm_mask3_fnmadd_ps(lhs, rhs, term, 0b00001111);
        assert_eq_m128(actual, expected);
    }

    // `_mm512_fnmsub_ps` with the first and third operands all ones: every lane is expected to
    // be `-rhs - 1`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_fnmsub_ps() {
        let lhs = _mm512_set1_ps(1.);
        let rhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let term = _mm512_set1_ps(1.);

        let expected = _mm512_setr_ps(
            -1., -2., -3., -4., -5., -6., -7., -8., -9., -10., -11., -12., -13., -14., -15., -16.,
        );
        let actual = _mm512_fnmsub_ps(lhs, rhs, term);
        assert_eq_m512(actual, expected);
    }

    // `_mm512_mask_fnmsub_ps`: an empty mask must return the first operand untouched; with the
    // low eight mask bits set, only the first eight `setr` lanes are expected to be `-rhs - 1`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_fnmsub_ps() {
        let lhs = _mm512_set1_ps(1.);
        let rhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let term = _mm512_set1_ps(1.);

        let untouched = _mm512_mask_fnmsub_ps(lhs, 0, rhs, term);
        assert_eq_m512(untouched, lhs);

        let expected = _mm512_setr_ps(
            -1., -2., -3., -4., -5., -6., -7., -8., 1., 1., 1., 1., 1., 1., 1., 1.,
        );
        let actual = _mm512_mask_fnmsub_ps(lhs, 0b00000000_11111111, rhs, term);
        assert_eq_m512(actual, expected);
    }

    // `_mm512_maskz_fnmsub_ps`: an empty mask must give all zeros; with the low eight mask bits
    // set, the first eight `setr` lanes are expected to be `-rhs - 1` and the rest zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_fnmsub_ps() {
        let lhs = _mm512_set1_ps(1.);
        let rhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let term = _mm512_set1_ps(1.);

        let zeroed = _mm512_maskz_fnmsub_ps(0, lhs, rhs, term);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        let expected = _mm512_setr_ps(
            -1., -2., -3., -4., -5., -6., -7., -8., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        let actual = _mm512_maskz_fnmsub_ps(0b00000000_11111111, lhs, rhs, term);
        assert_eq_m512(actual, expected);
    }

    // `_mm512_mask3_fnmsub_ps`: an empty mask must return the third operand untouched; with the
    // low eight mask bits set, the first eight `setr` lanes are expected to be `-rhs - 1` while
    // the remaining lanes keep the third operand's value of 2.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask3_fnmsub_ps() {
        let lhs = _mm512_set1_ps(1.);
        let rhs = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let term = _mm512_setr_ps(
            1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2.,
        );

        let untouched = _mm512_mask3_fnmsub_ps(lhs, rhs, term, 0);
        assert_eq_m512(untouched, term);

        let expected = _mm512_setr_ps(
            -1., -2., -3., -4., -5., -6., -7., -8., 2., 2., 2., 2., 2., 2., 2., 2.,
        );
        let actual = _mm512_mask3_fnmsub_ps(lhs, rhs, term, 0b00000000_11111111);
        assert_eq_m512(actual, expected);
    }

    // `_mm256_mask_fnmsub_ps`: an empty mask must return the first operand untouched; with all
    // eight mask bits set, every lane is expected to be `-rhs - 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_fnmsub_ps() {
        let lhs = _mm256_set1_ps(1.);
        let rhs = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let term = _mm256_set1_ps(1.);

        let untouched = _mm256_mask_fnmsub_ps(lhs, 0, rhs, term);
        assert_eq_m256(untouched, lhs);

        let expected = _mm256_set_ps(-1., -2., -3., -4., -5., -6., -7., -8.);
        let actual = _mm256_mask_fnmsub_ps(lhs, 0b11111111, rhs, term);
        assert_eq_m256(actual, expected);
    }

    // `_mm256_maskz_fnmsub_ps`: an empty mask must give all zeros; with all eight mask bits
    // set, every lane is expected to be `-rhs - 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_fnmsub_ps() {
        let lhs = _mm256_set1_ps(1.);
        let rhs = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let term = _mm256_set1_ps(1.);

        let zeroed = _mm256_maskz_fnmsub_ps(0, lhs, rhs, term);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        let expected = _mm256_set_ps(-1., -2., -3., -4., -5., -6., -7., -8.);
        let actual = _mm256_maskz_fnmsub_ps(0b11111111, lhs, rhs, term);
        assert_eq_m256(actual, expected);
    }

    // `_mm256_mask3_fnmsub_ps`: an empty mask must return the third operand untouched; with all
    // eight mask bits set, every lane is expected to be `-rhs - 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask3_fnmsub_ps() {
        let lhs = _mm256_set1_ps(1.);
        let rhs = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let term = _mm256_set1_ps(1.);

        let untouched = _mm256_mask3_fnmsub_ps(lhs, rhs, term, 0);
        assert_eq_m256(untouched, term);

        let expected = _mm256_set_ps(-1., -2., -3., -4., -5., -6., -7., -8.);
        let actual = _mm256_mask3_fnmsub_ps(lhs, rhs, term, 0b11111111);
        assert_eq_m256(actual, expected);
    }

    // `_mm_mask_fnmsub_ps`: an empty mask must return the first operand untouched; with the low
    // four mask bits set, every lane is expected to be `-rhs - 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_fnmsub_ps() {
        let lhs = _mm_set1_ps(1.);
        let rhs = _mm_set_ps(0., 1., 2., 3.);
        let term = _mm_set1_ps(1.);

        let untouched = _mm_mask_fnmsub_ps(lhs, 0, rhs, term);
        assert_eq_m128(untouched, lhs);

        let expected = _mm_set_ps(-1., -2., -3., -4.);
        let actual = _mm_mask_fnmsub_ps(lhs, 0b00001111, rhs, term);
        assert_eq_m128(actual, expected);
    }

    // `_mm_maskz_fnmsub_ps`: an empty mask must give all zeros; with the low four mask bits
    // set, every lane is expected to be `-rhs - 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_fnmsub_ps() {
        let lhs = _mm_set1_ps(1.);
        let rhs = _mm_set_ps(0., 1., 2., 3.);
        let term = _mm_set1_ps(1.);

        let zeroed = _mm_maskz_fnmsub_ps(0, lhs, rhs, term);
        assert_eq_m128(zeroed, _mm_setzero_ps());

        let expected = _mm_set_ps(-1., -2., -3., -4.);
        let actual = _mm_maskz_fnmsub_ps(0b00001111, lhs, rhs, term);
        assert_eq_m128(actual, expected);
    }

    // `_mm_mask3_fnmsub_ps`: an empty mask must return the third operand untouched; with the
    // low four mask bits set, every lane is expected to be `-rhs - 1`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask3_fnmsub_ps() {
        let lhs = _mm_set1_ps(1.);
        let rhs = _mm_set_ps(0., 1., 2., 3.);
        let term = _mm_set1_ps(1.);

        let untouched = _mm_mask3_fnmsub_ps(lhs, rhs, term, 0);
        assert_eq_m128(untouched, term);

        let expected = _mm_set_ps(-1., -2., -3., -4.);
        let actual = _mm_mask3_fnmsub_ps(lhs, rhs, term, 0b00001111);
        assert_eq_m128(actual, expected);
    }

    // `_mm512_rcp14_ps` on an all-3.0 vector: every lane is expected to be `0.33333206`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_rcp14_ps() {
        let input = _mm512_set1_ps(3.);
        let expected = _mm512_set1_ps(0.33333206);
        let actual = _mm512_rcp14_ps(input);
        assert_eq_m512(actual, expected);
    }

    // `_mm512_mask_rcp14_ps`: an empty mask must return the first operand untouched; with the
    // high eight mask bits set, only the last eight `setr` lanes are expected to be
    // `0.33333206` while the first eight keep the value 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_rcp14_ps() {
        let input = _mm512_set1_ps(3.);

        let untouched = _mm512_mask_rcp14_ps(input, 0, input);
        assert_eq_m512(untouched, input);

        let expected = _mm512_setr_ps(
            3., 3., 3., 3., 3., 3., 3., 3., 0.33333206, 0.33333206, 0.33333206, 0.33333206,
            0.33333206, 0.33333206, 0.33333206, 0.33333206,
        );
        let actual = _mm512_mask_rcp14_ps(input, 0b11111111_00000000, input);
        assert_eq_m512(actual, expected);
    }

    // `_mm512_maskz_rcp14_ps`: an empty mask must give all zeros; with the high eight mask bits
    // set, the last eight `setr` lanes are expected to be `0.33333206` and the first eight zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_rcp14_ps() {
        let input = _mm512_set1_ps(3.);

        let zeroed = _mm512_maskz_rcp14_ps(0, input);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        let expected = _mm512_setr_ps(
            0., 0., 0., 0., 0., 0., 0., 0., 0.33333206, 0.33333206, 0.33333206, 0.33333206,
            0.33333206, 0.33333206, 0.33333206, 0.33333206,
        );
        let actual = _mm512_maskz_rcp14_ps(0b11111111_00000000, input);
        assert_eq_m512(actual, expected);
    }

    // `_mm256_rcp14_ps` on an all-3.0 vector: every lane is expected to be `0.33333206`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_rcp14_ps() {
        let input = _mm256_set1_ps(3.);
        let expected = _mm256_set1_ps(0.33333206);
        let actual = _mm256_rcp14_ps(input);
        assert_eq_m256(actual, expected);
    }

    // `_mm256_mask_rcp14_ps`: an empty mask must return the first operand untouched; with all
    // eight mask bits set, every lane is expected to be `0.33333206`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_rcp14_ps() {
        let input = _mm256_set1_ps(3.);

        let untouched = _mm256_mask_rcp14_ps(input, 0, input);
        assert_eq_m256(untouched, input);

        let expected = _mm256_set1_ps(0.33333206);
        let actual = _mm256_mask_rcp14_ps(input, 0b11111111, input);
        assert_eq_m256(actual, expected);
    }

    // `_mm256_maskz_rcp14_ps`: an empty mask must give all zeros; with all eight mask bits set,
    // every lane is expected to be `0.33333206`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_rcp14_ps() {
        let input = _mm256_set1_ps(3.);

        let zeroed = _mm256_maskz_rcp14_ps(0, input);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        let expected = _mm256_set1_ps(0.33333206);
        let actual = _mm256_maskz_rcp14_ps(0b11111111, input);
        assert_eq_m256(actual, expected);
    }

    // `_mm_rcp14_ps` on an all-3.0 vector: every lane is expected to be `0.33333206`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_rcp14_ps() {
        let input = _mm_set1_ps(3.);
        let expected = _mm_set1_ps(0.33333206);
        let actual = _mm_rcp14_ps(input);
        assert_eq_m128(actual, expected);
    }

    // `_mm_mask_rcp14_ps`: an empty mask must return the first operand untouched; with the low
    // four mask bits set, every lane is expected to be `0.33333206`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_rcp14_ps() {
        let input = _mm_set1_ps(3.);

        let untouched = _mm_mask_rcp14_ps(input, 0, input);
        assert_eq_m128(untouched, input);

        let expected = _mm_set1_ps(0.33333206);
        let actual = _mm_mask_rcp14_ps(input, 0b00001111, input);
        assert_eq_m128(actual, expected);
    }

    // `_mm_maskz_rcp14_ps`: an empty mask must give all zeros; with the low four mask bits set,
    // every lane is expected to be `0.33333206`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_rcp14_ps() {
        let input = _mm_set1_ps(3.);

        let zeroed = _mm_maskz_rcp14_ps(0, input);
        assert_eq_m128(zeroed, _mm_setzero_ps());

        let expected = _mm_set1_ps(0.33333206);
        let actual = _mm_maskz_rcp14_ps(0b00001111, input);
        assert_eq_m128(actual, expected);
    }

    // `_mm512_rsqrt14_ps` over lanes of 3.0 is expected to give 0.5773392 in every lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_rsqrt14_ps() {
        let input = _mm512_set1_ps(3.);
        let want = _mm512_set1_ps(0.5773392);
        let got = _mm512_rsqrt14_ps(input);
        assert_eq_m512(got, want);
    }

    // Write-masked reciprocal square root: empty mask returns `src`; with only the upper
    // eight mask bits set, half of the lanes keep 3.0 and half become 0.5773392.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_rsqrt14_ps() {
        let input = _mm512_set1_ps(3.);

        // No mask bits set: the result must equal `src` (here `input`).
        let passthrough = _mm512_mask_rsqrt14_ps(input, 0, input);
        assert_eq_m512(passthrough, input);

        // Upper eight mask bits set.
        let upper_half = _mm512_mask_rsqrt14_ps(input, 0b11111111_00000000, input);
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            3., 3., 3., 3.,
            3., 3., 3., 3.,
            0.5773392, 0.5773392, 0.5773392, 0.5773392,
            0.5773392, 0.5773392, 0.5773392, 0.5773392,
        );
        assert_eq_m512(upper_half, want);
    }

    // Zero-masked reciprocal square root: empty mask gives zeros; with only the upper eight
    // mask bits set, half of the lanes are zero and half are 0.5773392.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_rsqrt14_ps() {
        let input = _mm512_set1_ps(3.);

        // No mask bits set.
        let zeroed = _mm512_maskz_rsqrt14_ps(0, input);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        // Upper eight mask bits set.
        let upper_half = _mm512_maskz_rsqrt14_ps(0b11111111_00000000, input);
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0., 0., 0., 0.,
            0., 0., 0., 0.,
            0.5773392, 0.5773392, 0.5773392, 0.5773392,
            0.5773392, 0.5773392, 0.5773392, 0.5773392,
        );
        assert_eq_m512(upper_half, want);
    }

    // Write-masked 256-bit reciprocal square root: empty mask returns `src`, full mask gives
    // 0.5773392 in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_rsqrt14_ps() {
        let input = _mm256_set1_ps(3.);

        // No mask bits set: the result must equal `src` (here `input`).
        let passthrough = _mm256_mask_rsqrt14_ps(input, 0, input);
        assert_eq_m256(passthrough, input);

        // All eight mask bits set.
        let computed = _mm256_mask_rsqrt14_ps(input, 0b11111111, input);
        let want = _mm256_set1_ps(0.5773392);
        assert_eq_m256(computed, want);
    }

    // Zero-masked 256-bit reciprocal square root: empty mask gives zeros, full mask gives
    // 0.5773392 in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_rsqrt14_ps() {
        let input = _mm256_set1_ps(3.);

        // No mask bits set.
        let zeroed = _mm256_maskz_rsqrt14_ps(0, input);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        // All eight mask bits set.
        let computed = _mm256_maskz_rsqrt14_ps(0b11111111, input);
        let want = _mm256_set1_ps(0.5773392);
        assert_eq_m256(computed, want);
    }

    // Write-masked 128-bit reciprocal square root: empty mask returns `src`, the four low
    // mask bits give 0.5773392 in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_rsqrt14_ps() {
        let input = _mm_set1_ps(3.);

        // No mask bits set: the result must equal `src` (here `input`).
        let passthrough = _mm_mask_rsqrt14_ps(input, 0, input);
        assert_eq_m128(passthrough, input);

        // Low four mask bits set.
        let computed = _mm_mask_rsqrt14_ps(input, 0b00001111, input);
        let want = _mm_set1_ps(0.5773392);
        assert_eq_m128(computed, want);
    }

    // Zero-masked 128-bit reciprocal square root: empty mask gives zeros, the four low mask
    // bits give 0.5773392 in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_rsqrt14_ps() {
        let input = _mm_set1_ps(3.);

        // No mask bits set.
        let zeroed = _mm_maskz_rsqrt14_ps(0, input);
        assert_eq_m128(zeroed, _mm_setzero_ps());

        // Low four mask bits set.
        let computed = _mm_maskz_rsqrt14_ps(0b00001111, input);
        let want = _mm_set1_ps(0.5773392);
        assert_eq_m128(computed, want);
    }

    // `_mm512_getexp_ps` over lanes of 3.0 is expected to give 1.0 in every lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_getexp_ps() {
        let input = _mm512_set1_ps(3.);
        let want = _mm512_set1_ps(1.);
        let got = _mm512_getexp_ps(input);
        assert_eq_m512(got, want);
    }

    // Write-masked exponent extraction: empty mask returns `src`; with only the upper eight
    // mask bits set, half of the lanes keep 3.0 and half become 1.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_getexp_ps() {
        let input = _mm512_set1_ps(3.);

        // No mask bits set: the result must equal `src` (here `input`).
        let passthrough = _mm512_mask_getexp_ps(input, 0, input);
        assert_eq_m512(passthrough, input);

        // Upper eight mask bits set.
        let upper_half = _mm512_mask_getexp_ps(input, 0b11111111_00000000, input);
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            3., 3., 3., 3.,
            3., 3., 3., 3.,
            1., 1., 1., 1.,
            1., 1., 1., 1.,
        );
        assert_eq_m512(upper_half, want);
    }

    // Zero-masked exponent extraction: empty mask gives zeros; with only the upper eight
    // mask bits set, half of the lanes are zero and half are 1.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_getexp_ps() {
        let input = _mm512_set1_ps(3.);

        // No mask bits set.
        let zeroed = _mm512_maskz_getexp_ps(0, input);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        // Upper eight mask bits set.
        let upper_half = _mm512_maskz_getexp_ps(0b11111111_00000000, input);
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0., 0., 0., 0.,
            0., 0., 0., 0.,
            1., 1., 1., 1.,
            1., 1., 1., 1.,
        );
        assert_eq_m512(upper_half, want);
    }

    // `_mm256_getexp_ps` over lanes of 3.0 is expected to give 1.0 in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_getexp_ps() {
        let input = _mm256_set1_ps(3.);
        let want = _mm256_set1_ps(1.);
        let got = _mm256_getexp_ps(input);
        assert_eq_m256(got, want);
    }

    // Write-masked 256-bit exponent extraction: empty mask returns `src`, full mask gives
    // 1.0 in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_getexp_ps() {
        let input = _mm256_set1_ps(3.);

        // No mask bits set: the result must equal `src` (here `input`).
        let passthrough = _mm256_mask_getexp_ps(input, 0, input);
        assert_eq_m256(passthrough, input);

        // All eight mask bits set.
        let computed = _mm256_mask_getexp_ps(input, 0b11111111, input);
        let want = _mm256_set1_ps(1.);
        assert_eq_m256(computed, want);
    }

    // Zero-masked 256-bit exponent extraction: empty mask gives zeros, full mask gives 1.0
    // in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_getexp_ps() {
        let input = _mm256_set1_ps(3.);

        // No mask bits set.
        let zeroed = _mm256_maskz_getexp_ps(0, input);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        // All eight mask bits set.
        let computed = _mm256_maskz_getexp_ps(0b11111111, input);
        let want = _mm256_set1_ps(1.);
        assert_eq_m256(computed, want);
    }

    // `_mm_getexp_ps` over lanes of 3.0 is expected to give 1.0 in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_getexp_ps() {
        let input = _mm_set1_ps(3.);
        let want = _mm_set1_ps(1.);
        let got = _mm_getexp_ps(input);
        assert_eq_m128(got, want);
    }

    // Write-masked 128-bit exponent extraction: empty mask returns `src`, the four low mask
    // bits give 1.0 in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_getexp_ps() {
        let input = _mm_set1_ps(3.);

        // No mask bits set: the result must equal `src` (here `input`).
        let passthrough = _mm_mask_getexp_ps(input, 0, input);
        assert_eq_m128(passthrough, input);

        // Low four mask bits set.
        let computed = _mm_mask_getexp_ps(input, 0b00001111, input);
        let want = _mm_set1_ps(1.);
        assert_eq_m128(computed, want);
    }

    // Zero-masked 128-bit exponent extraction: empty mask gives zeros, the four low mask
    // bits give 1.0 in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_getexp_ps() {
        let input = _mm_set1_ps(3.);

        // No mask bits set.
        let zeroed = _mm_maskz_getexp_ps(0, input);
        assert_eq_m128(zeroed, _mm_setzero_ps());

        // Low four mask bits set.
        let computed = _mm_maskz_getexp_ps(0b00001111, input);
        let want = _mm_set1_ps(1.);
        assert_eq_m128(computed, want);
    }

    // `_mm512_roundscale_ps` with an all-zero immediate is expected to turn 1.1 into 1.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_roundscale_ps() {
        let input = _mm512_set1_ps(1.1);
        let want = _mm512_set1_ps(1.0);
        let got = _mm512_roundscale_ps::<0b00_00_00_00>(input);
        assert_eq_m512(got, want);
    }

    // Write-masked roundscale: empty mask leaves every lane at 1.1, a full 16-bit mask
    // rounds every lane to 1.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_roundscale_ps() {
        let input = _mm512_set1_ps(1.1);

        // No mask bits set: lanes are expected to stay at the `src` value 1.1.
        let passthrough = _mm512_mask_roundscale_ps::<0b00_00_00_00>(input, 0, input);
        let want_src = _mm512_set1_ps(1.1);
        assert_eq_m512(passthrough, want_src);

        // All sixteen mask bits set.
        let rounded =
            _mm512_mask_roundscale_ps::<0b00_00_00_00>(input, 0b11111111_11111111, input);
        let want_rounded = _mm512_set1_ps(1.0);
        assert_eq_m512(rounded, want_rounded);
    }

    // Zero-masked roundscale: empty mask gives zeros, a full 16-bit mask rounds every lane
    // of 1.1 to 1.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_roundscale_ps() {
        let input = _mm512_set1_ps(1.1);

        // No mask bits set.
        let zeroed = _mm512_maskz_roundscale_ps::<0b00_00_00_00>(0, input);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        // All sixteen mask bits set.
        let rounded = _mm512_maskz_roundscale_ps::<0b00_00_00_00>(0b11111111_11111111, input);
        let want = _mm512_set1_ps(1.0);
        assert_eq_m512(rounded, want);
    }

    // `_mm256_roundscale_ps` with an all-zero immediate is expected to turn 1.1 into 1.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_roundscale_ps() {
        let input = _mm256_set1_ps(1.1);
        let want = _mm256_set1_ps(1.0);
        let got = _mm256_roundscale_ps::<0b00_00_00_00>(input);
        assert_eq_m256(got, want);
    }

    // Write-masked 256-bit roundscale: empty mask leaves every lane at 1.1, a full mask
    // rounds every lane to 1.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_roundscale_ps() {
        let input = _mm256_set1_ps(1.1);

        // No mask bits set: lanes are expected to stay at the `src` value 1.1.
        let passthrough = _mm256_mask_roundscale_ps::<0b00_00_00_00>(input, 0, input);
        let want_src = _mm256_set1_ps(1.1);
        assert_eq_m256(passthrough, want_src);

        // All eight mask bits set.
        let rounded = _mm256_mask_roundscale_ps::<0b00_00_00_00>(input, 0b11111111, input);
        let want_rounded = _mm256_set1_ps(1.0);
        assert_eq_m256(rounded, want_rounded);
    }

    // Zero-masked 256-bit roundscale: empty mask gives zeros, a full mask rounds every lane
    // of 1.1 to 1.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_roundscale_ps() {
        let input = _mm256_set1_ps(1.1);

        // No mask bits set.
        let zeroed = _mm256_maskz_roundscale_ps::<0b00_00_00_00>(0, input);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        // All eight mask bits set.
        let rounded = _mm256_maskz_roundscale_ps::<0b00_00_00_00>(0b11111111, input);
        let want = _mm256_set1_ps(1.0);
        assert_eq_m256(rounded, want);
    }

    // `_mm_roundscale_ps` with an all-zero immediate is expected to turn 1.1 into 1.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_roundscale_ps() {
        let input = _mm_set1_ps(1.1);
        let want = _mm_set1_ps(1.0);
        let got = _mm_roundscale_ps::<0b00_00_00_00>(input);
        assert_eq_m128(got, want);
    }

    // Write-masked 128-bit roundscale: empty mask leaves every lane at 1.1, the four low
    // mask bits round every lane to 1.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_roundscale_ps() {
        let input = _mm_set1_ps(1.1);

        // No mask bits set: lanes are expected to stay at the `src` value 1.1.
        let passthrough = _mm_mask_roundscale_ps::<0b00_00_00_00>(input, 0, input);
        let want_src = _mm_set1_ps(1.1);
        assert_eq_m128(passthrough, want_src);

        // Low four mask bits set.
        let rounded = _mm_mask_roundscale_ps::<0b00_00_00_00>(input, 0b00001111, input);
        let want_rounded = _mm_set1_ps(1.0);
        assert_eq_m128(rounded, want_rounded);
    }

    // Zero-masked 128-bit roundscale: empty mask gives zeros, the four low mask bits round
    // every lane of 1.1 to 1.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_roundscale_ps() {
        let input = _mm_set1_ps(1.1);

        // No mask bits set.
        let zeroed = _mm_maskz_roundscale_ps::<0b00_00_00_00>(0, input);
        assert_eq_m128(zeroed, _mm_setzero_ps());

        // Low four mask bits set.
        let rounded = _mm_maskz_roundscale_ps::<0b00_00_00_00>(0b00001111, input);
        let want = _mm_set1_ps(1.0);
        assert_eq_m128(rounded, want);
    }

    // `_mm512_scalef_ps(1.0, 3.0)` is expected to give 8.0 in every lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_scalef_ps() {
        let base = _mm512_set1_ps(1.);
        let scale = _mm512_set1_ps(3.);
        let want = _mm512_set1_ps(8.);
        let got = _mm512_scalef_ps(base, scale);
        assert_eq_m512(got, want);
    }

    // Write-masked scalef: empty mask returns `src`; with only the upper eight mask bits
    // set, half of the lanes become 8.0 and half keep 1.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_scalef_ps() {
        let base = _mm512_set1_ps(1.);
        let scale = _mm512_set1_ps(3.);

        // No mask bits set: the result must equal `src` (here `base`).
        let passthrough = _mm512_mask_scalef_ps(base, 0, base, scale);
        assert_eq_m512(passthrough, base);

        // Upper eight mask bits set. Note the expectation is built with `_mm512_set_ps`.
        let upper_half = _mm512_mask_scalef_ps(base, 0b11111111_00000000, base, scale);
        #[rustfmt::skip]
        let want = _mm512_set_ps(
            8., 8., 8., 8.,
            8., 8., 8., 8.,
            1., 1., 1., 1.,
            1., 1., 1., 1.,
        );
        assert_eq_m512(upper_half, want);
    }

    // Zero-masked scalef: empty mask gives zeros; with only the upper eight mask bits set,
    // half of the lanes become 8.0 and half are zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_scalef_ps() {
        let base = _mm512_set1_ps(1.);
        let scale = _mm512_set1_ps(3.);

        // No mask bits set.
        let zeroed = _mm512_maskz_scalef_ps(0, base, scale);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        // Upper eight mask bits set. Note the expectation is built with `_mm512_set_ps`.
        let upper_half = _mm512_maskz_scalef_ps(0b11111111_00000000, base, scale);
        #[rustfmt::skip]
        let want = _mm512_set_ps(
            8., 8., 8., 8.,
            8., 8., 8., 8.,
            0., 0., 0., 0.,
            0., 0., 0., 0.,
        );
        assert_eq_m512(upper_half, want);
    }

    // `_mm256_scalef_ps(1.0, 3.0)` is expected to give 8.0 in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_scalef_ps() {
        let base = _mm256_set1_ps(1.);
        let scale = _mm256_set1_ps(3.);
        let want = _mm256_set1_ps(8.);
        let got = _mm256_scalef_ps(base, scale);
        assert_eq_m256(got, want);
    }

    // Write-masked 256-bit scalef: empty mask returns `src`, full mask gives 8.0 in every
    // lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_scalef_ps() {
        let base = _mm256_set1_ps(1.);
        let scale = _mm256_set1_ps(3.);

        // No mask bits set: the result must equal `src` (here `base`).
        let passthrough = _mm256_mask_scalef_ps(base, 0, base, scale);
        assert_eq_m256(passthrough, base);

        // All eight mask bits set.
        let computed = _mm256_mask_scalef_ps(base, 0b11111111, base, scale);
        let want = _mm256_set1_ps(8.);
        assert_eq_m256(computed, want);
    }

    // Zero-masked 256-bit scalef: empty mask gives zeros, full mask gives 8.0 in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_scalef_ps() {
        let base = _mm256_set1_ps(1.);
        let scale = _mm256_set1_ps(3.);

        // No mask bits set.
        let zeroed = _mm256_maskz_scalef_ps(0, base, scale);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        // All eight mask bits set.
        let computed = _mm256_maskz_scalef_ps(0b11111111, base, scale);
        let want = _mm256_set1_ps(8.);
        assert_eq_m256(computed, want);
    }

    // `_mm_scalef_ps(1.0, 3.0)` is expected to give 8.0 in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_scalef_ps() {
        let base = _mm_set1_ps(1.);
        let scale = _mm_set1_ps(3.);
        let want = _mm_set1_ps(8.);
        let got = _mm_scalef_ps(base, scale);
        assert_eq_m128(got, want);
    }

    // Write-masked 128-bit scalef: empty mask returns `src`, the four low mask bits give
    // 8.0 in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_scalef_ps() {
        let base = _mm_set1_ps(1.);
        let scale = _mm_set1_ps(3.);

        // No mask bits set: the result must equal `src` (here `base`).
        let passthrough = _mm_mask_scalef_ps(base, 0, base, scale);
        assert_eq_m128(passthrough, base);

        // Low four mask bits set.
        let computed = _mm_mask_scalef_ps(base, 0b00001111, base, scale);
        let want = _mm_set1_ps(8.);
        assert_eq_m128(computed, want);
    }

    // Zero-masked 128-bit scalef: empty mask gives zeros, the four low mask bits give 8.0
    // in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_scalef_ps() {
        let base = _mm_set1_ps(1.);
        let scale = _mm_set1_ps(3.);

        // No mask bits set.
        let zeroed = _mm_maskz_scalef_ps(0, base, scale);
        assert_eq_m128(zeroed, _mm_setzero_ps());

        // Low four mask bits set.
        let computed = _mm_maskz_scalef_ps(0b00001111, base, scale);
        let want = _mm_set1_ps(8.);
        assert_eq_m128(computed, want);
    }

    // `_mm512_fixupimm_ps::<5>` with NaN in `a`, `f32::MAX` in `b` and `i32::MAX` in `c` is
    // expected to give 0.0 in every lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_fixupimm_ps() {
        let nan_lanes = _mm512_set1_ps(f32::NAN);
        let max_lanes = _mm512_set1_ps(f32::MAX);
        let table = _mm512_set1_epi32(i32::MAX);
        let want = _mm512_set1_ps(0.0);
        let got = _mm512_fixupimm_ps::<5>(nan_lanes, max_lanes, table);
        assert_eq_m512(got, want);
    }

    // Write-masked fixupimm: `a` holds eight NaN and eight 1.0 values; with only the upper
    // eight mask bits set the NaN lanes are expected to become 0.0 and the 1.0 lanes to stay.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_fixupimm_ps() {
        #[rustfmt::skip]
        let mixed = _mm512_set_ps(
            f32::NAN, f32::NAN, f32::NAN, f32::NAN,
            f32::NAN, f32::NAN, f32::NAN, f32::NAN,
            1., 1., 1., 1.,
            1., 1., 1., 1.,
        );
        let max_lanes = _mm512_set1_ps(f32::MAX);
        let table = _mm512_set1_epi32(i32::MAX);

        let got = _mm512_mask_fixupimm_ps::<5>(mixed, 0b11111111_00000000, max_lanes, table);
        #[rustfmt::skip]
        let want = _mm512_set_ps(
            0., 0., 0., 0.,
            0., 0., 0., 0.,
            1., 1., 1., 1.,
            1., 1., 1., 1.,
        );
        assert_eq_m512(got, want);
    }

    // Zero-masked fixupimm: `a` holds eight NaN and eight 1.0 values; with only the upper
    // eight mask bits set the whole result is expected to be 0.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_fixupimm_ps() {
        #[rustfmt::skip]
        let mixed = _mm512_set_ps(
            f32::NAN, f32::NAN, f32::NAN, f32::NAN,
            f32::NAN, f32::NAN, f32::NAN, f32::NAN,
            1., 1., 1., 1.,
            1., 1., 1., 1.,
        );
        let max_lanes = _mm512_set1_ps(f32::MAX);
        let table = _mm512_set1_epi32(i32::MAX);

        let got = _mm512_maskz_fixupimm_ps::<5>(0b11111111_00000000, mixed, max_lanes, table);
        #[rustfmt::skip]
        let want = _mm512_set_ps(
            0., 0., 0., 0.,
            0., 0., 0., 0.,
            0., 0., 0., 0.,
            0., 0., 0., 0.,
        );
        assert_eq_m512(got, want);
    }

    // `_mm256_fixupimm_ps::<5>` with NaN in `a`, `f32::MAX` in `b` and `i32::MAX` in `c` is
    // expected to give 0.0 in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_fixupimm_ps() {
        let nan_lanes = _mm256_set1_ps(f32::NAN);
        let max_lanes = _mm256_set1_ps(f32::MAX);
        let table = _mm256_set1_epi32(i32::MAX);
        let want = _mm256_set1_ps(0.0);
        let got = _mm256_fixupimm_ps::<5>(nan_lanes, max_lanes, table);
        assert_eq_m256(got, want);
    }

    // Write-masked 256-bit fixupimm with a full mask: NaN lanes are expected to become 0.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_fixupimm_ps() {
        let nan_lanes = _mm256_set1_ps(f32::NAN);
        let max_lanes = _mm256_set1_ps(f32::MAX);
        let table = _mm256_set1_epi32(i32::MAX);
        let want = _mm256_set1_ps(0.0);
        let got = _mm256_mask_fixupimm_ps::<5>(nan_lanes, 0b11111111, max_lanes, table);
        assert_eq_m256(got, want);
    }

    // Zero-masked 256-bit fixupimm with a full mask: NaN lanes are expected to become 0.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_fixupimm_ps() {
        let nan_lanes = _mm256_set1_ps(f32::NAN);
        let max_lanes = _mm256_set1_ps(f32::MAX);
        let table = _mm256_set1_epi32(i32::MAX);
        let want = _mm256_set1_ps(0.0);
        let got = _mm256_maskz_fixupimm_ps::<5>(0b11111111, nan_lanes, max_lanes, table);
        assert_eq_m256(got, want);
    }

    // `_mm_fixupimm_ps::<5>` with NaN in `a`, `f32::MAX` in `b` and `i32::MAX` in `c` is
    // expected to give 0.0 in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_fixupimm_ps() {
        let nan_lanes = _mm_set1_ps(f32::NAN);
        let max_lanes = _mm_set1_ps(f32::MAX);
        let table = _mm_set1_epi32(i32::MAX);
        let want = _mm_set1_ps(0.0);
        let got = _mm_fixupimm_ps::<5>(nan_lanes, max_lanes, table);
        assert_eq_m128(got, want);
    }

    // Write-masked 128-bit fixupimm with the four low mask bits set: NaN lanes are expected
    // to become 0.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_fixupimm_ps() {
        let nan_lanes = _mm_set1_ps(f32::NAN);
        let max_lanes = _mm_set1_ps(f32::MAX);
        let table = _mm_set1_epi32(i32::MAX);
        let want = _mm_set1_ps(0.0);
        let got = _mm_mask_fixupimm_ps::<5>(nan_lanes, 0b00001111, max_lanes, table);
        assert_eq_m128(got, want);
    }

    // Zero-masked 128-bit fixupimm with the four low mask bits set: NaN lanes are expected
    // to become 0.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_fixupimm_ps() {
        let nan_lanes = _mm_set1_ps(f32::NAN);
        let max_lanes = _mm_set1_ps(f32::MAX);
        let table = _mm_set1_epi32(i32::MAX);
        let want = _mm_set1_ps(0.0);
        let got = _mm_maskz_fixupimm_ps::<5>(0b00001111, nan_lanes, max_lanes, table);
        assert_eq_m128(got, want);
    }

    // `_mm512_ternarylogic_epi32::<8>` on three operands with disjoint single bits set
    // (bit 2, bit 1, bit 0) is expected to give all zeros.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_ternarylogic_epi32() {
        let bit2 = _mm512_set1_epi32(1 << 2);
        let bit1 = _mm512_set1_epi32(1 << 1);
        let bit0 = _mm512_set1_epi32(1 << 0);
        let want = _mm512_set1_epi32(0);
        let got = _mm512_ternarylogic_epi32::<8>(bit2, bit1, bit0);
        assert_eq_m512i(got, want);
    }

    // Write-masked ternary logic: empty mask returns `src`, a full 16-bit mask is expected
    // to give all zeros for immediate 8 on disjoint single-bit operands.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_ternarylogic_epi32() {
        let src = _mm512_set1_epi32(1 << 2);
        let bit1 = _mm512_set1_epi32(1 << 1);
        let bit0 = _mm512_set1_epi32(1 << 0);

        // No mask bits set: the result must equal `src`.
        let passthrough = _mm512_mask_ternarylogic_epi32::<8>(src, 0, bit1, bit0);
        assert_eq_m512i(passthrough, src);

        // All sixteen mask bits set.
        let computed = _mm512_mask_ternarylogic_epi32::<8>(src, 0b11111111_11111111, bit1, bit0);
        let want = _mm512_set1_epi32(0);
        assert_eq_m512i(computed, want);
    }

    // Zero-masked ternary logic: empty mask gives zeros, a full 16-bit mask is expected to
    // give all zeros for immediate 8 on disjoint single-bit operands.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_ternarylogic_epi32() {
        let bit2 = _mm512_set1_epi32(1 << 2);
        let bit1 = _mm512_set1_epi32(1 << 1);
        let bit0 = _mm512_set1_epi32(1 << 0);

        // NOTE(review): this call uses immediate 9 while the next one uses 8; with an empty
        // mask only zeros are asserted -- confirm whether the difference is intentional.
        let zeroed = _mm512_maskz_ternarylogic_epi32::<9>(0, bit2, bit1, bit0);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());

        // All sixteen mask bits set.
        let computed =
            _mm512_maskz_ternarylogic_epi32::<8>(0b11111111_11111111, bit2, bit1, bit0);
        let want = _mm512_set1_epi32(0);
        assert_eq_m512i(computed, want);
    }

    // `_mm256_ternarylogic_epi32::<8>` on three operands with disjoint single bits set
    // (bit 2, bit 1, bit 0) is expected to give all zeros.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_ternarylogic_epi32() {
        let bit2 = _mm256_set1_epi32(1 << 2);
        let bit1 = _mm256_set1_epi32(1 << 1);
        let bit0 = _mm256_set1_epi32(1 << 0);
        let want = _mm256_set1_epi32(0);
        let got = _mm256_ternarylogic_epi32::<8>(bit2, bit1, bit0);
        assert_eq_m256i(got, want);
    }

    // Write-masked 256-bit ternary logic: empty mask returns `src`, a full mask is expected
    // to give all zeros for immediate 8 on disjoint single-bit operands.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_ternarylogic_epi32() {
        let src = _mm256_set1_epi32(1 << 2);
        let bit1 = _mm256_set1_epi32(1 << 1);
        let bit0 = _mm256_set1_epi32(1 << 0);

        // No mask bits set: the result must equal `src`.
        let passthrough = _mm256_mask_ternarylogic_epi32::<8>(src, 0, bit1, bit0);
        assert_eq_m256i(passthrough, src);

        // All eight mask bits set.
        let computed = _mm256_mask_ternarylogic_epi32::<8>(src, 0b11111111, bit1, bit0);
        let want = _mm256_set1_epi32(0);
        assert_eq_m256i(computed, want);
    }

    // Zero-masked 256-bit ternary logic: empty mask gives zeros, a full mask is expected to
    // give all zeros for immediate 8 on disjoint single-bit operands.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_ternarylogic_epi32() {
        let bit2 = _mm256_set1_epi32(1 << 2);
        let bit1 = _mm256_set1_epi32(1 << 1);
        let bit0 = _mm256_set1_epi32(1 << 0);

        // NOTE(review): this call uses immediate 9 while the next one uses 8; with an empty
        // mask only zeros are asserted -- confirm whether the difference is intentional.
        let zeroed = _mm256_maskz_ternarylogic_epi32::<9>(0, bit2, bit1, bit0);
        assert_eq_m256i(zeroed, _mm256_setzero_si256());

        // All eight mask bits set.
        let computed = _mm256_maskz_ternarylogic_epi32::<8>(0b11111111, bit2, bit1, bit0);
        let want = _mm256_set1_epi32(0);
        assert_eq_m256i(computed, want);
    }

    // `_mm_ternarylogic_epi32::<8>` on three operands with disjoint single bits set
    // (bit 2, bit 1, bit 0) is expected to give all zeros.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_ternarylogic_epi32() {
        let bit2 = _mm_set1_epi32(1 << 2);
        let bit1 = _mm_set1_epi32(1 << 1);
        let bit0 = _mm_set1_epi32(1 << 0);
        let want = _mm_set1_epi32(0);
        let got = _mm_ternarylogic_epi32::<8>(bit2, bit1, bit0);
        assert_eq_m128i(got, want);
    }

    // Write-masked 128-bit ternary logic: empty mask returns `src`, the four low mask bits
    // are expected to give all zeros for immediate 8 on disjoint single-bit operands.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_ternarylogic_epi32() {
        let src = _mm_set1_epi32(1 << 2);
        let bit1 = _mm_set1_epi32(1 << 1);
        let bit0 = _mm_set1_epi32(1 << 0);

        // No mask bits set: the result must equal `src`.
        let passthrough = _mm_mask_ternarylogic_epi32::<8>(src, 0, bit1, bit0);
        assert_eq_m128i(passthrough, src);

        // Low four mask bits set.
        let computed = _mm_mask_ternarylogic_epi32::<8>(src, 0b00001111, bit1, bit0);
        let want = _mm_set1_epi32(0);
        assert_eq_m128i(computed, want);
    }

    // Zero-masked 128-bit ternary logic: empty mask gives zeros, the four low mask bits are
    // expected to give all zeros for immediate 8 on disjoint single-bit operands.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_ternarylogic_epi32() {
        let bit2 = _mm_set1_epi32(1 << 2);
        let bit1 = _mm_set1_epi32(1 << 1);
        let bit0 = _mm_set1_epi32(1 << 0);

        // NOTE(review): this call uses immediate 9 while the next one uses 8; with an empty
        // mask only zeros are asserted -- confirm whether the difference is intentional.
        let zeroed = _mm_maskz_ternarylogic_epi32::<9>(0, bit2, bit1, bit0);
        assert_eq_m128i(zeroed, _mm_setzero_si128());

        // Low four mask bits set.
        let computed = _mm_maskz_ternarylogic_epi32::<8>(0b00001111, bit2, bit1, bit0);
        let want = _mm_set1_epi32(0);
        assert_eq_m128i(computed, want);
    }

    // `_mm512_getmant_ps` with `_MM_MANT_NORM_P75_1P5` / `_MM_MANT_SIGN_NAN` on lanes of
    // 10.0 is expected to give 1.25 in every lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_getmant_ps() {
        let input = _mm512_set1_ps(10.);
        let want = _mm512_set1_ps(1.25);
        let got = _mm512_getmant_ps::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(input);
        assert_eq_m512(got, want);
    }

    // Write-masked mantissa extraction: empty mask returns `src`; with only the upper eight
    // mask bits set, half of the lanes keep 10.0 and half become 1.25.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_getmant_ps() {
        let input = _mm512_set1_ps(10.);

        // No mask bits set: the result must equal `src` (here `input`).
        let passthrough =
            _mm512_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(input, 0, input);
        assert_eq_m512(passthrough, input);

        // Upper eight mask bits set.
        let upper_half = _mm512_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(
            input,
            0b11111111_00000000,
            input,
        );
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            10., 10., 10., 10.,
            10., 10., 10., 10.,
            1.25, 1.25, 1.25, 1.25,
            1.25, 1.25, 1.25, 1.25,
        );
        assert_eq_m512(upper_half, want);
    }

    // Zero-masked mantissa extraction: empty mask gives zeros; with only the upper eight
    // mask bits set, half of the lanes are zero and half are 1.25.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_getmant_ps() {
        let input = _mm512_set1_ps(10.);

        // No mask bits set.
        let zeroed = _mm512_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, input);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        // Upper eight mask bits set.
        let upper_half = _mm512_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(
            0b11111111_00000000,
            input,
        );
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0., 0., 0., 0.,
            0., 0., 0., 0.,
            1.25, 1.25, 1.25, 1.25,
            1.25, 1.25, 1.25, 1.25,
        );
        assert_eq_m512(upper_half, want);
    }

    // `_mm256_getmant_ps` with `_MM_MANT_NORM_P75_1P5` / `_MM_MANT_SIGN_NAN` on lanes of
    // 10.0 is expected to give 1.25 in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_getmant_ps() {
        let input = _mm256_set1_ps(10.);
        let want = _mm256_set1_ps(1.25);
        let got = _mm256_getmant_ps::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(input);
        assert_eq_m256(got, want);
    }

    // Write-masked 256-bit mantissa extraction: empty mask returns `src`, full mask gives
    // 1.25 in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_getmant_ps() {
        let input = _mm256_set1_ps(10.);

        // No mask bits set: the result must equal `src` (here `input`).
        let passthrough =
            _mm256_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(input, 0, input);
        assert_eq_m256(passthrough, input);

        // All eight mask bits set.
        let computed = _mm256_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(
            input, 0b11111111, input,
        );
        let want = _mm256_set1_ps(1.25);
        assert_eq_m256(computed, want);
    }

    // Zero-masked 256-bit mantissa extraction: empty mask gives zeros, full mask gives 1.25
    // in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_getmant_ps() {
        let input = _mm256_set1_ps(10.);

        // No mask bits set.
        let zeroed = _mm256_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, input);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        // All eight mask bits set.
        let computed =
            _mm256_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b11111111, input);
        let want = _mm256_set1_ps(1.25);
        assert_eq_m256(computed, want);
    }

    // `_mm_getmant_ps` with `_MM_MANT_NORM_P75_1P5` / `_MM_MANT_SIGN_NAN` on lanes of 10.0
    // is expected to give 1.25 in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_getmant_ps() {
        let input = _mm_set1_ps(10.);
        let want = _mm_set1_ps(1.25);
        let got = _mm_getmant_ps::<_MM_MANT_NORM_P75_1P5, _MM_MANT_SIGN_NAN>(input);
        assert_eq_m128(got, want);
    }

    // Write-masked 128-bit mantissa extraction: empty mask returns `src`, the four low mask
    // bits give 1.25 in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_getmant_ps() {
        let input = _mm_set1_ps(10.);

        // No mask bits set: the result must equal `src` (here `input`).
        let passthrough =
            _mm_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(input, 0, input);
        assert_eq_m128(passthrough, input);

        // Low four mask bits set.
        let computed = _mm_mask_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(
            input, 0b00001111, input,
        );
        let want = _mm_set1_ps(1.25);
        assert_eq_m128(computed, want);
    }

    // Zero-masked 128-bit mantissa extraction: empty mask gives zeros, the four low mask
    // bits give 1.25 in every lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_getmant_ps() {
        let input = _mm_set1_ps(10.);

        // No mask bits set.
        let zeroed = _mm_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, input);
        assert_eq_m128(zeroed, _mm_setzero_ps());

        // Low four mask bits set.
        let computed =
            _mm_maskz_getmant_ps::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b00001111, input);
        let want = _mm_set1_ps(1.25);
        assert_eq_m128(computed, want);
    }

    // Adds -1.0 to a ramp of values under two rounding modes. The last lane (0.00000007) is
    // the one whose expected result differs between round-to-nearest and round-toward-zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_add_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        const TOWARD_ZERO: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;

        #[rustfmt::skip]
        let lhs = _mm512_setr_ps(
            0., 1.5, 2., 3.5,
            4., 5.5, 6., 7.5,
            8., 9.5, 10., 11.5,
            12., 13.5, 14., 0.00000007,
        );
        let rhs = _mm512_set1_ps(-1.);

        let nearest = _mm512_add_round_ps::<NEAREST>(lhs, rhs);
        #[rustfmt::skip]
        let want_nearest = _mm512_setr_ps(
            -1., 0.5, 1., 2.5,
            3., 4.5, 5., 6.5,
            7., 8.5, 9., 10.5,
            11., 12.5, 13., -0.99999994,
        );
        assert_eq_m512(nearest, want_nearest);

        let truncated = _mm512_add_round_ps::<TOWARD_ZERO>(lhs, rhs);
        #[rustfmt::skip]
        let want_truncated = _mm512_setr_ps(
            -1., 0.5, 1., 2.5,
            3., 4.5, 5., 6.5,
            7., 8.5, 9., 10.5,
            11., 12.5, 13., -0.9999999,
        );
        assert_eq_m512(truncated, want_truncated);
    }

    // Write-masked rounded add: empty mask returns `src`; with only the upper eight mask
    // bits set, half of the lanes keep their `src` value and half hold the rounded sum.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_add_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        const TOWARD_ZERO: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;

        #[rustfmt::skip]
        let lhs = _mm512_setr_ps(
            0., 1.5, 2., 3.5,
            4., 5.5, 6., 7.5,
            8., 9.5, 10., 11.5,
            12., 13.5, 14., 0.00000007,
        );
        let rhs = _mm512_set1_ps(-1.);

        // No mask bits set (this call uses the toward-zero mode): result must equal `src`.
        let passthrough = _mm512_mask_add_round_ps::<TOWARD_ZERO>(lhs, 0, lhs, rhs);
        assert_eq_m512(passthrough, lhs);

        // Upper eight mask bits set, round-to-nearest.
        let upper_half = _mm512_mask_add_round_ps::<NEAREST>(lhs, 0b11111111_00000000, lhs, rhs);
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0., 1.5, 2., 3.5,
            4., 5.5, 6., 7.5,
            7., 8.5, 9., 10.5,
            11., 12.5, 13., -0.99999994,
        );
        assert_eq_m512(upper_half, want);
    }

    // Zero-masked rounded add: empty mask gives zeros; with only the upper eight mask bits
    // set, half of the lanes are zero and half hold the rounded sum.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_add_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        const TOWARD_ZERO: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;

        #[rustfmt::skip]
        let lhs = _mm512_setr_ps(
            0., 1.5, 2., 3.5,
            4., 5.5, 6., 7.5,
            8., 9.5, 10., 11.5,
            12., 13.5, 14., 0.00000007,
        );
        let rhs = _mm512_set1_ps(-1.);

        // No mask bits set (this call uses the toward-zero mode).
        let zeroed = _mm512_maskz_add_round_ps::<TOWARD_ZERO>(0, lhs, rhs);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        // Upper eight mask bits set, round-to-nearest.
        let upper_half = _mm512_maskz_add_round_ps::<NEAREST>(0b11111111_00000000, lhs, rhs);
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0., 0., 0., 0.,
            0., 0., 0., 0.,
            7., 8.5, 9., 10.5,
            11., 12.5, 13., -0.99999994,
        );
        assert_eq_m512(upper_half, want);
    }

    // Subtracts 1.0 from a ramp of values under two rounding modes. The last lane
    // (0.00000007) is the one whose expected result differs between the two modes.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_sub_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        const TOWARD_ZERO: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;

        #[rustfmt::skip]
        let lhs = _mm512_setr_ps(
            0., 1.5, 2., 3.5,
            4., 5.5, 6., 7.5,
            8., 9.5, 10., 11.5,
            12., 13.5, 14., 0.00000007,
        );
        let rhs = _mm512_set1_ps(1.);

        let nearest = _mm512_sub_round_ps::<NEAREST>(lhs, rhs);
        #[rustfmt::skip]
        let want_nearest = _mm512_setr_ps(
            -1., 0.5, 1., 2.5,
            3., 4.5, 5., 6.5,
            7., 8.5, 9., 10.5,
            11., 12.5, 13., -0.99999994,
        );
        assert_eq_m512(nearest, want_nearest);

        let truncated = _mm512_sub_round_ps::<TOWARD_ZERO>(lhs, rhs);
        #[rustfmt::skip]
        let want_truncated = _mm512_setr_ps(
            -1., 0.5, 1., 2.5,
            3., 4.5, 5., 6.5,
            7., 8.5, 9., 10.5,
            11., 12.5, 13., -0.9999999,
        );
        assert_eq_m512(truncated, want_truncated);
    }

    // Write-masked rounded subtract: empty mask returns `src`; with only the upper eight
    // mask bits set, half of the lanes keep their `src` value and half hold the difference.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_sub_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;

        #[rustfmt::skip]
        let lhs = _mm512_setr_ps(
            0., 1.5, 2., 3.5,
            4., 5.5, 6., 7.5,
            8., 9.5, 10., 11.5,
            12., 13.5, 14., 0.00000007,
        );
        let rhs = _mm512_set1_ps(1.);

        // No mask bits set: the result must equal `src` (here `lhs`).
        let passthrough = _mm512_mask_sub_round_ps::<NEAREST>(lhs, 0, lhs, rhs);
        assert_eq_m512(passthrough, lhs);

        // Upper eight mask bits set.
        let upper_half = _mm512_mask_sub_round_ps::<NEAREST>(lhs, 0b11111111_00000000, lhs, rhs);
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0., 1.5, 2., 3.5,
            4., 5.5, 6., 7.5,
            7., 8.5, 9., 10.5,
            11., 12.5, 13., -0.99999994,
        );
        assert_eq_m512(upper_half, want);
    }

    // Zero-masked rounded subtract: empty mask gives zeros; with only the upper eight mask
    // bits set, half of the lanes are zero and half hold the difference.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_sub_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;

        #[rustfmt::skip]
        let lhs = _mm512_setr_ps(
            0., 1.5, 2., 3.5,
            4., 5.5, 6., 7.5,
            8., 9.5, 10., 11.5,
            12., 13.5, 14., 0.00000007,
        );
        let rhs = _mm512_set1_ps(1.);

        // No mask bits set.
        let zeroed = _mm512_maskz_sub_round_ps::<NEAREST>(0, lhs, rhs);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        // Upper eight mask bits set.
        let upper_half = _mm512_maskz_sub_round_ps::<NEAREST>(0b11111111_00000000, lhs, rhs);
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0., 0., 0., 0.,
            0., 0., 0., 0.,
            7., 8.5, 9., 10.5,
            11., 12.5, 13., -0.99999994,
        );
        assert_eq_m512(upper_half, want);
    }

    // Multiplies a ramp of values by 0.1 under round-to-nearest and round-toward-zero and
    // checks the per-lane expectations recorded for each mode.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mul_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        const TOWARD_ZERO: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;

        #[rustfmt::skip]
        let lhs = _mm512_setr_ps(
            0., 1.5, 2., 3.5,
            4., 5.5, 6., 7.5,
            8., 9.5, 10., 11.5,
            12., 13.5, 14., 0.00000000000000000000007,
        );
        let rhs = _mm512_set1_ps(0.1);

        let nearest = _mm512_mul_round_ps::<NEAREST>(lhs, rhs);
        #[rustfmt::skip]
        let want_nearest = _mm512_setr_ps(
            0., 0.15, 0.2, 0.35,
            0.4, 0.55, 0.6, 0.75,
            0.8, 0.95, 1.0, 1.15,
            1.2, 1.35, 1.4, 0.000000000000000000000007000001,
        );
        assert_eq_m512(nearest, want_nearest);

        let truncated = _mm512_mul_round_ps::<TOWARD_ZERO>(lhs, rhs);
        #[rustfmt::skip]
        let want_truncated = _mm512_setr_ps(
            0., 0.14999999, 0.2, 0.35,
            0.4, 0.54999995, 0.59999996, 0.75,
            0.8, 0.95, 1.0, 1.15,
            1.1999999, 1.3499999, 1.4, 0.000000000000000000000007,
        );
        assert_eq_m512(truncated, want_truncated);
    }

    // Write-masked rounded multiply: empty mask returns `src`; with only the upper eight
    // mask bits set, half of the lanes keep their `src` value and half hold the product.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_mul_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;

        #[rustfmt::skip]
        let lhs = _mm512_setr_ps(
            0., 1.5, 2., 3.5,
            4., 5.5, 6., 7.5,
            8., 9.5, 10., 11.5,
            12., 13.5, 14., 0.00000000000000000000007,
        );
        let rhs = _mm512_set1_ps(0.1);

        // No mask bits set: the result must equal `src` (here `lhs`).
        let passthrough = _mm512_mask_mul_round_ps::<NEAREST>(lhs, 0, lhs, rhs);
        assert_eq_m512(passthrough, lhs);

        // Upper eight mask bits set.
        let upper_half = _mm512_mask_mul_round_ps::<NEAREST>(lhs, 0b11111111_00000000, lhs, rhs);
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0., 1.5, 2., 3.5,
            4., 5.5, 6., 7.5,
            0.8, 0.95, 1.0, 1.15,
            1.2, 1.35, 1.4, 0.000000000000000000000007000001,
        );
        assert_eq_m512(upper_half, want);
    }

    // Zero-masked rounded multiply: empty mask gives zeros; with only the upper eight mask
    // bits set, half of the lanes are zero and half hold the product.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_mul_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;

        #[rustfmt::skip]
        let lhs = _mm512_setr_ps(
            0., 1.5, 2., 3.5,
            4., 5.5, 6., 7.5,
            8., 9.5, 10., 11.5,
            12., 13.5, 14., 0.00000000000000000000007,
        );
        let rhs = _mm512_set1_ps(0.1);

        // No mask bits set.
        let zeroed = _mm512_maskz_mul_round_ps::<NEAREST>(0, lhs, rhs);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        // Upper eight mask bits set.
        let upper_half = _mm512_maskz_mul_round_ps::<NEAREST>(0b11111111_00000000, lhs, rhs);
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0., 0., 0., 0.,
            0., 0., 0., 0.,
            0.8, 0.95, 1.0, 1.15,
            1.2, 1.35, 1.4, 0.000000000000000000000007000001,
        );
        assert_eq_m512(upper_half, want);
    }

    // 1.0 / 3.0 is expected to be 0.33333334 under round-to-nearest and 0.3333333 under
    // round-toward-zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_div_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        const TOWARD_ZERO: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;

        let numer = _mm512_set1_ps(1.);
        let denom = _mm512_set1_ps(3.);

        let nearest = _mm512_div_round_ps::<NEAREST>(numer, denom);
        assert_eq_m512(nearest, _mm512_set1_ps(0.33333334));

        let truncated = _mm512_div_round_ps::<TOWARD_ZERO>(numer, denom);
        assert_eq_m512(truncated, _mm512_set1_ps(0.3333333));
    }

    // Write-masked rounded divide: empty mask returns `src`; with only the upper eight mask
    // bits set, half of the lanes keep 1.0 and half become 0.33333334.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_div_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;

        let numer = _mm512_set1_ps(1.);
        let denom = _mm512_set1_ps(3.);

        // No mask bits set: the result must equal `src` (here `numer`).
        let passthrough = _mm512_mask_div_round_ps::<NEAREST>(numer, 0, numer, denom);
        assert_eq_m512(passthrough, numer);

        // Upper eight mask bits set.
        let upper_half =
            _mm512_mask_div_round_ps::<NEAREST>(numer, 0b11111111_00000000, numer, denom);
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            1., 1., 1., 1.,
            1., 1., 1., 1.,
            0.33333334, 0.33333334, 0.33333334, 0.33333334,
            0.33333334, 0.33333334, 0.33333334, 0.33333334,
        );
        assert_eq_m512(upper_half, want);
    }

    // Zero-masked divide: mask 0 must give all zeros; mask 0xFF00 is expected to
    // zero the first eight `setr` lanes and put 1/3 in the last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_div_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let num = _mm512_set1_ps(1.);
        let den = _mm512_set1_ps(3.);

        let none = _mm512_maskz_div_round_ps::<NEAREST>(0, num, den);
        assert_eq_m512(none, _mm512_setzero_ps());

        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0., 0., 0., 0.,
            0., 0., 0., 0.,
            0.33333334, 0.33333334, 0.33333334, 0.33333334,
            0.33333334, 0.33333334, 0.33333334, 0.33333334,
        );
        let got = _mm512_maskz_div_round_ps::<NEAREST>(0xFF00, num, den);
        assert_eq_m512(got, want);
    }

    // Square root of 3 in every lane; expects 1.7320508 under round-to-nearest
    // and 1.7320509 under round-toward-positive-infinity.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_sqrt_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        const UPWARD: i32 = _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC;
        let input = _mm512_set1_ps(3.);

        let got = _mm512_sqrt_round_ps::<NEAREST>(input);
        assert_eq_m512(got, _mm512_set1_ps(1.7320508));

        let got = _mm512_sqrt_round_ps::<UPWARD>(input);
        assert_eq_m512(got, _mm512_set1_ps(1.7320509));
    }

    // Write-masked square root: mask 0 must return `src`; mask 0xFF00 is expected
    // to keep 3.0 in the first eight `setr` lanes and store sqrt(3) in the rest.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_sqrt_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let input = _mm512_set1_ps(3.);

        let untouched = _mm512_mask_sqrt_round_ps::<NEAREST>(input, 0, input);
        assert_eq_m512(untouched, input);

        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            3., 3., 3., 3.,
            3., 3., 3., 3.,
            1.7320508, 1.7320508, 1.7320508, 1.7320508,
            1.7320508, 1.7320508, 1.7320508, 1.7320508,
        );
        let got = _mm512_mask_sqrt_round_ps::<NEAREST>(input, 0xFF00, input);
        assert_eq_m512(got, want);
    }

    // Zero-masked square root: mask 0 must give all zeros; mask 0xFF00 is
    // expected to zero the first eight `setr` lanes and store sqrt(3) in the rest.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_sqrt_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let input = _mm512_set1_ps(3.);

        let none = _mm512_maskz_sqrt_round_ps::<NEAREST>(0, input);
        assert_eq_m512(none, _mm512_setzero_ps());

        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0., 0., 0., 0.,
            0., 0., 0., 0.,
            1.7320508, 1.7320508, 1.7320508, 1.7320508,
            1.7320508, 1.7320508, 1.7320508, 1.7320508,
        );
        let got = _mm512_maskz_sqrt_round_ps::<NEAREST>(0xFF00, input);
        assert_eq_m512(got, want);
    }

    // Fused multiply-add of 0.00000007 * 1 + (-1); expects -0.99999994 under
    // round-to-nearest and -0.9999999 under round-toward-zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_fmadd_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        const TOWARD_ZERO: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let x = _mm512_set1_ps(0.00000007);
        let y = _mm512_set1_ps(1.);
        let z = _mm512_set1_ps(-1.);

        let got = _mm512_fmadd_round_ps::<NEAREST>(x, y, z);
        assert_eq_m512(got, _mm512_set1_ps(-0.99999994));

        let got = _mm512_fmadd_round_ps::<TOWARD_ZERO>(x, y, z);
        assert_eq_m512(got, _mm512_set1_ps(-0.9999999));
    }

    // Write-masked fused multiply-add: mask 0 must return the first operand;
    // mask 0x00FF is expected to compute the first eight `setr` lanes and keep
    // the first operand (0.00000007) in the last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_fmadd_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let x = _mm512_set1_ps(0.00000007);
        let y = _mm512_set1_ps(1.);
        let z = _mm512_set1_ps(-1.);

        let untouched = _mm512_mask_fmadd_round_ps::<NEAREST>(x, 0, y, z);
        assert_eq_m512(untouched, x);

        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            -0.99999994, -0.99999994, -0.99999994, -0.99999994,
            -0.99999994, -0.99999994, -0.99999994, -0.99999994,
            0.00000007, 0.00000007, 0.00000007, 0.00000007,
            0.00000007, 0.00000007, 0.00000007, 0.00000007,
        );
        let got = _mm512_mask_fmadd_round_ps::<NEAREST>(x, 0x00FF, y, z);
        assert_eq_m512(got, want);
    }

    // Zero-masked fused multiply-add: mask 0 must give all zeros; mask 0x00FF is
    // expected to compute the first eight `setr` lanes and zero the last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_fmadd_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let x = _mm512_set1_ps(0.00000007);
        let y = _mm512_set1_ps(1.);
        let z = _mm512_set1_ps(-1.);

        let none = _mm512_maskz_fmadd_round_ps::<NEAREST>(0, x, y, z);
        assert_eq_m512(none, _mm512_setzero_ps());

        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            -0.99999994, -0.99999994, -0.99999994, -0.99999994,
            -0.99999994, -0.99999994, -0.99999994, -0.99999994,
            0., 0., 0., 0.,
            0., 0., 0., 0.,
        );
        let got = _mm512_maskz_fmadd_round_ps::<NEAREST>(0x00FF, x, y, z);
        assert_eq_m512(got, want);
    }

    // `mask3` fused multiply-add: mask 0 must return the third operand; mask
    // 0x00FF is expected to compute the first eight `setr` lanes and keep the
    // third operand (-1.0) in the last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask3_fmadd_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let x = _mm512_set1_ps(0.00000007);
        let y = _mm512_set1_ps(1.);
        let z = _mm512_set1_ps(-1.);

        let untouched = _mm512_mask3_fmadd_round_ps::<NEAREST>(x, y, z, 0);
        assert_eq_m512(untouched, z);

        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            -0.99999994, -0.99999994, -0.99999994, -0.99999994,
            -0.99999994, -0.99999994, -0.99999994, -0.99999994,
            -1., -1., -1., -1.,
            -1., -1., -1., -1.,
        );
        let got = _mm512_mask3_fmadd_round_ps::<NEAREST>(x, y, z, 0x00FF);
        assert_eq_m512(got, want);
    }

    // Fused multiply-subtract of 0.00000007 * 1 - 1; expects -0.99999994 under
    // round-to-nearest and -0.9999999 under round-toward-zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_fmsub_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        const TOWARD_ZERO: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let x = _mm512_set1_ps(0.00000007);
        let y = _mm512_set1_ps(1.);
        let z = _mm512_set1_ps(1.);

        let got = _mm512_fmsub_round_ps::<NEAREST>(x, y, z);
        assert_eq_m512(got, _mm512_set1_ps(-0.99999994));

        let got = _mm512_fmsub_round_ps::<TOWARD_ZERO>(x, y, z);
        assert_eq_m512(got, _mm512_set1_ps(-0.9999999));
    }

    // Write-masked fused multiply-subtract: mask 0 must return the first
    // operand; mask 0x00FF is expected to compute the first eight `setr` lanes
    // and keep the first operand (0.00000007) in the last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_fmsub_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let x = _mm512_set1_ps(0.00000007);
        let y = _mm512_set1_ps(1.);
        let z = _mm512_set1_ps(1.);

        let untouched = _mm512_mask_fmsub_round_ps::<NEAREST>(x, 0, y, z);
        assert_eq_m512(untouched, x);

        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            -0.99999994, -0.99999994, -0.99999994, -0.99999994,
            -0.99999994, -0.99999994, -0.99999994, -0.99999994,
            0.00000007, 0.00000007, 0.00000007, 0.00000007,
            0.00000007, 0.00000007, 0.00000007, 0.00000007,
        );
        let got = _mm512_mask_fmsub_round_ps::<NEAREST>(x, 0x00FF, y, z);
        assert_eq_m512(got, want);
    }

    // Zero-masked fused multiply-subtract: mask 0 must give all zeros; mask
    // 0x00FF is expected to compute the first eight `setr` lanes and zero the
    // last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_fmsub_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let x = _mm512_set1_ps(0.00000007);
        let y = _mm512_set1_ps(1.);
        let z = _mm512_set1_ps(1.);

        let none = _mm512_maskz_fmsub_round_ps::<NEAREST>(0, x, y, z);
        assert_eq_m512(none, _mm512_setzero_ps());

        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            -0.99999994, -0.99999994, -0.99999994, -0.99999994,
            -0.99999994, -0.99999994, -0.99999994, -0.99999994,
            0., 0., 0., 0.,
            0., 0., 0., 0.,
        );
        let got = _mm512_maskz_fmsub_round_ps::<NEAREST>(0x00FF, x, y, z);
        assert_eq_m512(got, want);
    }

    // `mask3` fused multiply-subtract: mask 0 must return the third operand;
    // mask 0x00FF is expected to compute the first eight `setr` lanes and keep
    // the third operand (1.0) in the last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask3_fmsub_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let x = _mm512_set1_ps(0.00000007);
        let y = _mm512_set1_ps(1.);
        let z = _mm512_set1_ps(1.);

        let untouched = _mm512_mask3_fmsub_round_ps::<NEAREST>(x, y, z, 0);
        assert_eq_m512(untouched, z);

        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            -0.99999994, -0.99999994, -0.99999994, -0.99999994,
            -0.99999994, -0.99999994, -0.99999994, -0.99999994,
            1., 1., 1., 1.,
            1., 1., 1., 1.,
        );
        let got = _mm512_mask3_fmsub_round_ps::<NEAREST>(x, y, z, 0x00FF);
        assert_eq_m512(got, want);
    }

    // Alternating multiply-add/subtract on 0.00000007 * 1 with -1: the expected
    // lanes alternate 1.0000001 / -0.99999994 under round-to-nearest and
    // 1.0 / -0.9999999 under round-toward-zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_fmaddsub_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        const TOWARD_ZERO: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let x = _mm512_set1_ps(0.00000007);
        let y = _mm512_set1_ps(1.);
        let z = _mm512_set1_ps(-1.);

        #[rustfmt::skip]
        let want_nearest = _mm512_setr_ps(
            1.0000001, -0.99999994, 1.0000001, -0.99999994,
            1.0000001, -0.99999994, 1.0000001, -0.99999994,
            1.0000001, -0.99999994, 1.0000001, -0.99999994,
            1.0000001, -0.99999994, 1.0000001, -0.99999994,
        );
        let got = _mm512_fmaddsub_round_ps::<NEAREST>(x, y, z);
        assert_eq_m512(got, want_nearest);

        #[rustfmt::skip]
        let want_truncated = _mm512_setr_ps(
            1., -0.9999999, 1., -0.9999999,
            1., -0.9999999, 1., -0.9999999,
            1., -0.9999999, 1., -0.9999999,
            1., -0.9999999, 1., -0.9999999,
        );
        let got = _mm512_fmaddsub_round_ps::<TOWARD_ZERO>(x, y, z);
        assert_eq_m512(got, want_truncated);
    }

    // Write-masked alternating multiply-add/subtract: mask 0 must return the
    // first operand; mask 0x00FF is expected to compute the first eight `setr`
    // lanes and keep the first operand (0.00000007) in the last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_fmaddsub_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let x = _mm512_set1_ps(0.00000007);
        let y = _mm512_set1_ps(1.);
        let z = _mm512_set1_ps(-1.);

        let untouched = _mm512_mask_fmaddsub_round_ps::<NEAREST>(x, 0, y, z);
        assert_eq_m512(untouched, x);

        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            1.0000001, -0.99999994, 1.0000001, -0.99999994,
            1.0000001, -0.99999994, 1.0000001, -0.99999994,
            0.00000007, 0.00000007, 0.00000007, 0.00000007,
            0.00000007, 0.00000007, 0.00000007, 0.00000007,
        );
        let got = _mm512_mask_fmaddsub_round_ps::<NEAREST>(x, 0x00FF, y, z);
        assert_eq_m512(got, want);
    }

    // Zero-masked alternating multiply-add/subtract: mask 0 must give all zeros;
    // mask 0x00FF is expected to compute the first eight `setr` lanes and zero
    // the last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_fmaddsub_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let x = _mm512_set1_ps(0.00000007);
        let y = _mm512_set1_ps(1.);
        let z = _mm512_set1_ps(-1.);

        let none = _mm512_maskz_fmaddsub_round_ps::<NEAREST>(0, x, y, z);
        assert_eq_m512(none, _mm512_setzero_ps());

        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            1.0000001, -0.99999994, 1.0000001, -0.99999994,
            1.0000001, -0.99999994, 1.0000001, -0.99999994,
            0., 0., 0., 0.,
            0., 0., 0., 0.,
        );
        let got = _mm512_maskz_fmaddsub_round_ps::<NEAREST>(0x00FF, x, y, z);
        assert_eq_m512(got, want);
    }

    // `mask3` alternating multiply-add/subtract: mask 0 must return the third
    // operand; mask 0x00FF is expected to compute the first eight `setr` lanes
    // and keep the third operand (-1.0) in the last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask3_fmaddsub_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let x = _mm512_set1_ps(0.00000007);
        let y = _mm512_set1_ps(1.);
        let z = _mm512_set1_ps(-1.);

        let untouched = _mm512_mask3_fmaddsub_round_ps::<NEAREST>(x, y, z, 0);
        assert_eq_m512(untouched, z);

        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            1.0000001, -0.99999994, 1.0000001, -0.99999994,
            1.0000001, -0.99999994, 1.0000001, -0.99999994,
            -1., -1., -1., -1.,
            -1., -1., -1., -1.,
        );
        let got = _mm512_mask3_fmaddsub_round_ps::<NEAREST>(x, y, z, 0x00FF);
        assert_eq_m512(got, want);
    }

    // Alternating multiply-subtract/add on 0.00000007 * 1 with -1: the expected
    // lanes alternate -0.99999994 / 1.0000001 under round-to-nearest and
    // -0.9999999 / 1.0 under round-toward-zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_fmsubadd_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        const TOWARD_ZERO: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let x = _mm512_set1_ps(0.00000007);
        let y = _mm512_set1_ps(1.);
        let z = _mm512_set1_ps(-1.);

        #[rustfmt::skip]
        let want_nearest = _mm512_setr_ps(
            -0.99999994, 1.0000001, -0.99999994, 1.0000001,
            -0.99999994, 1.0000001, -0.99999994, 1.0000001,
            -0.99999994, 1.0000001, -0.99999994, 1.0000001,
            -0.99999994, 1.0000001, -0.99999994, 1.0000001,
        );
        let got = _mm512_fmsubadd_round_ps::<NEAREST>(x, y, z);
        assert_eq_m512(got, want_nearest);

        #[rustfmt::skip]
        let want_truncated = _mm512_setr_ps(
            -0.9999999, 1., -0.9999999, 1.,
            -0.9999999, 1., -0.9999999, 1.,
            -0.9999999, 1., -0.9999999, 1.,
            -0.9999999, 1., -0.9999999, 1.,
        );
        let got = _mm512_fmsubadd_round_ps::<TOWARD_ZERO>(x, y, z);
        assert_eq_m512(got, want_truncated);
    }

    // Write-masked alternating multiply-subtract/add: mask 0 must return the
    // first operand; mask 0x00FF is expected to compute the first eight `setr`
    // lanes and keep the first operand (0.00000007) in the last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_fmsubadd_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let x = _mm512_set1_ps(0.00000007);
        let y = _mm512_set1_ps(1.);
        let z = _mm512_set1_ps(-1.);

        let untouched = _mm512_mask_fmsubadd_round_ps::<NEAREST>(x, 0, y, z);
        assert_eq_m512(untouched, x);

        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            -0.99999994, 1.0000001, -0.99999994, 1.0000001,
            -0.99999994, 1.0000001, -0.99999994, 1.0000001,
            0.00000007, 0.00000007, 0.00000007, 0.00000007,
            0.00000007, 0.00000007, 0.00000007, 0.00000007,
        );
        let got = _mm512_mask_fmsubadd_round_ps::<NEAREST>(x, 0x00FF, y, z);
        assert_eq_m512(got, want);
    }

    // Zero-masked alternating multiply-subtract/add: mask 0 must give all zeros;
    // mask 0x00FF is expected to compute the first eight `setr` lanes and zero
    // the last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_fmsubadd_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let x = _mm512_set1_ps(0.00000007);
        let y = _mm512_set1_ps(1.);
        let z = _mm512_set1_ps(-1.);

        let none = _mm512_maskz_fmsubadd_round_ps::<NEAREST>(0, x, y, z);
        assert_eq_m512(none, _mm512_setzero_ps());

        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            -0.99999994, 1.0000001, -0.99999994, 1.0000001,
            -0.99999994, 1.0000001, -0.99999994, 1.0000001,
            0., 0., 0., 0.,
            0., 0., 0., 0.,
        );
        let got = _mm512_maskz_fmsubadd_round_ps::<NEAREST>(0x00FF, x, y, z);
        assert_eq_m512(got, want);
    }

    // `mask3` alternating multiply-subtract/add: mask 0 must return the third
    // operand; mask 0x00FF is expected to compute the first eight `setr` lanes
    // and keep the third operand (-1.0) in the last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask3_fmsubadd_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let x = _mm512_set1_ps(0.00000007);
        let y = _mm512_set1_ps(1.);
        let z = _mm512_set1_ps(-1.);

        let untouched = _mm512_mask3_fmsubadd_round_ps::<NEAREST>(x, y, z, 0);
        assert_eq_m512(untouched, z);

        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            -0.99999994, 1.0000001, -0.99999994, 1.0000001,
            -0.99999994, 1.0000001, -0.99999994, 1.0000001,
            -1., -1., -1., -1.,
            -1., -1., -1., -1.,
        );
        let got = _mm512_mask3_fmsubadd_round_ps::<NEAREST>(x, y, z, 0x00FF);
        assert_eq_m512(got, want);
    }

    // Fused negated multiply-add with operands 0.00000007, 1 and 1; expects
    // 0.99999994 under round-to-nearest and 0.9999999 under round-toward-zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_fnmadd_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        const TOWARD_ZERO: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let x = _mm512_set1_ps(0.00000007);
        let y = _mm512_set1_ps(1.);
        let z = _mm512_set1_ps(1.);

        let got = _mm512_fnmadd_round_ps::<NEAREST>(x, y, z);
        assert_eq_m512(got, _mm512_set1_ps(0.99999994));

        let got = _mm512_fnmadd_round_ps::<TOWARD_ZERO>(x, y, z);
        assert_eq_m512(got, _mm512_set1_ps(0.9999999));
    }

    // Write-masked fused negated multiply-add: mask 0 must return the first
    // operand; mask 0x00FF is expected to compute the first eight `setr` lanes
    // and keep the first operand (0.00000007) in the last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_fnmadd_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let x = _mm512_set1_ps(0.00000007);
        let y = _mm512_set1_ps(1.);
        let z = _mm512_set1_ps(1.);

        let untouched = _mm512_mask_fnmadd_round_ps::<NEAREST>(x, 0, y, z);
        assert_eq_m512(untouched, x);

        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0.99999994, 0.99999994, 0.99999994, 0.99999994,
            0.99999994, 0.99999994, 0.99999994, 0.99999994,
            0.00000007, 0.00000007, 0.00000007, 0.00000007,
            0.00000007, 0.00000007, 0.00000007, 0.00000007,
        );
        let got = _mm512_mask_fnmadd_round_ps::<NEAREST>(x, 0x00FF, y, z);
        assert_eq_m512(got, want);
    }

    // Zero-masked fused negated multiply-add: mask 0 must give all zeros; mask
    // 0x00FF is expected to compute the first eight `setr` lanes and zero the
    // last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_fnmadd_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let x = _mm512_set1_ps(0.00000007);
        let y = _mm512_set1_ps(1.);
        let z = _mm512_set1_ps(1.);

        let none = _mm512_maskz_fnmadd_round_ps::<NEAREST>(0, x, y, z);
        assert_eq_m512(none, _mm512_setzero_ps());

        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0.99999994, 0.99999994, 0.99999994, 0.99999994,
            0.99999994, 0.99999994, 0.99999994, 0.99999994,
            0., 0., 0., 0.,
            0., 0., 0., 0.,
        );
        let got = _mm512_maskz_fnmadd_round_ps::<NEAREST>(0x00FF, x, y, z);
        assert_eq_m512(got, want);
    }

    // `mask3` fused negated multiply-add: mask 0 must return the third operand;
    // mask 0x00FF is expected to compute the first eight `setr` lanes and keep
    // the third operand (1.0) in the last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask3_fnmadd_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let x = _mm512_set1_ps(0.00000007);
        let y = _mm512_set1_ps(1.);
        let z = _mm512_set1_ps(1.);

        let untouched = _mm512_mask3_fnmadd_round_ps::<NEAREST>(x, y, z, 0);
        assert_eq_m512(untouched, z);

        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0.99999994, 0.99999994, 0.99999994, 0.99999994,
            0.99999994, 0.99999994, 0.99999994, 0.99999994,
            1., 1., 1., 1.,
            1., 1., 1., 1.,
        );
        let got = _mm512_mask3_fnmadd_round_ps::<NEAREST>(x, y, z, 0x00FF);
        assert_eq_m512(got, want);
    }

    // Fused negated multiply-subtract with operands 0.00000007, 1 and -1;
    // expects 0.99999994 under round-to-nearest and 0.9999999 under
    // round-toward-zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_fnmsub_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        const TOWARD_ZERO: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let x = _mm512_set1_ps(0.00000007);
        let y = _mm512_set1_ps(1.);
        let z = _mm512_set1_ps(-1.);

        let got = _mm512_fnmsub_round_ps::<NEAREST>(x, y, z);
        assert_eq_m512(got, _mm512_set1_ps(0.99999994));

        let got = _mm512_fnmsub_round_ps::<TOWARD_ZERO>(x, y, z);
        assert_eq_m512(got, _mm512_set1_ps(0.9999999));
    }

    // Write-masked fused negated multiply-subtract: mask 0 must return the first
    // operand; mask 0x00FF is expected to compute the first eight `setr` lanes
    // and keep the first operand (0.00000007) in the last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_fnmsub_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let x = _mm512_set1_ps(0.00000007);
        let y = _mm512_set1_ps(1.);
        let z = _mm512_set1_ps(-1.);

        let untouched = _mm512_mask_fnmsub_round_ps::<NEAREST>(x, 0, y, z);
        assert_eq_m512(untouched, x);

        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0.99999994, 0.99999994, 0.99999994, 0.99999994,
            0.99999994, 0.99999994, 0.99999994, 0.99999994,
            0.00000007, 0.00000007, 0.00000007, 0.00000007,
            0.00000007, 0.00000007, 0.00000007, 0.00000007,
        );
        let got = _mm512_mask_fnmsub_round_ps::<NEAREST>(x, 0x00FF, y, z);
        assert_eq_m512(got, want);
    }

    // Zero-masked fused negated multiply-subtract: mask 0 must give all zeros;
    // mask 0x00FF is expected to compute the first eight `setr` lanes and zero
    // the last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_fnmsub_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let x = _mm512_set1_ps(0.00000007);
        let y = _mm512_set1_ps(1.);
        let z = _mm512_set1_ps(-1.);

        let none = _mm512_maskz_fnmsub_round_ps::<NEAREST>(0, x, y, z);
        assert_eq_m512(none, _mm512_setzero_ps());

        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0.99999994, 0.99999994, 0.99999994, 0.99999994,
            0.99999994, 0.99999994, 0.99999994, 0.99999994,
            0., 0., 0., 0.,
            0., 0., 0., 0.,
        );
        let got = _mm512_maskz_fnmsub_round_ps::<NEAREST>(0x00FF, x, y, z);
        assert_eq_m512(got, want);
    }

    // `mask3` fused negated multiply-subtract: mask 0 must return the third
    // operand; mask 0x00FF is expected to compute the first eight `setr` lanes
    // and keep the third operand (-1.0) in the last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask3_fnmsub_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let x = _mm512_set1_ps(0.00000007);
        let y = _mm512_set1_ps(1.);
        let z = _mm512_set1_ps(-1.);

        let untouched = _mm512_mask3_fnmsub_round_ps::<NEAREST>(x, y, z, 0);
        assert_eq_m512(untouched, z);

        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0.99999994, 0.99999994, 0.99999994, 0.99999994,
            0.99999994, 0.99999994, 0.99999994, 0.99999994,
            -1., -1., -1., -1.,
            -1., -1., -1., -1.,
        );
        let got = _mm512_mask3_fnmsub_round_ps::<NEAREST>(x, y, z, 0x00FF);
        assert_eq_m512(got, want);
    }

    // Lane-wise maximum of an ascending ramp (0..=15) and a descending ramp
    // (15..=0); every lane is expected to hold the larger of the two inputs.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_max_round_ps() {
        let ascending = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let descending = _mm512_setr_ps(
            15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
        );
        let want = _mm512_setr_ps(
            15., 14., 13., 12., 11., 10., 9., 8., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let got = _mm512_max_round_ps::<_MM_FROUND_CUR_DIRECTION>(ascending, descending);
        assert_eq_m512(got, want);
    }

    // Write-masked lane-wise maximum: mask 0 must return `src`; mask 0x00FF is
    // expected to compute the first eight `setr` lanes and keep `src` (the
    // ascending ramp) in the last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_max_round_ps() {
        let ascending = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let descending = _mm512_setr_ps(
            15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
        );

        let untouched =
            _mm512_mask_max_round_ps::<_MM_FROUND_CUR_DIRECTION>(ascending, 0, ascending, descending);
        assert_eq_m512(untouched, ascending);

        let want = _mm512_setr_ps(
            15., 14., 13., 12., 11., 10., 9., 8., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let got = _mm512_mask_max_round_ps::<_MM_FROUND_CUR_DIRECTION>(
            ascending, 0x00FF, ascending, descending,
        );
        assert_eq_m512(got, want);
    }

    // Zero-masked lane-wise maximum: mask 0 must give all zeros; mask 0x00FF is
    // expected to compute the first eight `setr` lanes and zero the last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_max_round_ps() {
        let ascending = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let descending = _mm512_setr_ps(
            15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
        );

        let none = _mm512_maskz_max_round_ps::<_MM_FROUND_CUR_DIRECTION>(0, ascending, descending);
        assert_eq_m512(none, _mm512_setzero_ps());

        let want = _mm512_setr_ps(
            15., 14., 13., 12., 11., 10., 9., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        let got =
            _mm512_maskz_max_round_ps::<_MM_FROUND_CUR_DIRECTION>(0x00FF, ascending, descending);
        assert_eq_m512(got, want);
    }

    // Lane-wise minimum of an ascending ramp (0..=15) and a descending ramp
    // (15..=0); every lane is expected to hold the smaller of the two inputs.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_min_round_ps() {
        let ascending = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let descending = _mm512_setr_ps(
            15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
        );
        let want = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 7., 6., 5., 4., 3., 2., 1., 0.,
        );
        let got = _mm512_min_round_ps::<_MM_FROUND_CUR_DIRECTION>(ascending, descending);
        assert_eq_m512(got, want);
    }

    // Write-masked lane-wise minimum: mask 0 must return `src`; with mask 0x00FF
    // the expected vector equals the ascending ramp, since the computed minimum
    // in the first eight `setr` lanes and the preserved `src` in the last eight
    // both come from it.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_min_round_ps() {
        let ascending = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let descending = _mm512_setr_ps(
            15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
        );

        let untouched =
            _mm512_mask_min_round_ps::<_MM_FROUND_CUR_DIRECTION>(ascending, 0, ascending, descending);
        assert_eq_m512(untouched, ascending);

        let want = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let got = _mm512_mask_min_round_ps::<_MM_FROUND_CUR_DIRECTION>(
            ascending, 0x00FF, ascending, descending,
        );
        assert_eq_m512(got, want);
    }

    // Zero-masked lane-wise minimum: mask 0 must give all zeros; mask 0x00FF is
    // expected to compute the first eight `setr` lanes and zero the last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_min_round_ps() {
        let ascending = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let descending = _mm512_setr_ps(
            15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
        );

        let none = _mm512_maskz_min_round_ps::<_MM_FROUND_CUR_DIRECTION>(0, ascending, descending);
        assert_eq_m512(none, _mm512_setzero_ps());

        let want = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        let got =
            _mm512_maskz_min_round_ps::<_MM_FROUND_CUR_DIRECTION>(0x00FF, ascending, descending);
        assert_eq_m512(got, want);
    }

    // `getexp` of 3.0 in every lane is expected to yield 1.0 in every lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_getexp_round_ps() {
        let input = _mm512_set1_ps(3.);
        let got = _mm512_getexp_round_ps::<_MM_FROUND_CUR_DIRECTION>(input);
        assert_eq_m512(got, _mm512_set1_ps(1.));
    }

    // Write-masked `getexp`: mask 0 must return `src`; mask 0xFF00 is expected to
    // keep 3.0 in the first eight `setr` lanes and store 1.0 in the last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_getexp_round_ps() {
        let input = _mm512_set1_ps(3.);

        let untouched = _mm512_mask_getexp_round_ps::<_MM_FROUND_CUR_DIRECTION>(input, 0, input);
        assert_eq_m512(untouched, input);

        let want = _mm512_setr_ps(
            3., 3., 3., 3., 3., 3., 3., 3., 1., 1., 1., 1., 1., 1., 1., 1.,
        );
        let got = _mm512_mask_getexp_round_ps::<_MM_FROUND_CUR_DIRECTION>(input, 0xFF00, input);
        assert_eq_m512(got, want);
    }

    // Zero-masked `getexp`: mask 0 must give all zeros; mask 0xFF00 is expected
    // to zero the first eight `setr` lanes and store 1.0 in the last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_getexp_round_ps() {
        let input = _mm512_set1_ps(3.);

        let none = _mm512_maskz_getexp_round_ps::<_MM_FROUND_CUR_DIRECTION>(0, input);
        assert_eq_m512(none, _mm512_setzero_ps());

        let want = _mm512_setr_ps(
            0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
        );
        let got = _mm512_maskz_getexp_round_ps::<_MM_FROUND_CUR_DIRECTION>(0xFF00, input);
        assert_eq_m512(got, want);
    }

    // `roundscale` with immediate 0 applied to 1.1 is expected to yield 1.0 in
    // every lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_roundscale_round_ps() {
        let input = _mm512_set1_ps(1.1);
        let got = _mm512_roundscale_round_ps::<0, _MM_FROUND_CUR_DIRECTION>(input);
        assert_eq_m512(got, _mm512_set1_ps(1.0));
    }

    // Write-masked `roundscale`: mask 0 is expected to leave every lane at 1.1,
    // and the all-ones mask 0xFFFF is expected to produce 1.0 in every lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_roundscale_round_ps() {
        let input = _mm512_set1_ps(1.1);

        let untouched =
            _mm512_mask_roundscale_round_ps::<0, _MM_FROUND_CUR_DIRECTION>(input, 0, input);
        assert_eq_m512(untouched, _mm512_set1_ps(1.1));

        let got =
            _mm512_mask_roundscale_round_ps::<0, _MM_FROUND_CUR_DIRECTION>(input, 0xFFFF, input);
        assert_eq_m512(got, _mm512_set1_ps(1.0));
    }

    // Zero-masked `roundscale`: mask 0 must give all zeros, and the all-ones
    // mask 0xFFFF is expected to produce 1.0 in every lane for input 1.1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_roundscale_round_ps() {
        let input = _mm512_set1_ps(1.1);

        let none = _mm512_maskz_roundscale_round_ps::<0, _MM_FROUND_CUR_DIRECTION>(0, input);
        assert_eq_m512(none, _mm512_setzero_ps());

        let got = _mm512_maskz_roundscale_round_ps::<0, _MM_FROUND_CUR_DIRECTION>(0xFFFF, input);
        assert_eq_m512(got, _mm512_set1_ps(1.0));
    }

    // `scalef` of 1.0 by 3.0 with round-to-nearest is expected to yield 8.0 in
    // every lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_scalef_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let base = _mm512_set1_ps(1.);
        let scale = _mm512_set1_ps(3.);
        let got = _mm512_scalef_round_ps::<NEAREST>(base, scale);
        assert_eq_m512(got, _mm512_set1_ps(8.));
    }

    // Write-masked `scalef`: mask 0 must return `src`; with mask 0xFF00 the
    // expected vector, written with `_mm512_set_ps`, lists 8.0 for its first
    // eight arguments and the preserved 1.0 for its last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_scalef_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let base = _mm512_set1_ps(1.);
        let scale = _mm512_set1_ps(3.);

        let untouched = _mm512_mask_scalef_round_ps::<NEAREST>(base, 0, base, scale);
        assert_eq_m512(untouched, base);

        let want = _mm512_set_ps(
            8., 8., 8., 8., 8., 8., 8., 8., 1., 1., 1., 1., 1., 1., 1., 1.,
        );
        let got = _mm512_mask_scalef_round_ps::<NEAREST>(base, 0xFF00, base, scale);
        assert_eq_m512(got, want);
    }

    // Zero-masked `scalef`: mask 0 must give all zeros; with mask 0xFF00 the
    // expected vector, written with `_mm512_set_ps`, lists 8.0 for its first
    // eight arguments and 0.0 for its last eight.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_scalef_round_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let base = _mm512_set1_ps(1.);
        let scale = _mm512_set1_ps(3.);

        let none = _mm512_maskz_scalef_round_ps::<NEAREST>(0, base, scale);
        assert_eq_m512(none, _mm512_setzero_ps());

        let want = _mm512_set_ps(
            8., 8., 8., 8., 8., 8., 8., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        let got = _mm512_maskz_scalef_round_ps::<NEAREST>(0xFF00, base, scale);
        assert_eq_m512(got, want);
    }

    // `fixupimm` with immediate 5 on NaN inputs, `f32::MAX` as the second operand
    // and an `i32::MAX` table is expected to produce 0.0 in every lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_fixupimm_round_ps() {
        let nans = _mm512_set1_ps(f32::NAN);
        let maxes = _mm512_set1_ps(f32::MAX);
        let table = _mm512_set1_epi32(i32::MAX);
        let got = _mm512_fixupimm_round_ps::<5, _MM_FROUND_CUR_DIRECTION>(nans, maxes, table);
        assert_eq_m512(got, _mm512_set1_ps(0.0));
    }

    // Write-masked `fixupimm`: the input, built with `_mm512_set_ps`, lists NaN
    // for its first eight arguments and 1.0 for its last eight. With mask 0xFF00
    // the expected vector lists 0.0 in place of the NaNs and keeps the 1.0s.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_fixupimm_round_ps() {
        #[rustfmt::skip]
        let input = _mm512_set_ps(
            f32::NAN, f32::NAN, f32::NAN, f32::NAN,
            f32::NAN, f32::NAN, f32::NAN, f32::NAN,
            1., 1., 1., 1.,
            1., 1., 1., 1.,
        );
        let maxes = _mm512_set1_ps(f32::MAX);
        let table = _mm512_set1_epi32(i32::MAX);

        let want = _mm512_set_ps(
            0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
        );
        let got = _mm512_mask_fixupimm_round_ps::<5, _MM_FROUND_CUR_DIRECTION>(
            input, 0xFF00, maxes, table,
        );
        assert_eq_m512(got, want);
    }

    // Zero-masked `fixupimm`: the input, built with `_mm512_set_ps`, lists NaN
    // for its first eight arguments and 1.0 for its last eight. With mask 0xFF00
    // every lane of the result is expected to be 0.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_fixupimm_round_ps() {
        #[rustfmt::skip]
        let input = _mm512_set_ps(
            f32::NAN, f32::NAN, f32::NAN, f32::NAN,
            f32::NAN, f32::NAN, f32::NAN, f32::NAN,
            1., 1., 1., 1.,
            1., 1., 1., 1.,
        );
        let maxes = _mm512_set1_ps(f32::MAX);
        let table = _mm512_set1_epi32(i32::MAX);

        let want = _mm512_set_ps(
            0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        let got = _mm512_maskz_fixupimm_round_ps::<5, _MM_FROUND_CUR_DIRECTION>(
            0xFF00, input, maxes, table,
        );
        assert_eq_m512(got, want);
    }

    // `getmant` with the [1, 2) normalisation and source sign: 10.0 in every
    // lane is expected to give 1.25 in every lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_getmant_round_ps() {
        let input = _mm512_set1_ps(10.);
        let got = _mm512_getmant_round_ps::<
            _MM_MANT_NORM_1_2,
            _MM_MANT_SIGN_SRC,
            _MM_FROUND_CUR_DIRECTION,
        >(input);
        let want = _mm512_set1_ps(1.25);
        assert_eq_m512(got, want);
    }

    // Write-masked `getmant`: a zero mask must hand back the source operand
    // unchanged; with mask 0b11111111_00000000 lanes 8..=15 are expected to
    // hold 1.25 while lanes 0..=7 keep the source value 10.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_getmant_round_ps() {
        let input = _mm512_set1_ps(10.);
        let untouched = _mm512_mask_getmant_round_ps::<
            _MM_MANT_NORM_1_2,
            _MM_MANT_SIGN_SRC,
            _MM_FROUND_CUR_DIRECTION,
        >(input, 0, input);
        assert_eq_m512(untouched, input);
        let got = _mm512_mask_getmant_round_ps::<
            _MM_MANT_NORM_1_2,
            _MM_MANT_SIGN_SRC,
            _MM_FROUND_CUR_DIRECTION,
        >(input, 0b11111111_00000000, input);
        let want = _mm512_setr_ps(
            10., 10., 10., 10., 10., 10., 10., 10., 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25,
        );
        assert_eq_m512(got, want);
    }

    // Zero-masked `getmant`: a zero mask must give all zeros; with mask
    // 0b11111111_00000000 lanes 8..=15 are expected to hold 1.25 and lanes
    // 0..=7 are expected to be 0.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_getmant_round_ps() {
        let input = _mm512_set1_ps(10.);
        let cleared = _mm512_maskz_getmant_round_ps::<
            _MM_MANT_NORM_1_2,
            _MM_MANT_SIGN_SRC,
            _MM_FROUND_CUR_DIRECTION,
        >(0, input);
        assert_eq_m512(cleared, _mm512_setzero_ps());
        let got = _mm512_maskz_getmant_round_ps::<
            _MM_MANT_NORM_1_2,
            _MM_MANT_SIGN_SRC,
            _MM_FROUND_CUR_DIRECTION,
        >(0b11111111_00000000, input);
        let want = _mm512_setr_ps(
            0., 0., 0., 0., 0., 0., 0., 0., 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25, 1.25,
        );
        assert_eq_m512(got, want);
    }

    // f32 -> i32 conversion of sixteen lanes. The expected values show halfway
    // cases landing on the even neighbour (-3.5 -> -4, 9.5 -> 10, 11.5 -> 12).
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvtps_epi32() {
        let input = _mm512_setr_ps(
            0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
        );
        let got = _mm512_cvtps_epi32(input);
        let want = _mm512_setr_epi32(0, -1, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16);
        assert_eq_m512i(got, want);
    }

    // Write-masked f32 -> i32 conversion: a zero mask must return `fallback`
    // unchanged; with mask 0b00000000_11111111 lanes 0..=7 are converted and
    // lanes 8..=15 keep the (zero) fallback value.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvtps_epi32() {
        let input = _mm512_setr_ps(
            0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
        );
        let fallback = _mm512_set1_epi32(0);
        let untouched = _mm512_mask_cvtps_epi32(fallback, 0, input);
        assert_eq_m512i(untouched, fallback);
        let got = _mm512_mask_cvtps_epi32(fallback, 0b00000000_11111111, input);
        let want = _mm512_setr_epi32(0, -1, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m512i(got, want);
    }

    // Zero-masked f32 -> i32 conversion: a zero mask must give all zeros; with
    // mask 0b00000000_11111111 lanes 0..=7 are converted and lanes 8..=15 are
    // zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvtps_epi32() {
        let input = _mm512_setr_ps(
            0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
        );
        let cleared = _mm512_maskz_cvtps_epi32(0, input);
        assert_eq_m512i(cleared, _mm512_setzero_si512());
        let got = _mm512_maskz_cvtps_epi32(0b00000000_11111111, input);
        let want = _mm512_setr_epi32(0, -1, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m512i(got, want);
    }

    // 256-bit write-masked f32 -> i32 conversion: a zero mask must return
    // `fallback`; an all-ones mask must convert all eight lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cvtps_epi32() {
        let input = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5);
        let fallback = _mm256_set1_epi32(0);
        let untouched = _mm256_mask_cvtps_epi32(fallback, 0, input);
        assert_eq_m256i(untouched, fallback);
        let got = _mm256_mask_cvtps_epi32(fallback, 0b11111111, input);
        let want = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16);
        assert_eq_m256i(got, want);
    }

    // 256-bit zero-masked f32 -> i32 conversion: a zero mask must give all
    // zeros; an all-ones mask must convert all eight lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_cvtps_epi32() {
        let input = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5);
        let cleared = _mm256_maskz_cvtps_epi32(0, input);
        assert_eq_m256i(cleared, _mm256_setzero_si256());
        let got = _mm256_maskz_cvtps_epi32(0b11111111, input);
        let want = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16);
        assert_eq_m256i(got, want);
    }

    // 128-bit write-masked f32 -> i32 conversion: a zero mask must return
    // `fallback`; mask 0b00001111 must convert all four lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cvtps_epi32() {
        let input = _mm_set_ps(12., 13.5, 14., 15.5);
        let fallback = _mm_set1_epi32(0);
        let untouched = _mm_mask_cvtps_epi32(fallback, 0, input);
        assert_eq_m128i(untouched, fallback);
        let got = _mm_mask_cvtps_epi32(fallback, 0b00001111, input);
        let want = _mm_set_epi32(12, 14, 14, 16);
        assert_eq_m128i(got, want);
    }

    // 128-bit zero-masked f32 -> i32 conversion: a zero mask must give all
    // zeros; mask 0b00001111 must convert all four lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_cvtps_epi32() {
        let input = _mm_set_ps(12., 13.5, 14., 15.5);
        let cleared = _mm_maskz_cvtps_epi32(0, input);
        assert_eq_m128i(cleared, _mm_setzero_si128());
        let got = _mm_maskz_cvtps_epi32(0b00001111, input);
        let want = _mm_set_epi32(12, 14, 14, 16);
        assert_eq_m128i(got, want);
    }

    // f32 -> u32 conversion of sixteen lanes. Every negative input is expected
    // to come back as -1 when the result is read as i32 (all bits set).
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvtps_epu32() {
        let input = _mm512_setr_ps(
            0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
        );
        let got = _mm512_cvtps_epu32(input);
        let want = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 10, 10, 12, 12, 14, 14, 16);
        assert_eq_m512i(got, want);
    }

    // Write-masked f32 -> u32 conversion: a zero mask must return `fallback`;
    // with mask 0b00000000_11111111 lanes 0..=7 are converted and lanes 8..=15
    // keep the (zero) fallback value.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvtps_epu32() {
        let input = _mm512_setr_ps(
            0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
        );
        let fallback = _mm512_set1_epi32(0);
        let untouched = _mm512_mask_cvtps_epu32(fallback, 0, input);
        assert_eq_m512i(untouched, fallback);
        let got = _mm512_mask_cvtps_epu32(fallback, 0b00000000_11111111, input);
        let want = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m512i(got, want);
    }

    // Zero-masked f32 -> u32 conversion: a zero mask must give all zeros; with
    // mask 0b00000000_11111111 lanes 0..=7 are converted and lanes 8..=15 are
    // zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvtps_epu32() {
        let input = _mm512_setr_ps(
            0., -1.4, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
        );
        let cleared = _mm512_maskz_cvtps_epu32(0, input);
        assert_eq_m512i(cleared, _mm512_setzero_si512());
        let got = _mm512_maskz_cvtps_epu32(0b00000000_11111111, input);
        let want = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m512i(got, want);
    }

    // 256-bit f32 -> u32 conversion of eight non-negative lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_cvtps_epu32() {
        let input = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5);
        let got = _mm256_cvtps_epu32(input);
        let want = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16);
        assert_eq_m256i(got, want);
    }

    // 256-bit write-masked f32 -> u32 conversion: a zero mask must return
    // `fallback`; an all-ones mask must convert all eight lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cvtps_epu32() {
        let input = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5);
        let fallback = _mm256_set1_epi32(0);
        let untouched = _mm256_mask_cvtps_epu32(fallback, 0, input);
        assert_eq_m256i(untouched, fallback);
        let got = _mm256_mask_cvtps_epu32(fallback, 0b11111111, input);
        let want = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16);
        assert_eq_m256i(got, want);
    }

    // 256-bit zero-masked f32 -> u32 conversion: a zero mask must give all
    // zeros; an all-ones mask must convert all eight lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_cvtps_epu32() {
        let input = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5);
        let cleared = _mm256_maskz_cvtps_epu32(0, input);
        assert_eq_m256i(cleared, _mm256_setzero_si256());
        let got = _mm256_maskz_cvtps_epu32(0b11111111, input);
        let want = _mm256_set_epi32(8, 10, 10, 12, 12, 14, 14, 16);
        assert_eq_m256i(got, want);
    }

    // 128-bit f32 -> u32 conversion of four non-negative lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_cvtps_epu32() {
        let input = _mm_set_ps(12., 13.5, 14., 15.5);
        let got = _mm_cvtps_epu32(input);
        let want = _mm_set_epi32(12, 14, 14, 16);
        assert_eq_m128i(got, want);
    }

    // 128-bit write-masked f32 -> u32 conversion: a zero mask must return
    // `fallback`; mask 0b00001111 must convert all four lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cvtps_epu32() {
        let input = _mm_set_ps(12., 13.5, 14., 15.5);
        let fallback = _mm_set1_epi32(0);
        let untouched = _mm_mask_cvtps_epu32(fallback, 0, input);
        assert_eq_m128i(untouched, fallback);
        let got = _mm_mask_cvtps_epu32(fallback, 0b00001111, input);
        let want = _mm_set_epi32(12, 14, 14, 16);
        assert_eq_m128i(got, want);
    }

    // 128-bit zero-masked f32 -> u32 conversion: a zero mask must give all
    // zeros; mask 0b00001111 must convert all four lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_cvtps_epu32() {
        let input = _mm_set_ps(12., 13.5, 14., 15.5);
        let cleared = _mm_maskz_cvtps_epu32(0, input);
        assert_eq_m128i(cleared, _mm_setzero_si128());
        let got = _mm_maskz_cvtps_epu32(0b00001111, input);
        let want = _mm_set_epi32(12, 14, 14, 16);
        assert_eq_m128i(got, want);
    }

    // Widening i8 -> i32: sixteen bytes 0..=15 must come back as the same
    // values in sixteen 32-bit lanes.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvtepi8_epi32() {
        let input = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let got = _mm512_cvtepi8_epi32(input);
        let want = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m512i(got, want);
    }

    // Write-masked i8 -> i32 widening: a zero mask must return `fallback`;
    // with mask 0b00000000_11111111 the selected lanes hold widened bytes and
    // the other lanes keep the fallback value -1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvtepi8_epi32() {
        let input = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let fallback = _mm512_set1_epi32(-1);
        let untouched = _mm512_mask_cvtepi8_epi32(fallback, 0, input);
        assert_eq_m512i(untouched, fallback);
        let got = _mm512_mask_cvtepi8_epi32(fallback, 0b00000000_11111111, input);
        let want = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m512i(got, want);
    }

    // Zero-masked i8 -> i32 widening: a zero mask must give all zeros; with
    // mask 0b00000000_11111111 the selected lanes hold widened bytes and the
    // other lanes are zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvtepi8_epi32() {
        let input = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let cleared = _mm512_maskz_cvtepi8_epi32(0, input);
        assert_eq_m512i(cleared, _mm512_setzero_si512());
        let got = _mm512_maskz_cvtepi8_epi32(0b00000000_11111111, input);
        let want = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m512i(got, want);
    }

    // 256-bit write-masked i8 -> i32 widening: a zero mask must return
    // `fallback`; an all-ones mask is expected to yield 8..=15 in the eight
    // result lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cvtepi8_epi32() {
        let input = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let fallback = _mm256_set1_epi32(-1);
        let untouched = _mm256_mask_cvtepi8_epi32(fallback, 0, input);
        assert_eq_m256i(untouched, fallback);
        let got = _mm256_mask_cvtepi8_epi32(fallback, 0b11111111, input);
        let want = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m256i(got, want);
    }

    // 256-bit zero-masked i8 -> i32 widening: a zero mask must give all zeros;
    // an all-ones mask is expected to yield 8..=15 in the eight result lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_cvtepi8_epi32() {
        let input = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let cleared = _mm256_maskz_cvtepi8_epi32(0, input);
        assert_eq_m256i(cleared, _mm256_setzero_si256());
        let got = _mm256_maskz_cvtepi8_epi32(0b11111111, input);
        let want = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m256i(got, want);
    }

    // 128-bit write-masked i8 -> i32 widening: a zero mask must return
    // `fallback`; mask 0b00001111 is expected to yield 12..=15 in the four
    // result lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cvtepi8_epi32() {
        let input = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let fallback = _mm_set1_epi32(-1);
        let untouched = _mm_mask_cvtepi8_epi32(fallback, 0, input);
        assert_eq_m128i(untouched, fallback);
        let got = _mm_mask_cvtepi8_epi32(fallback, 0b00001111, input);
        let want = _mm_set_epi32(12, 13, 14, 15);
        assert_eq_m128i(got, want);
    }

    // 128-bit zero-masked i8 -> i32 widening: a zero mask must give all zeros;
    // mask 0b00001111 is expected to yield 12..=15 in the four result lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_cvtepi8_epi32() {
        let input = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let cleared = _mm_maskz_cvtepi8_epi32(0, input);
        assert_eq_m128i(cleared, _mm_setzero_si128());
        let got = _mm_maskz_cvtepi8_epi32(0b00001111, input);
        let want = _mm_set_epi32(12, 13, 14, 15);
        assert_eq_m128i(got, want);
    }

    // Widening u8 -> i32 must zero-extend every byte. The first argument is
    // -1 (byte 0xFF) so that an accidental sign extension, which would yield
    // -1 instead of 255 in that lane, makes the test fail; with only
    // non-negative bytes the two extensions are indistinguishable.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvtepu8_epi32() {
        let input = _mm_set_epi8(-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let got = _mm512_cvtepu8_epi32(input);
        let want = _mm512_set_epi32(255, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m512i(got, want);
    }

    // Write-masked u8 -> i32 widening: a zero mask must return `fallback`;
    // with mask 0b00000000_11111111 the selected lanes hold widened bytes and
    // the other lanes keep the fallback value -1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvtepu8_epi32() {
        let input = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let fallback = _mm512_set1_epi32(-1);
        let untouched = _mm512_mask_cvtepu8_epi32(fallback, 0, input);
        assert_eq_m512i(untouched, fallback);
        let got = _mm512_mask_cvtepu8_epi32(fallback, 0b00000000_11111111, input);
        let want = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m512i(got, want);
    }

    // Zero-masked u8 -> i32 widening: a zero mask must give all zeros; with
    // mask 0b00000000_11111111 the selected lanes hold widened bytes and the
    // other lanes are zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvtepu8_epi32() {
        let input = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let cleared = _mm512_maskz_cvtepu8_epi32(0, input);
        assert_eq_m512i(cleared, _mm512_setzero_si512());
        let got = _mm512_maskz_cvtepu8_epi32(0b00000000_11111111, input);
        let want = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m512i(got, want);
    }

    // 256-bit write-masked u8 -> i32 widening: a zero mask must return
    // `fallback`; an all-ones mask is expected to yield 8..=15 in the eight
    // result lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cvtepu8_epi32() {
        let input = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let fallback = _mm256_set1_epi32(-1);
        let untouched = _mm256_mask_cvtepu8_epi32(fallback, 0, input);
        assert_eq_m256i(untouched, fallback);
        let got = _mm256_mask_cvtepu8_epi32(fallback, 0b11111111, input);
        let want = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m256i(got, want);
    }

    // 256-bit zero-masked u8 -> i32 widening: a zero mask must give all zeros;
    // an all-ones mask is expected to yield 8..=15 in the eight result lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_cvtepu8_epi32() {
        let input = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let cleared = _mm256_maskz_cvtepu8_epi32(0, input);
        assert_eq_m256i(cleared, _mm256_setzero_si256());
        let got = _mm256_maskz_cvtepu8_epi32(0b11111111, input);
        let want = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m256i(got, want);
    }

    // 128-bit write-masked u8 -> i32 widening: a zero mask must return
    // `fallback`; mask 0b00001111 is expected to yield 12..=15 in the four
    // result lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cvtepu8_epi32() {
        let input = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let fallback = _mm_set1_epi32(-1);
        let untouched = _mm_mask_cvtepu8_epi32(fallback, 0, input);
        assert_eq_m128i(untouched, fallback);
        let got = _mm_mask_cvtepu8_epi32(fallback, 0b00001111, input);
        let want = _mm_set_epi32(12, 13, 14, 15);
        assert_eq_m128i(got, want);
    }

    // 128-bit zero-masked u8 -> i32 widening: a zero mask must give all zeros;
    // mask 0b00001111 is expected to yield 12..=15 in the four result lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_cvtepu8_epi32() {
        let input = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let cleared = _mm_maskz_cvtepu8_epi32(0, input);
        assert_eq_m128i(cleared, _mm_setzero_si128());
        let got = _mm_maskz_cvtepu8_epi32(0b00001111, input);
        let want = _mm_set_epi32(12, 13, 14, 15);
        assert_eq_m128i(got, want);
    }

    // Widening i16 -> i32: sixteen words 0..=15 must come back as the same
    // values in sixteen 32-bit lanes.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvtepi16_epi32() {
        let input = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let got = _mm512_cvtepi16_epi32(input);
        let want = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m512i(got, want);
    }

    // Write-masked i16 -> i32 widening: a zero mask must return `fallback`;
    // with mask 0b00000000_11111111 the selected lanes hold widened words and
    // the other lanes keep the fallback value -1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvtepi16_epi32() {
        let input = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let fallback = _mm512_set1_epi32(-1);
        let untouched = _mm512_mask_cvtepi16_epi32(fallback, 0, input);
        assert_eq_m512i(untouched, fallback);
        let got = _mm512_mask_cvtepi16_epi32(fallback, 0b00000000_11111111, input);
        let want = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m512i(got, want);
    }

    // Zero-masked i16 -> i32 widening: a zero mask must give all zeros; with
    // mask 0b00000000_11111111 the selected lanes hold widened words and the
    // other lanes are zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvtepi16_epi32() {
        let input = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let cleared = _mm512_maskz_cvtepi16_epi32(0, input);
        assert_eq_m512i(cleared, _mm512_setzero_si512());
        let got = _mm512_maskz_cvtepi16_epi32(0b00000000_11111111, input);
        let want = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m512i(got, want);
    }

    // 256-bit write-masked i16 -> i32 widening: a zero mask must return
    // `fallback`; an all-ones mask must widen all eight words.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cvtepi16_epi32() {
        let input = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let fallback = _mm256_set1_epi32(-1);
        let untouched = _mm256_mask_cvtepi16_epi32(fallback, 0, input);
        assert_eq_m256i(untouched, fallback);
        let got = _mm256_mask_cvtepi16_epi32(fallback, 0b11111111, input);
        let want = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m256i(got, want);
    }

    // 256-bit zero-masked i16 -> i32 widening: a zero mask must give all
    // zeros; an all-ones mask must widen all eight words.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_cvtepi16_epi32() {
        let input = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let cleared = _mm256_maskz_cvtepi16_epi32(0, input);
        assert_eq_m256i(cleared, _mm256_setzero_si256());
        let got = _mm256_maskz_cvtepi16_epi32(0b11111111, input);
        let want = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m256i(got, want);
    }

    // 128-bit write-masked i16 -> i32 widening: a zero mask must return
    // `fallback`; mask 0b00001111 is expected to yield 4..=7 in the four
    // result lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cvtepi16_epi32() {
        let input = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let fallback = _mm_set1_epi32(-1);
        let untouched = _mm_mask_cvtepi16_epi32(fallback, 0, input);
        assert_eq_m128i(untouched, fallback);
        let got = _mm_mask_cvtepi16_epi32(fallback, 0b00001111, input);
        let want = _mm_set_epi32(4, 5, 6, 7);
        assert_eq_m128i(got, want);
    }

    // 128-bit zero-masked i16 -> i32 widening: a zero mask must give all
    // zeros; mask 0b00001111 is expected to yield 4..=7 in the four result
    // lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_cvtepi16_epi32() {
        let input = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let cleared = _mm_maskz_cvtepi16_epi32(0, input);
        assert_eq_m128i(cleared, _mm_setzero_si128());
        let got = _mm_maskz_cvtepi16_epi32(0b00001111, input);
        let want = _mm_set_epi32(4, 5, 6, 7);
        assert_eq_m128i(got, want);
    }

    // Widening u16 -> i32 must zero-extend every word. The first argument is
    // -1 (word 0xFFFF) so that an accidental sign extension, which would
    // yield -1 instead of 65535 in that lane, makes the test fail; with only
    // non-negative words the two extensions are indistinguishable.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvtepu16_epi32() {
        let input = _mm256_set_epi16(-1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let got = _mm512_cvtepu16_epi32(input);
        let want = _mm512_set_epi32(65535, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m512i(got, want);
    }

    // Write-masked u16 -> i32 widening: a zero mask must return `fallback`;
    // with mask 0b00000000_11111111 the selected lanes hold widened words and
    // the other lanes keep the fallback value -1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvtepu16_epi32() {
        let input = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let fallback = _mm512_set1_epi32(-1);
        let untouched = _mm512_mask_cvtepu16_epi32(fallback, 0, input);
        assert_eq_m512i(untouched, fallback);
        let got = _mm512_mask_cvtepu16_epi32(fallback, 0b00000000_11111111, input);
        let want = _mm512_set_epi32(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m512i(got, want);
    }

    // Zero-masked u16 -> i32 widening: a zero mask must give all zeros; with
    // mask 0b00000000_11111111 the selected lanes hold widened words and the
    // other lanes are zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvtepu16_epi32() {
        let input = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let cleared = _mm512_maskz_cvtepu16_epi32(0, input);
        assert_eq_m512i(cleared, _mm512_setzero_si512());
        let got = _mm512_maskz_cvtepu16_epi32(0b00000000_11111111, input);
        let want = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m512i(got, want);
    }

    // 256-bit write-masked u16 -> i32 widening: a zero mask must return
    // `fallback`; an all-ones mask must widen all eight words.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cvtepu16_epi32() {
        let input = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let fallback = _mm256_set1_epi32(-1);
        let untouched = _mm256_mask_cvtepu16_epi32(fallback, 0, input);
        assert_eq_m256i(untouched, fallback);
        let got = _mm256_mask_cvtepu16_epi32(fallback, 0b11111111, input);
        let want = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m256i(got, want);
    }

    // 256-bit zero-masked u16 -> i32 widening: a zero mask must give all
    // zeros; an all-ones mask must widen all eight words.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_cvtepu16_epi32() {
        let input = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let cleared = _mm256_maskz_cvtepu16_epi32(0, input);
        assert_eq_m256i(cleared, _mm256_setzero_si256());
        let got = _mm256_maskz_cvtepu16_epi32(0b11111111, input);
        let want = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m256i(got, want);
    }

    // 128-bit write-masked u16 -> i32 widening: a zero mask must return
    // `fallback`; mask 0b00001111 is expected to yield 12..=15 in the four
    // result lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cvtepu16_epi32() {
        let input = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let fallback = _mm_set1_epi32(-1);
        let untouched = _mm_mask_cvtepu16_epi32(fallback, 0, input);
        assert_eq_m128i(untouched, fallback);
        let got = _mm_mask_cvtepu16_epi32(fallback, 0b00001111, input);
        let want = _mm_set_epi32(12, 13, 14, 15);
        assert_eq_m128i(got, want);
    }

    // 128-bit zero-masked u16 -> i32 widening: a zero mask must give all
    // zeros; mask 0b00001111 is expected to yield 12..=15 in the four result
    // lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_cvtepu16_epi32() {
        let input = _mm_set_epi16(8, 9, 10, 11, 12, 13, 14, 15);
        let cleared = _mm_maskz_cvtepu16_epi32(0, input);
        assert_eq_m128i(cleared, _mm_setzero_si128());
        let got = _mm_maskz_cvtepu16_epi32(0b00001111, input);
        let want = _mm_set_epi32(12, 13, 14, 15);
        assert_eq_m128i(got, want);
    }

    // i32 -> f32 conversion: integers 0..=15 must come back as the same
    // values in floating point.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvtepi32_ps() {
        let input = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let got = _mm512_cvtepi32_ps(input);
        let want = _mm512_set_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        assert_eq_m512(got, want);
    }

    // Write-masked i32 -> f32 conversion: a zero mask must return `fallback`;
    // with mask 0b00000000_11111111 the selected lanes are converted and the
    // other lanes keep the fallback value -1.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvtepi32_ps() {
        let input = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let fallback = _mm512_set1_ps(-1.);
        let untouched = _mm512_mask_cvtepi32_ps(fallback, 0, input);
        assert_eq_m512(untouched, fallback);
        let got = _mm512_mask_cvtepi32_ps(fallback, 0b00000000_11111111, input);
        let want = _mm512_set_ps(
            -1., -1., -1., -1., -1., -1., -1., -1., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        assert_eq_m512(got, want);
    }

    // Zero-masked i32 -> f32 conversion: a zero mask must give all zeros;
    // with mask 0b00000000_11111111 the selected lanes are converted and the
    // other lanes are 0.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvtepi32_ps() {
        let input = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let cleared = _mm512_maskz_cvtepi32_ps(0, input);
        assert_eq_m512(cleared, _mm512_setzero_ps());
        let got = _mm512_maskz_cvtepi32_ps(0b00000000_11111111, input);
        let want = _mm512_set_ps(
            0., 0., 0., 0., 0., 0., 0., 0., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        assert_eq_m512(got, want);
    }

    // 256-bit write-masked i32 -> f32 conversion: a zero mask must return
    // `fallback`; an all-ones mask must convert all eight lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cvtepi32_ps() {
        let input = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let fallback = _mm256_set1_ps(-1.);
        let untouched = _mm256_mask_cvtepi32_ps(fallback, 0, input);
        assert_eq_m256(untouched, fallback);
        let got = _mm256_mask_cvtepi32_ps(fallback, 0b11111111, input);
        let want = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(got, want);
    }

    // 256-bit zero-masked i32 -> f32 conversion: a zero mask must give all
    // zeros; an all-ones mask must convert all eight lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_cvtepi32_ps() {
        let input = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let cleared = _mm256_maskz_cvtepi32_ps(0, input);
        assert_eq_m256(cleared, _mm256_setzero_ps());
        let got = _mm256_maskz_cvtepi32_ps(0b11111111, input);
        let want = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        assert_eq_m256(got, want);
    }

    // 128-bit write-masked i32 -> f32 conversion: a zero mask must return
    // `fallback`; mask 0b00001111 must convert all four lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cvtepi32_ps() {
        let input = _mm_set_epi32(1, 2, 3, 4);
        let fallback = _mm_set1_ps(-1.);
        let untouched = _mm_mask_cvtepi32_ps(fallback, 0, input);
        assert_eq_m128(untouched, fallback);
        let got = _mm_mask_cvtepi32_ps(fallback, 0b00001111, input);
        let want = _mm_set_ps(1., 2., 3., 4.);
        assert_eq_m128(got, want);
    }

    // 128-bit zero-masked i32 -> f32 conversion: a zero mask must give all
    // zeros; mask 0b00001111 must convert all four lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_cvtepi32_ps() {
        let input = _mm_set_epi32(1, 2, 3, 4);
        let cleared = _mm_maskz_cvtepi32_ps(0, input);
        assert_eq_m128(cleared, _mm_setzero_ps());
        let got = _mm_maskz_cvtepi32_ps(0b00001111, input);
        let want = _mm_set_ps(1., 2., 3., 4.);
        assert_eq_m128(got, want);
    }

    // u32 -> f32 conversion: integers 0..=15 must come back as the same
    // values in floating point.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvtepu32_ps() {
        let input = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let got = _mm512_cvtepu32_ps(input);
        let want = _mm512_set_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        assert_eq_m512(got, want);
    }

    // Write-masked u32 -> f32 conversion: a zero mask must return `fallback`;
    // with mask 0b00000000_11111111 the selected lanes are converted and the
    // other lanes keep the fallback value -1.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvtepu32_ps() {
        let input = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let fallback = _mm512_set1_ps(-1.);
        let untouched = _mm512_mask_cvtepu32_ps(fallback, 0, input);
        assert_eq_m512(untouched, fallback);
        let got = _mm512_mask_cvtepu32_ps(fallback, 0b00000000_11111111, input);
        let want = _mm512_set_ps(
            -1., -1., -1., -1., -1., -1., -1., -1., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        assert_eq_m512(got, want);
    }

    // Zero-masked u32 -> f32 conversion: a zero mask must give all zeros;
    // with mask 0b00000000_11111111 the selected lanes are converted and the
    // other lanes are 0.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvtepu32_ps() {
        let input = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let cleared = _mm512_maskz_cvtepu32_ps(0, input);
        assert_eq_m512(cleared, _mm512_setzero_ps());
        let got = _mm512_maskz_cvtepu32_ps(0b00000000_11111111, input);
        let want = _mm512_set_ps(
            0., 0., 0., 0., 0., 0., 0., 0., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        assert_eq_m512(got, want);
    }

    // Narrowing i32 -> i16: sixteen small values 0..=15 must come back
    // unchanged in a 256-bit vector of sixteen words.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvtepi32_epi16() {
        let input = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let got = _mm512_cvtepi32_epi16(input);
        let want = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m256i(got, want);
    }

    // Write-masked i32 -> i16 narrowing: a zero mask must return `fallback`;
    // with mask 0b00000000_11111111 the selected words are narrowed values and
    // the other words keep the fallback value -1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvtepi32_epi16() {
        let input = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let fallback = _mm256_set1_epi16(-1);
        let untouched = _mm512_mask_cvtepi32_epi16(fallback, 0, input);
        assert_eq_m256i(untouched, fallback);
        let got = _mm512_mask_cvtepi32_epi16(fallback, 0b00000000_11111111, input);
        let want = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m256i(got, want);
    }

    // Zero-masked i32 -> i16 narrowing: a zero mask must give all zeros; with
    // mask 0b00000000_11111111 the selected words are narrowed values and the
    // other words are zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvtepi32_epi16() {
        let input = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let cleared = _mm512_maskz_cvtepi32_epi16(0, input);
        assert_eq_m256i(cleared, _mm256_setzero_si256());
        let got = _mm512_maskz_cvtepi32_epi16(0b00000000_11111111, input);
        let want = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m256i(got, want);
    }

    // 256-bit i32 -> i16 narrowing: eight values 0..=7 must come back
    // unchanged as eight words in a 128-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_cvtepi32_epi16() {
        let input = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let got = _mm256_cvtepi32_epi16(input);
        let want = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m128i(got, want);
    }

    // 256-bit write-masked i32 -> i16 narrowing: a zero mask must return
    // `fallback`; an all-ones mask must narrow all eight lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cvtepi32_epi16() {
        let input = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let fallback = _mm_set1_epi16(-1);
        let untouched = _mm256_mask_cvtepi32_epi16(fallback, 0, input);
        assert_eq_m128i(untouched, fallback);
        let got = _mm256_mask_cvtepi32_epi16(fallback, 0b11111111, input);
        let want = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m128i(got, want);
    }

    // 256-bit zero-masked i32 -> i16 narrowing: a zero mask must give all
    // zeros; an all-ones mask must narrow all eight lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_cvtepi32_epi16() {
        let input = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let cleared = _mm256_maskz_cvtepi32_epi16(0, input);
        assert_eq_m128i(cleared, _mm_setzero_si128());
        let got = _mm256_maskz_cvtepi32_epi16(0b11111111, input);
        let want = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m128i(got, want);
    }

    // 128-bit i32 -> i16 narrowing: four values land in four words of the
    // result and the remaining four words are expected to be zero.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_cvtepi32_epi16() {
        let input = _mm_set_epi32(4, 5, 6, 7);
        let got = _mm_cvtepi32_epi16(input);
        let want = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7);
        assert_eq_m128i(got, want);
    }

    // 128-bit write-masked i32 -> i16 narrowing: a zero mask must return the
    // (zero) `fallback`; mask 0b00001111 must narrow all four lanes, with the
    // remaining four words of the result expected to be zero.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cvtepi32_epi16() {
        let input = _mm_set_epi32(4, 5, 6, 7);
        let fallback = _mm_set1_epi16(0);
        let untouched = _mm_mask_cvtepi32_epi16(fallback, 0, input);
        assert_eq_m128i(untouched, fallback);
        let got = _mm_mask_cvtepi32_epi16(fallback, 0b00001111, input);
        let want = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7);
        assert_eq_m128i(got, want);
    }

    // 128-bit zero-masked i32 -> i16 narrowing: a zero mask must give all
    // zeros; mask 0b00001111 must narrow all four lanes, with the remaining
    // four words of the result expected to be zero.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_cvtepi32_epi16() {
        let input = _mm_set_epi32(4, 5, 6, 7);
        let cleared = _mm_maskz_cvtepi32_epi16(0, input);
        assert_eq_m128i(cleared, _mm_setzero_si128());
        let got = _mm_maskz_cvtepi32_epi16(0b00001111, input);
        let want = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7);
        assert_eq_m128i(got, want);
    }

    // Narrowing i32 -> i8: sixteen small values 0..=15 must come back
    // unchanged as sixteen bytes in a 128-bit vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvtepi32_epi8() {
        let input = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let got = _mm512_cvtepi32_epi8(input);
        let want = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m128i(got, want);
    }

    // Write-masked i32 -> i8 narrowing: a zero mask must return `fallback`;
    // with mask 0b00000000_11111111 the selected bytes are narrowed values and
    // the other bytes keep the fallback value -1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvtepi32_epi8() {
        let input = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let fallback = _mm_set1_epi8(-1);
        let untouched = _mm512_mask_cvtepi32_epi8(fallback, 0, input);
        assert_eq_m128i(untouched, fallback);
        let got = _mm512_mask_cvtepi32_epi8(fallback, 0b00000000_11111111, input);
        let want = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m128i(got, want);
    }

    // Zero-masked i32 -> i8 narrowing: a zero mask must give all zeros; with
    // mask 0b00000000_11111111 the selected bytes are narrowed values and the
    // other bytes are zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvtepi32_epi8() {
        let input = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let cleared = _mm512_maskz_cvtepi32_epi8(0, input);
        assert_eq_m128i(cleared, _mm_setzero_si128());
        let got = _mm512_maskz_cvtepi32_epi8(0b00000000_11111111, input);
        let want = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m128i(got, want);
    }

    // 256-bit i32 -> i8 narrowing: eight values land in eight bytes of the
    // result and the remaining eight bytes are expected to be zero.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_cvtepi32_epi8() {
        let input = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let got = _mm256_cvtepi32_epi8(input);
        let want = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m128i(got, want);
    }

    // 256-bit write-masked i32 -> i8 narrowing: a zero mask must return the
    // (zero) `fallback`; an all-ones mask must narrow all eight lanes, with
    // the remaining eight bytes of the result expected to be zero.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cvtepi32_epi8() {
        let input = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let fallback = _mm_set1_epi8(0);
        let untouched = _mm256_mask_cvtepi32_epi8(fallback, 0, input);
        assert_eq_m128i(untouched, fallback);
        let got = _mm256_mask_cvtepi32_epi8(fallback, 0b11111111, input);
        let want = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m128i(got, want);
    }

    // 256-bit zero-masked i32 -> i8 narrowing: a zero mask must give all
    // zeros; an all-ones mask must narrow all eight lanes, with the remaining
    // eight bytes of the result expected to be zero.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_cvtepi32_epi8() {
        let input = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let cleared = _mm256_maskz_cvtepi32_epi8(0, input);
        assert_eq_m128i(cleared, _mm_setzero_si128());
        let got = _mm256_maskz_cvtepi32_epi8(0b11111111, input);
        let want = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7);
        assert_eq_m128i(got, want);
    }

    // 128-bit i32 -> i8 narrowing: four values land in four bytes of the
    // result and the remaining twelve bytes are expected to be zero.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_cvtepi32_epi8() {
        let input = _mm_set_epi32(4, 5, 6, 7);
        let got = _mm_cvtepi32_epi8(input);
        let want = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7);
        assert_eq_m128i(got, want);
    }

    // 128-bit write-masked i32 -> i8 narrowing: a zero mask must return the
    // (zero) `fallback`; mask 0b00001111 must narrow all four lanes, with the
    // remaining twelve bytes of the result expected to be zero.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cvtepi32_epi8() {
        let input = _mm_set_epi32(4, 5, 6, 7);
        let fallback = _mm_set1_epi8(0);
        let untouched = _mm_mask_cvtepi32_epi8(fallback, 0, input);
        assert_eq_m128i(untouched, fallback);
        let got = _mm_mask_cvtepi32_epi8(fallback, 0b00001111, input);
        let want = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7);
        assert_eq_m128i(got, want);
    }

    // Zero-masked narrowing of four lanes: an empty mask must give zero, a
    // four-bit mask must give the converted lanes in the low four bytes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_cvtepi32_epi8() {
        let input = _mm_set_epi32(4, 5, 6, 7);

        // No mask bits set.
        let cleared = _mm_maskz_cvtepi32_epi8(0, input);
        assert_eq_m128i(cleared, _mm_setzero_si128());

        // All four lanes selected.
        let want = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 5, 6, 7);
        let got = _mm_maskz_cvtepi32_epi8(0b00001111, input);
        assert_eq_m128i(got, want);
    }

    // Signed-saturating i32 -> i16 narrowing of sixteen lanes: small values
    // pass through, while the `i32::MIN` / `i32::MAX` lanes are expected to
    // clamp to `i16::MIN` / `i16::MAX`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvtsepi32_epi16() {
        #[rustfmt::skip]
        let input = _mm512_set_epi32(
            0, 1, 2, 3,
            4, 5, 6, 7,
            8, 9, 10, 11,
            12, 13, i32::MIN, i32::MAX,
        );
        #[rustfmt::skip]
        let want = _mm256_set_epi16(
            0, 1, 2, 3,
            4, 5, 6, 7,
            8, 9, 10, 11,
            12, 13, i16::MIN, i16::MAX,
        );
        let got = _mm512_cvtsepi32_epi16(input);
        assert_eq_m256i(got, want);
    }

    // Merge-masked saturating narrowing: an empty mask must return the
    // pass-through vector; with only the low eight mask bits set, the low
    // eight lanes are converted and the high eight keep the pass-through -1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvtsepi32_epi16() {
        #[rustfmt::skip]
        let input = _mm512_set_epi32(
            0, 1, 2, 3,
            4, 5, 6, 7,
            8, 9, 10, 11,
            12, 13, i32::MIN, i32::MAX,
        );
        let passthru = _mm256_set1_epi16(-1);

        // No mask bits set.
        let untouched = _mm512_mask_cvtsepi32_epi16(passthru, 0, input);
        assert_eq_m256i(untouched, passthru);

        // Low half selected.
        #[rustfmt::skip]
        let want = _mm256_set_epi16(
            -1, -1, -1, -1,
            -1, -1, -1, -1,
            8, 9, 10, 11,
            12, 13, i16::MIN, i16::MAX,
        );
        let got = _mm512_mask_cvtsepi32_epi16(passthru, 0b00000000_11111111, input);
        assert_eq_m256i(got, want);
    }

    // Zero-masked saturating narrowing: an empty mask must give zero; with
    // only the low eight mask bits set, the low eight lanes are converted and
    // the high eight are zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvtsepi32_epi16() {
        #[rustfmt::skip]
        let input = _mm512_set_epi32(
            0, 1, 2, 3,
            4, 5, 6, 7,
            8, 9, 10, 11,
            12, 13, i32::MIN, i32::MAX,
        );

        // No mask bits set.
        let cleared = _mm512_maskz_cvtsepi32_epi16(0, input);
        assert_eq_m256i(cleared, _mm256_setzero_si256());

        // Low half selected.
        #[rustfmt::skip]
        let want = _mm256_set_epi16(
            0, 0, 0, 0,
            0, 0, 0, 0,
            8, 9, 10, 11,
            12, 13, i16::MIN, i16::MAX,
        );
        let got = _mm512_maskz_cvtsepi32_epi16(0b00000000_11111111, input);
        assert_eq_m256i(got, want);
    }

    // Saturating i32 -> i16 narrowing of eight in-range lanes (0..=7); every
    // value is expected to come through unchanged.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_cvtsepi32_epi16() {
        let input = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let want = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let got = _mm256_cvtsepi32_epi16(input);
        assert_eq_m128i(got, want);
    }

    // Merge-masked saturating narrowing of eight lanes: an empty mask must
    // return the pass-through vector, a full mask the converted lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cvtsepi32_epi16() {
        let input = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let passthru = _mm_set1_epi16(-1);

        // No mask bits set.
        let untouched = _mm256_mask_cvtsepi32_epi16(passthru, 0, input);
        assert_eq_m128i(untouched, passthru);

        // Every lane selected.
        let want = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let got = _mm256_mask_cvtsepi32_epi16(passthru, 0b11111111, input);
        assert_eq_m128i(got, want);
    }

    // Zero-masked saturating narrowing of eight lanes: an empty mask must
    // give zero, a full mask the converted lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_cvtsepi32_epi16() {
        let input = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);

        // No mask bits set.
        let cleared = _mm256_maskz_cvtsepi32_epi16(0, input);
        assert_eq_m128i(cleared, _mm_setzero_si128());

        // Every lane selected.
        let want = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        let got = _mm256_maskz_cvtsepi32_epi16(0b11111111, input);
        assert_eq_m128i(got, want);
    }

    // Saturating i32 -> i16 narrowing of four in-range lanes (4..=7): the
    // results are expected in the low four i16 lanes, the high four zero.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_cvtsepi32_epi16() {
        let input = _mm_set_epi32(4, 5, 6, 7);
        let want = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7);
        let got = _mm_cvtsepi32_epi16(input);
        assert_eq_m128i(got, want);
    }

    // Merge-masked saturating i32 -> i16 narrowing of a 128-bit vector.
    //
    // The source has only four i32 lanes, so the "select everything" mask is
    // written as `0b00001111`, matching the other 128-bit tests in this
    // module, instead of naming four mask bits that have no matching lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cvtsepi32_epi16() {
        let a = _mm_set_epi32(4, 5, 6, 7);
        let src = _mm_set1_epi16(0);

        // Empty mask: the pass-through vector must come back unchanged.
        let r = _mm_mask_cvtsepi32_epi16(src, 0, a);
        assert_eq_m128i(r, src);

        // All four lanes selected: converted values land in the low four
        // i16 lanes, the high four stay zero.
        let r = _mm_mask_cvtsepi32_epi16(src, 0b00001111, a);
        let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7);
        assert_eq_m128i(r, e);
    }

    // Zero-masked saturating i32 -> i16 narrowing of a 128-bit vector.
    //
    // The source has only four i32 lanes, so the "select everything" mask is
    // written as `0b00001111`, matching the other 128-bit tests in this
    // module, instead of naming four mask bits that have no matching lane.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_cvtsepi32_epi16() {
        let a = _mm_set_epi32(4, 5, 6, 7);

        // Empty mask: the whole result must be zero.
        let r = _mm_maskz_cvtsepi32_epi16(0, a);
        assert_eq_m128i(r, _mm_setzero_si128());

        // All four lanes selected: converted values land in the low four
        // i16 lanes, the high four stay zero.
        let r = _mm_maskz_cvtsepi32_epi16(0b00001111, a);
        let e = _mm_set_epi16(0, 0, 0, 0, 4, 5, 6, 7);
        assert_eq_m128i(r, e);
    }

    // Signed-saturating i32 -> i8 narrowing of sixteen lanes: small values
    // pass through, while the `i32::MIN` / `i32::MAX` lanes are expected to
    // clamp to `i8::MIN` / `i8::MAX`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvtsepi32_epi8() {
        #[rustfmt::skip]
        let input = _mm512_set_epi32(
            0, 1, 2, 3,
            4, 5, 6, 7,
            8, 9, 10, 11,
            12, 13, i32::MIN, i32::MAX,
        );
        #[rustfmt::skip]
        let want = _mm_set_epi8(
            0, 1, 2, 3,
            4, 5, 6, 7,
            8, 9, 10, 11,
            12, 13, i8::MIN, i8::MAX,
        );
        let got = _mm512_cvtsepi32_epi8(input);
        assert_eq_m128i(got, want);
    }

    // Merge-masked saturating i32 -> i8 narrowing: an empty mask must return
    // the pass-through vector; with only the low eight mask bits set, the low
    // eight bytes are converted and the high eight keep the pass-through -1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvtsepi32_epi8() {
        #[rustfmt::skip]
        let input = _mm512_set_epi32(
            0, 1, 2, 3,
            4, 5, 6, 7,
            8, 9, 10, 11,
            12, 13, i32::MIN, i32::MAX,
        );
        let passthru = _mm_set1_epi8(-1);

        // No mask bits set.
        let untouched = _mm512_mask_cvtsepi32_epi8(passthru, 0, input);
        assert_eq_m128i(untouched, passthru);

        // Low half selected.
        #[rustfmt::skip]
        let want = _mm_set_epi8(
            -1, -1, -1, -1,
            -1, -1, -1, -1,
            8, 9, 10, 11,
            12, 13, i8::MIN, i8::MAX,
        );
        let got = _mm512_mask_cvtsepi32_epi8(passthru, 0b00000000_11111111, input);
        assert_eq_m128i(got, want);
    }

    // Zero-masked saturating i32 -> i8 narrowing: an empty mask must give
    // zero; with only the low eight mask bits set, the low eight bytes are
    // converted and the high eight are zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvtsepi32_epi8() {
        #[rustfmt::skip]
        let input = _mm512_set_epi32(
            0, 1, 2, 3,
            4, 5, 6, 7,
            8, 9, 10, 11,
            12, 13, i32::MIN, i32::MAX,
        );

        // No mask bits set.
        let cleared = _mm512_maskz_cvtsepi32_epi8(0, input);
        assert_eq_m128i(cleared, _mm_setzero_si128());

        // Low half selected.
        #[rustfmt::skip]
        let want = _mm_set_epi8(
            0, 0, 0, 0,
            0, 0, 0, 0,
            8, 9, 10, 11,
            12, 13, i8::MIN, i8::MAX,
        );
        let got = _mm512_maskz_cvtsepi32_epi8(0b00000000_11111111, input);
        assert_eq_m128i(got, want);
    }

    // Saturating i32 -> i8 narrowing of eight in-range lanes (9..=16): the
    // values are expected unchanged in the low eight bytes, high bytes zero.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_cvtsepi32_epi8() {
        let input = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16);
        #[rustfmt::skip]
        let want = _mm_set_epi8(
            0, 0, 0, 0,
            0, 0, 0, 0,
            9, 10, 11, 12,
            13, 14, 15, 16,
        );
        let got = _mm256_cvtsepi32_epi8(input);
        assert_eq_m128i(got, want);
    }

    // Merge-masked saturating i32 -> i8 narrowing of eight lanes: an empty
    // mask must return the pass-through vector, a full mask the converted
    // lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cvtsepi32_epi8() {
        let input = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16);
        let passthru = _mm_set1_epi8(0);

        // No mask bits set.
        let untouched = _mm256_mask_cvtsepi32_epi8(passthru, 0, input);
        assert_eq_m128i(untouched, passthru);

        // Every lane selected.
        #[rustfmt::skip]
        let want = _mm_set_epi8(
            0, 0, 0, 0,
            0, 0, 0, 0,
            9, 10, 11, 12,
            13, 14, 15, 16,
        );
        let got = _mm256_mask_cvtsepi32_epi8(passthru, 0b11111111, input);
        assert_eq_m128i(got, want);
    }

    // Zero-masked saturating i32 -> i8 narrowing of eight lanes: an empty
    // mask must give zero, a full mask the converted lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_cvtsepi32_epi8() {
        let input = _mm256_set_epi32(9, 10, 11, 12, 13, 14, 15, 16);

        // No mask bits set.
        let cleared = _mm256_maskz_cvtsepi32_epi8(0, input);
        assert_eq_m128i(cleared, _mm_setzero_si128());

        // Every lane selected.
        #[rustfmt::skip]
        let want = _mm_set_epi8(
            0, 0, 0, 0,
            0, 0, 0, 0,
            9, 10, 11, 12,
            13, 14, 15, 16,
        );
        let got = _mm256_maskz_cvtsepi32_epi8(0b11111111, input);
        assert_eq_m128i(got, want);
    }

    // Saturating i32 -> i8 narrowing of four in-range lanes (13..=16): the
    // values are expected unchanged in the low four bytes, the rest zero.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_cvtsepi32_epi8() {
        let input = _mm_set_epi32(13, 14, 15, 16);
        #[rustfmt::skip]
        let want = _mm_set_epi8(
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            13, 14, 15, 16,
        );
        let got = _mm_cvtsepi32_epi8(input);
        assert_eq_m128i(got, want);
    }

    // Merge-masked saturating i32 -> i8 narrowing of four lanes: an empty
    // mask must return the pass-through vector, a four-bit mask the converted
    // lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cvtsepi32_epi8() {
        let input = _mm_set_epi32(13, 14, 15, 16);
        let passthru = _mm_set1_epi8(0);

        // No mask bits set.
        let untouched = _mm_mask_cvtsepi32_epi8(passthru, 0, input);
        assert_eq_m128i(untouched, passthru);

        // All four lanes selected.
        #[rustfmt::skip]
        let want = _mm_set_epi8(
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            13, 14, 15, 16,
        );
        let got = _mm_mask_cvtsepi32_epi8(passthru, 0b00001111, input);
        assert_eq_m128i(got, want);
    }

    // Zero-masked saturating i32 -> i8 narrowing of four lanes: an empty mask
    // must give zero, a four-bit mask the converted lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_cvtsepi32_epi8() {
        let input = _mm_set_epi32(13, 14, 15, 16);

        // No mask bits set.
        let cleared = _mm_maskz_cvtsepi32_epi8(0, input);
        assert_eq_m128i(cleared, _mm_setzero_si128());

        // All four lanes selected.
        #[rustfmt::skip]
        let want = _mm_set_epi8(
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            13, 14, 15, 16,
        );
        let got = _mm_maskz_cvtsepi32_epi8(0b00001111, input);
        assert_eq_m128i(got, want);
    }

    // Unsigned-saturating 32 -> 16 bit narrowing of sixteen lanes: small
    // values pass through, and the two `i32::MIN` bit patterns are expected
    // to come out as all-ones 16-bit lanes (written here as -1).
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvtusepi32_epi16() {
        #[rustfmt::skip]
        let input = _mm512_set_epi32(
            0, 1, 2, 3,
            4, 5, 6, 7,
            8, 9, 10, 11,
            12, 13, i32::MIN, i32::MIN,
        );
        let want = _mm256_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1);
        let got = _mm512_cvtusepi32_epi16(input);
        assert_eq_m256i(got, want);
    }

    // Merge-masked unsigned-saturating narrowing: an empty mask must return
    // the pass-through vector; with only the low eight mask bits set, the low
    // eight lanes are converted and the high eight keep the pass-through -1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvtusepi32_epi16() {
        #[rustfmt::skip]
        let input = _mm512_set_epi32(
            0, 1, 2, 3,
            4, 5, 6, 7,
            8, 9, 10, 11,
            12, 13, i32::MIN, i32::MIN,
        );
        let passthru = _mm256_set1_epi16(-1);

        // No mask bits set.
        let untouched = _mm512_mask_cvtusepi32_epi16(passthru, 0, input);
        assert_eq_m256i(untouched, passthru);

        // Low half selected.
        let want = _mm256_set_epi16(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, -1, -1);
        let got = _mm512_mask_cvtusepi32_epi16(passthru, 0b00000000_11111111, input);
        assert_eq_m256i(got, want);
    }

    // Zero-masked unsigned-saturating narrowing: an empty mask must give
    // zero; with only the low eight mask bits set, the low eight lanes are
    // converted and the high eight are zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvtusepi32_epi16() {
        #[rustfmt::skip]
        let input = _mm512_set_epi32(
            0, 1, 2, 3,
            4, 5, 6, 7,
            8, 9, 10, 11,
            12, 13, i32::MIN, i32::MIN,
        );

        // No mask bits set.
        let cleared = _mm512_maskz_cvtusepi32_epi16(0, input);
        assert_eq_m256i(cleared, _mm256_setzero_si256());

        // Low half selected.
        let want = _mm256_set_epi16(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, -1, -1);
        let got = _mm512_maskz_cvtusepi32_epi16(0b00000000_11111111, input);
        assert_eq_m256i(got, want);
    }

    // Unsigned-saturating 32 -> 16 bit narrowing of eight in-range lanes
    // (1..=8); every value is expected to come through unchanged.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_cvtusepi32_epi16() {
        let input = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let want = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let got = _mm256_cvtusepi32_epi16(input);
        assert_eq_m128i(got, want);
    }

    // Merge-masked unsigned-saturating narrowing of eight lanes: an empty
    // mask must return the pass-through vector, a full mask the converted
    // lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cvtusepi32_epi16() {
        let input = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let passthru = _mm_set1_epi16(0);

        // No mask bits set.
        let untouched = _mm256_mask_cvtusepi32_epi16(passthru, 0, input);
        assert_eq_m128i(untouched, passthru);

        // Every lane selected.
        let want = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let got = _mm256_mask_cvtusepi32_epi16(passthru, 0b11111111, input);
        assert_eq_m128i(got, want);
    }

    // Zero-masked unsigned-saturating narrowing of eight lanes: an empty mask
    // must give zero, a full mask the converted lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_cvtusepi32_epi16() {
        let input = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);

        // No mask bits set.
        let cleared = _mm256_maskz_cvtusepi32_epi16(0, input);
        assert_eq_m128i(cleared, _mm_setzero_si128());

        // Every lane selected.
        let want = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let got = _mm256_maskz_cvtusepi32_epi16(0b11111111, input);
        assert_eq_m128i(got, want);
    }

    // Unsigned-saturating 32 -> 16 bit narrowing of four in-range lanes
    // (5..=8): results are expected in the low four lanes, the rest zero.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_cvtusepi32_epi16() {
        let input = _mm_set_epi32(5, 6, 7, 8);
        let want = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8);
        let got = _mm_cvtusepi32_epi16(input);
        assert_eq_m128i(got, want);
    }

    // Merge-masked unsigned-saturating narrowing of four lanes: an empty mask
    // must return the pass-through vector, a four-bit mask the converted
    // lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cvtusepi32_epi16() {
        let input = _mm_set_epi32(5, 6, 7, 8);
        let passthru = _mm_set1_epi16(0);

        // No mask bits set.
        let untouched = _mm_mask_cvtusepi32_epi16(passthru, 0, input);
        assert_eq_m128i(untouched, passthru);

        // All four lanes selected.
        let want = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8);
        let got = _mm_mask_cvtusepi32_epi16(passthru, 0b00001111, input);
        assert_eq_m128i(got, want);
    }

    // Zero-masked unsigned-saturating narrowing of four lanes: an empty mask
    // must give zero, a four-bit mask the converted lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_cvtusepi32_epi16() {
        let input = _mm_set_epi32(5, 6, 7, 8);

        // No mask bits set.
        let cleared = _mm_maskz_cvtusepi32_epi16(0, input);
        assert_eq_m128i(cleared, _mm_setzero_si128());

        // All four lanes selected.
        let want = _mm_set_epi16(0, 0, 0, 0, 5, 6, 7, 8);
        let got = _mm_maskz_cvtusepi32_epi16(0b00001111, input);
        assert_eq_m128i(got, want);
    }

    // Unsigned-saturating 32 -> 8 bit narrowing of sixteen lanes: small
    // values pass through, and the two `i32::MIN` bit patterns are expected
    // to come out as all-ones bytes (written here as -1).
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvtusepi32_epi8() {
        #[rustfmt::skip]
        let input = _mm512_set_epi32(
            0, 1, 2, 3,
            4, 5, 6, 7,
            8, 9, 10, 11,
            12, 13, i32::MIN, i32::MIN,
        );
        let want = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1, -1);
        let got = _mm512_cvtusepi32_epi8(input);
        assert_eq_m128i(got, want);
    }

    // Merge-masked unsigned-saturating 32 -> 8 bit narrowing: an empty mask
    // must return the pass-through vector; with only the low eight mask bits
    // set, the low eight bytes are converted and the rest keep -1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvtusepi32_epi8() {
        #[rustfmt::skip]
        let input = _mm512_set_epi32(
            0, 1, 2, 3,
            4, 5, 6, 7,
            8, 9, 10, 11,
            12, 13, i32::MIN, i32::MIN,
        );
        let passthru = _mm_set1_epi8(-1);

        // No mask bits set.
        let untouched = _mm512_mask_cvtusepi32_epi8(passthru, 0, input);
        assert_eq_m128i(untouched, passthru);

        // Low half selected.
        let want = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, -1, -1);
        let got = _mm512_mask_cvtusepi32_epi8(passthru, 0b00000000_11111111, input);
        assert_eq_m128i(got, want);
    }

    // Zero-masked unsigned-saturating 32 -> 8 bit narrowing: an empty mask
    // must give zero; with only the low eight mask bits set, the low eight
    // bytes are converted and the rest are zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvtusepi32_epi8() {
        #[rustfmt::skip]
        let input = _mm512_set_epi32(
            0, 1, 2, 3,
            4, 5, 6, 7,
            8, 9, 10, 11,
            12, 13, i32::MIN, i32::MIN,
        );

        // No mask bits set.
        let cleared = _mm512_maskz_cvtusepi32_epi8(0, input);
        assert_eq_m128i(cleared, _mm_setzero_si128());

        // Low half selected.
        let want = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 8, 9, 10, 11, 12, 13, -1, -1);
        let got = _mm512_maskz_cvtusepi32_epi8(0b00000000_11111111, input);
        assert_eq_m128i(got, want);
    }

    // Unsigned-saturating 32 -> 8 bit narrowing of eight lanes: 1..=7 pass
    // through and the `i32::MAX` lane is expected to clamp to `u8::MAX`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_cvtusepi32_epi8() {
        let input = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX);
        let want = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8);
        let got = _mm256_cvtusepi32_epi8(input);
        assert_eq_m128i(got, want);
    }

    // Merge-masked unsigned-saturating 32 -> 8 bit narrowing of eight lanes:
    // an empty mask must return the pass-through vector, a full mask the
    // converted lanes (with `i32::MAX` clamped to `u8::MAX`).
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cvtusepi32_epi8() {
        let input = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX);
        let passthru = _mm_set1_epi8(0);

        // No mask bits set.
        let untouched = _mm256_mask_cvtusepi32_epi8(passthru, 0, input);
        assert_eq_m128i(untouched, passthru);

        // Every lane selected.
        let want = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8);
        let got = _mm256_mask_cvtusepi32_epi8(passthru, 0b11111111, input);
        assert_eq_m128i(got, want);
    }

    // Zero-masked unsigned-saturating 32 -> 8 bit narrowing of eight lanes:
    // an empty mask must give zero, a full mask the converted lanes (with
    // `i32::MAX` clamped to `u8::MAX`).
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_cvtusepi32_epi8() {
        let input = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, i32::MAX);

        // No mask bits set.
        let cleared = _mm256_maskz_cvtusepi32_epi8(0, input);
        assert_eq_m128i(cleared, _mm_setzero_si128());

        // Every lane selected.
        let want = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, u8::MAX as i8);
        let got = _mm256_maskz_cvtusepi32_epi8(0b11111111, input);
        assert_eq_m128i(got, want);
    }

    // Unsigned-saturating 32 -> 8 bit narrowing of four lanes: 5..=7 pass
    // through and the `i32::MAX` lane is expected to clamp to `u8::MAX`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_cvtusepi32_epi8() {
        let input = _mm_set_epi32(5, 6, 7, i32::MAX);
        let want = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8);
        let got = _mm_cvtusepi32_epi8(input);
        assert_eq_m128i(got, want);
    }

    // Merge-masked unsigned-saturating 32 -> 8 bit narrowing of four lanes:
    // an empty mask must return the pass-through vector, a four-bit mask the
    // converted lanes (with `i32::MAX` clamped to `u8::MAX`).
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cvtusepi32_epi8() {
        let input = _mm_set_epi32(5, 6, 7, i32::MAX);
        let passthru = _mm_set1_epi8(0);

        // No mask bits set.
        let untouched = _mm_mask_cvtusepi32_epi8(passthru, 0, input);
        assert_eq_m128i(untouched, passthru);

        // All four lanes selected.
        let want = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8);
        let got = _mm_mask_cvtusepi32_epi8(passthru, 0b00001111, input);
        assert_eq_m128i(got, want);
    }

    // Zero-masked unsigned-saturating 32 -> 8 bit narrowing of four lanes: an
    // empty mask must give zero, a four-bit mask the converted lanes (with
    // `i32::MAX` clamped to `u8::MAX`).
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_cvtusepi32_epi8() {
        let input = _mm_set_epi32(5, 6, 7, i32::MAX);

        // No mask bits set.
        let cleared = _mm_maskz_cvtusepi32_epi8(0, input);
        assert_eq_m128i(cleared, _mm_setzero_si128());

        // All four lanes selected.
        let want = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 6, 7, u8::MAX as i8);
        let got = _mm_maskz_cvtusepi32_epi8(0b00001111, input);
        assert_eq_m128i(got, want);
    }

    // f32 -> i32 conversion under two explicit rounding modes. The `.5`
    // inputs make the modes distinguishable: e.g. 9.5 is expected as 10 with
    // round-to-nearest and as 9 with round-toward-negative-infinity.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvt_roundps_epi32() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        const TOWARD_NEG_INF: i32 = _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;

        let input = _mm512_setr_ps(
            0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
        );

        let want = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16);
        let got = _mm512_cvt_roundps_epi32::<NEAREST>(input);
        assert_eq_m512i(got, want);

        let want = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 9, 10, 11, 12, 13, 14, 15);
        let got = _mm512_cvt_roundps_epi32::<TOWARD_NEG_INF>(input);
        assert_eq_m512i(got, want);
    }

    // Merge-masked f32 -> i32 conversion with round-to-nearest: an empty mask
    // must return the pass-through vector; with the low eight mask bits set,
    // only the first eight lanes are converted.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvt_roundps_epi32() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;

        let input = _mm512_setr_ps(
            0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
        );
        let passthru = _mm512_set1_epi32(0);

        // No mask bits set.
        let untouched = _mm512_mask_cvt_roundps_epi32::<NEAREST>(passthru, 0, input);
        assert_eq_m512i(untouched, passthru);

        // Low half selected.
        let want = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0);
        let got = _mm512_mask_cvt_roundps_epi32::<NEAREST>(passthru, 0b00000000_11111111, input);
        assert_eq_m512i(got, want);
    }

    // Zero-masked f32 -> i32 conversion with round-to-nearest: an empty mask
    // must give zero; with the low eight mask bits set, only the first eight
    // lanes are converted and the rest are zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvt_roundps_epi32() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;

        let input = _mm512_setr_ps(
            0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
        );

        // No mask bits set.
        let cleared = _mm512_maskz_cvt_roundps_epi32::<NEAREST>(0, input);
        assert_eq_m512i(cleared, _mm512_setzero_si512());

        // Low half selected.
        let want = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 0, 0, 0, 0, 0, 0, 0, 0);
        let got = _mm512_maskz_cvt_roundps_epi32::<NEAREST>(0b00000000_11111111, input);
        assert_eq_m512i(got, want);
    }

    // f32 -> u32 conversion under two explicit rounding modes. Negative
    // inputs are expected to produce all-ones lanes (written as -1); 9.5 is
    // expected as 10 with round-to-nearest and 9 with round-toward-negative-
    // infinity.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvt_roundps_epu32() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        const TOWARD_NEG_INF: i32 = _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC;

        let input = _mm512_setr_ps(
            0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
        );

        let want = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 10, 10, 12, 12, 14, 14, 16);
        let got = _mm512_cvt_roundps_epu32::<NEAREST>(input);
        assert_eq_m512i(got, want);

        let want = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15);
        let got = _mm512_cvt_roundps_epu32::<TOWARD_NEG_INF>(input);
        assert_eq_m512i(got, want);
    }

    // Merge-masked f32 -> u32 conversion with round-to-nearest: an empty mask
    // must return the pass-through vector; with the low eight mask bits set,
    // only the first eight lanes are converted.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvt_roundps_epu32() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;

        let input = _mm512_setr_ps(
            0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
        );
        let passthru = _mm512_set1_epi32(0);

        // No mask bits set.
        let untouched = _mm512_mask_cvt_roundps_epu32::<NEAREST>(passthru, 0, input);
        assert_eq_m512i(untouched, passthru);

        // Low half selected.
        let want = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0);
        let got = _mm512_mask_cvt_roundps_epu32::<NEAREST>(passthru, 0b00000000_11111111, input);
        assert_eq_m512i(got, want);
    }

    // Zero-masked f32 -> u32 conversion with round-to-nearest: an empty mask
    // must give zero; with the low eight mask bits set, only the first eight
    // lanes are converted and the rest are zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvt_roundps_epu32() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;

        let input = _mm512_setr_ps(
            0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
        );

        // No mask bits set.
        let cleared = _mm512_maskz_cvt_roundps_epu32::<NEAREST>(0, input);
        assert_eq_m512i(cleared, _mm512_setzero_si512());

        // Low half selected.
        let want = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0);
        let got = _mm512_maskz_cvt_roundps_epu32::<NEAREST>(0b00000000_11111111, input);
        assert_eq_m512i(got, want);
    }

    // i32 -> f32 conversion with round-to-nearest: each small integer is
    // expected to map to the float of the same value.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvt_roundepi32_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;

        let input = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16);
        let want = _mm512_setr_ps(
            0., -2., 2., -4., 4., -6., 6., -8., 8., 10., 10., 12., 12., 14., 14., 16.,
        );
        let got = _mm512_cvt_roundepi32_ps::<NEAREST>(input);
        assert_eq_m512(got, want);
    }

    // Merge-masked i32 -> f32 conversion with round-to-nearest: an empty mask
    // must return the pass-through vector; with the low eight mask bits set,
    // only the first eight lanes are converted.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvt_roundepi32_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;

        let input = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16);
        let passthru = _mm512_set1_ps(0.);

        // No mask bits set.
        let untouched = _mm512_mask_cvt_roundepi32_ps::<NEAREST>(passthru, 0, input);
        assert_eq_m512(untouched, passthru);

        // Low half selected.
        let want = _mm512_setr_ps(
            0., -2., 2., -4., 4., -6., 6., -8., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        let got = _mm512_mask_cvt_roundepi32_ps::<NEAREST>(passthru, 0b00000000_11111111, input);
        assert_eq_m512(got, want);
    }

    // Zero-masked i32 -> f32 conversion with round-to-nearest: an empty mask
    // must give zero; with the low eight mask bits set, only the first eight
    // lanes are converted and the rest are zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvt_roundepi32_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;

        let input = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16);

        // No mask bits set.
        let cleared = _mm512_maskz_cvt_roundepi32_ps::<NEAREST>(0, input);
        assert_eq_m512(cleared, _mm512_setzero_ps());

        // Low half selected.
        let want = _mm512_setr_ps(
            0., -2., 2., -4., 4., -6., 6., -8., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        let got = _mm512_maskz_cvt_roundepi32_ps::<NEAREST>(0b00000000_11111111, input);
        assert_eq_m512(got, want);
    }

    // u32 -> f32 conversion with round-to-nearest. Lanes written as negative
    // i32 values are read as large unsigned numbers, so each is expected to
    // equal the `4294967300.` literal; non-negative lanes convert exactly.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvt_roundepu32_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;

        let input = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16);
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0., 4294967300., 2., 4294967300.,
            4., 4294967300., 6., 4294967300.,
            8., 10., 10., 12.,
            12., 14., 14., 16.,
        );
        let got = _mm512_cvt_roundepu32_ps::<NEAREST>(input);
        assert_eq_m512(got, want);
    }

    // Merge-masked u32 -> f32 conversion with round-to-nearest: an empty mask
    // must return the pass-through vector; with the low eight mask bits set,
    // only the first eight lanes are converted.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvt_roundepu32_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;

        let input = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16);
        let passthru = _mm512_set1_ps(0.);

        // No mask bits set.
        let untouched = _mm512_mask_cvt_roundepu32_ps::<NEAREST>(passthru, 0, input);
        assert_eq_m512(untouched, passthru);

        // Low half selected.
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0., 4294967300., 2., 4294967300.,
            4., 4294967300., 6., 4294967300.,
            0., 0., 0., 0.,
            0., 0., 0., 0.,
        );
        let got = _mm512_mask_cvt_roundepu32_ps::<NEAREST>(passthru, 0b00000000_11111111, input);
        assert_eq_m512(got, want);
    }

    // Zero-masked u32 -> f32 conversion with round-to-nearest: an empty mask
    // must give zero; with the low eight mask bits set, only the first eight
    // lanes are converted and the rest are zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvt_roundepu32_ps() {
        const NEAREST: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;

        let input = _mm512_setr_epi32(0, -2, 2, -4, 4, -6, 6, -8, 8, 10, 10, 12, 12, 14, 14, 16);

        // No mask bits set.
        let cleared = _mm512_maskz_cvt_roundepu32_ps::<NEAREST>(0, input);
        assert_eq_m512(cleared, _mm512_setzero_ps());

        // Low half selected.
        #[rustfmt::skip]
        let want = _mm512_setr_ps(
            0., 4294967300., 2., 4294967300.,
            4., 4294967300., 6., 4294967300.,
            0., 0., 0., 0.,
            0., 0., 0., 0.,
        );
        let got = _mm512_maskz_cvt_roundepu32_ps::<NEAREST>(0b00000000_11111111, input);
        assert_eq_m512(got, want);
    }

    // f32 -> half-precision conversion of sixteen 1.0 lanes; every 64-bit
    // chunk of the result is expected to hold four copies of 0x3C00.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvt_roundps_ph() {
        // Four 16-bit 0x3C00 patterns packed into one 64-bit lane
        // (decimal 4323521613979991040).
        const F16_ONES: i64 = 0x3C00_3C00_3C00_3C00;

        let ones = _mm512_set1_ps(1.);
        let want = _mm256_setr_epi64x(F16_ONES, F16_ONES, F16_ONES, F16_ONES);
        let got = _mm512_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(ones);
        assert_eq_m256i(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvt_roundps_ph() {
        let a = _mm512_set1_ps(1.);
        let src = _mm256_set1_epi16(0);
        let r = _mm512_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a);
        assert_eq_m256i(r, src);
        let r = _mm512_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a);
        let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvt_roundps_ph() {
        let a = _mm512_set1_ps(1.);
        let r = _mm512_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a);
        assert_eq_m256i(r, _mm256_setzero_si256());
        let r = _mm512_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a);
        let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cvt_roundps_ph() {
        let a = _mm256_set1_ps(1.);
        let src = _mm_set1_epi16(0);
        let r = _mm256_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a);
        assert_eq_m128i(r, src);
        let r = _mm256_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b11111111, a);
        let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_cvt_roundps_ph() {
        let a = _mm256_set1_ps(1.);
        let r = _mm256_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a);
        assert_eq_m128i(r, _mm_setzero_si128());
        let r = _mm256_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b11111111, a);
        let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cvt_roundps_ph() {
        let a = _mm_set1_ps(1.);
        let src = _mm_set1_epi16(0);
        let r = _mm_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0, a);
        assert_eq_m128i(r, src);
        let r = _mm_mask_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(src, 0b00001111, a);
        let e = _mm_setr_epi64x(4323521613979991040, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_cvt_roundps_ph() {
        let a = _mm_set1_ps(1.);
        let r = _mm_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0, a);
        assert_eq_m128i(r, _mm_setzero_si128());
        let r = _mm_maskz_cvt_roundps_ph::<_MM_FROUND_NO_EXC>(0b00001111, a);
        let e = _mm_setr_epi64x(4323521613979991040, 0);
        assert_eq_m128i(r, e);
    }

    // f32 -> half-precision conversion of sixteen 1.0 lanes; every 64-bit
    // chunk of the result is expected to hold four copies of 0x3C00.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvtps_ph() {
        // Four 16-bit 0x3C00 patterns packed into one 64-bit lane
        // (decimal 4323521613979991040).
        const F16_ONES: i64 = 0x3C00_3C00_3C00_3C00;

        let ones = _mm512_set1_ps(1.);
        let want = _mm256_setr_epi64x(F16_ONES, F16_ONES, F16_ONES, F16_ONES);
        let got = _mm512_cvtps_ph::<_MM_FROUND_NO_EXC>(ones);
        assert_eq_m256i(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvtps_ph() {
        let a = _mm512_set1_ps(1.);
        let src = _mm256_set1_epi16(0);
        let r = _mm512_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a);
        assert_eq_m256i(r, src);
        let r = _mm512_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b00000000_11111111, a);
        let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvtps_ph() {
        let a = _mm512_set1_ps(1.);
        let r = _mm512_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a);
        assert_eq_m256i(r, _mm256_setzero_si256());
        let r = _mm512_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b00000000_11111111, a);
        let e = _mm256_setr_epi64x(4323521613979991040, 4323521613979991040, 0, 0);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cvtps_ph() {
        let a = _mm256_set1_ps(1.);
        let src = _mm_set1_epi16(0);
        let r = _mm256_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a);
        assert_eq_m128i(r, src);
        let r = _mm256_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b11111111, a);
        let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_cvtps_ph() {
        let a = _mm256_set1_ps(1.);
        let r = _mm256_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a);
        assert_eq_m128i(r, _mm_setzero_si128());
        let r = _mm256_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b11111111, a);
        let e = _mm_setr_epi64x(4323521613979991040, 4323521613979991040);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cvtps_ph() {
        let a = _mm_set1_ps(1.);
        let src = _mm_set1_epi16(0);
        let r = _mm_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0, a);
        assert_eq_m128i(r, src);
        let r = _mm_mask_cvtps_ph::<_MM_FROUND_NO_EXC>(src, 0b00001111, a);
        let e = _mm_setr_epi64x(4323521613979991040, 0);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_cvtps_ph() {
        let a = _mm_set1_ps(1.);
        let r = _mm_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0, a);
        assert_eq_m128i(r, _mm_setzero_si128());
        let r = _mm_maskz_cvtps_ph::<_MM_FROUND_NO_EXC>(0b00001111, a);
        let e = _mm_setr_epi64x(4323521613979991040, 0);
        assert_eq_m128i(r, e);
    }

    // Half-precision -> f32 conversion: sixteen 0x3C00 inputs are expected to
    // widen to sixteen 1.0 lanes.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvt_roundph_ps() {
        // Four 16-bit 0x3C00 patterns packed into one 64-bit lane
        // (decimal 4323521613979991040).
        const F16_ONES: i64 = 0x3C00_3C00_3C00_3C00;

        let halves = _mm256_setr_epi64x(F16_ONES, F16_ONES, F16_ONES, F16_ONES);
        let want = _mm512_set1_ps(1.);
        let got = _mm512_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(halves);
        assert_eq_m512(got, want);
    }

    // Write-masked f16 -> f32 conversion: a clear mask must return `src`; mask
    // 0b00000000_11111111 must give 1.0 in the first eight `setr` positions while the other
    // eight keep the 0.0 taken from `src`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvt_roundph_ps() {
        // Four 16-bit words of 0x3C00 (half-precision 1.0) per 64-bit element.
        let one_ph_x4: i64 = 0x3C00_3C00_3C00_3C00;
        let halves = _mm256_setr_epi64x(one_ph_x4, one_ph_x4, one_ph_x4, one_ph_x4);
        let fallback = _mm512_set1_ps(0.);

        let untouched = _mm512_mask_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(fallback, 0, halves);
        assert_eq_m512(untouched, fallback);

        let low_half = 0b00000000_11111111;
        let got = _mm512_mask_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(fallback, low_half, halves);
        let want = _mm512_setr_ps(
            1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        assert_eq_m512(got, want);
    }

    // Zero-masked f16 -> f32 conversion: a clear mask must give all zeros; mask
    // 0b00000000_11111111 must give 1.0 in the first eight `setr` positions and 0.0 elsewhere.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvt_roundph_ps() {
        // Four 16-bit words of 0x3C00 (half-precision 1.0) per 64-bit element.
        let one_ph_x4: i64 = 0x3C00_3C00_3C00_3C00;
        let halves = _mm256_setr_epi64x(one_ph_x4, one_ph_x4, one_ph_x4, one_ph_x4);

        let zeroed = _mm512_maskz_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(0, halves);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        let low_half = 0b00000000_11111111;
        let got = _mm512_maskz_cvt_roundph_ps::<_MM_FROUND_NO_EXC>(low_half, halves);
        let want = _mm512_setr_ps(
            1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        assert_eq_m512(got, want);
    }

    // f16 -> f32 conversion: sixteen 0x3C00 words must convert to sixteen 1.0f32 values.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvtph_ps() {
        // Four 16-bit words of 0x3C00 (half-precision 1.0) per 64-bit element.
        let one_ph_x4: i64 = 0x3C00_3C00_3C00_3C00;
        let halves = _mm256_setr_epi64x(one_ph_x4, one_ph_x4, one_ph_x4, one_ph_x4);

        let got = _mm512_cvtph_ps(halves);
        assert_eq_m512(got, _mm512_set1_ps(1.));
    }

    // Write-masked f16 -> f32 conversion: a clear mask must return `src`; mask
    // 0b00000000_11111111 must give 1.0 in the first eight `setr` positions while the other
    // eight keep the 0.0 taken from `src`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvtph_ps() {
        // Four 16-bit words of 0x3C00 (half-precision 1.0) per 64-bit element.
        let one_ph_x4: i64 = 0x3C00_3C00_3C00_3C00;
        let halves = _mm256_setr_epi64x(one_ph_x4, one_ph_x4, one_ph_x4, one_ph_x4);
        let fallback = _mm512_set1_ps(0.);

        let untouched = _mm512_mask_cvtph_ps(fallback, 0, halves);
        assert_eq_m512(untouched, fallback);

        let got = _mm512_mask_cvtph_ps(fallback, 0b00000000_11111111, halves);
        let want = _mm512_setr_ps(
            1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        assert_eq_m512(got, want);
    }

    // Zero-masked f16 -> f32 conversion: a clear mask must give all zeros; mask
    // 0b00000000_11111111 must give 1.0 in the first eight `setr` positions and 0.0 elsewhere.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvtph_ps() {
        // Four 16-bit words of 0x3C00 (half-precision 1.0) per 64-bit element.
        let one_ph_x4: i64 = 0x3C00_3C00_3C00_3C00;
        let halves = _mm256_setr_epi64x(one_ph_x4, one_ph_x4, one_ph_x4, one_ph_x4);

        let zeroed = _mm512_maskz_cvtph_ps(0, halves);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        let got = _mm512_maskz_cvtph_ps(0b00000000_11111111, halves);
        let want = _mm512_setr_ps(
            1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        assert_eq_m512(got, want);
    }

    // Write-masked f16 -> f32 conversion into a 256-bit vector: a clear mask must return
    // `src`, a full mask must give eight 1.0f32 values.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cvtph_ps() {
        // Four 16-bit words of 0x3C00 (half-precision 1.0) per 64-bit element.
        let one_ph_x4: i64 = 0x3C00_3C00_3C00_3C00;
        let halves = _mm_setr_epi64x(one_ph_x4, one_ph_x4);
        let fallback = _mm256_set1_ps(0.);

        let untouched = _mm256_mask_cvtph_ps(fallback, 0, halves);
        assert_eq_m256(untouched, fallback);

        let got = _mm256_mask_cvtph_ps(fallback, 0b11111111, halves);
        let want = _mm256_setr_ps(1., 1., 1., 1., 1., 1., 1., 1.);
        assert_eq_m256(got, want);
    }

    // Zero-masked f16 -> f32 conversion into a 256-bit vector: a clear mask must give zeros,
    // a full mask must give eight 1.0f32 values.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_cvtph_ps() {
        // Four 16-bit words of 0x3C00 (half-precision 1.0) per 64-bit element.
        let one_ph_x4: i64 = 0x3C00_3C00_3C00_3C00;
        let halves = _mm_setr_epi64x(one_ph_x4, one_ph_x4);

        let zeroed = _mm256_maskz_cvtph_ps(0, halves);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        let got = _mm256_maskz_cvtph_ps(0b11111111, halves);
        let want = _mm256_setr_ps(1., 1., 1., 1., 1., 1., 1., 1.);
        assert_eq_m256(got, want);
    }

    // Write-masked f16 -> f32 conversion into a 128-bit vector: a clear mask must return
    // `src`, mask 0b1111 must give four 1.0f32 values.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cvtph_ps() {
        // Four 16-bit words of 0x3C00 (half-precision 1.0) per 64-bit element.
        let one_ph_x4: i64 = 0x3C00_3C00_3C00_3C00;
        let halves = _mm_setr_epi64x(one_ph_x4, one_ph_x4);
        let fallback = _mm_set1_ps(0.);

        let untouched = _mm_mask_cvtph_ps(fallback, 0, halves);
        assert_eq_m128(untouched, fallback);

        let got = _mm_mask_cvtph_ps(fallback, 0b00001111, halves);
        let want = _mm_setr_ps(1., 1., 1., 1.);
        assert_eq_m128(got, want);
    }

    // Zero-masked f16 -> f32 conversion into a 128-bit vector: a clear mask must give zeros,
    // mask 0b1111 must give four 1.0f32 values.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_cvtph_ps() {
        // Four 16-bit words of 0x3C00 (half-precision 1.0) per 64-bit element.
        let one_ph_x4: i64 = 0x3C00_3C00_3C00_3C00;
        let halves = _mm_setr_epi64x(one_ph_x4, one_ph_x4);

        let zeroed = _mm_maskz_cvtph_ps(0, halves);
        assert_eq_m128(zeroed, _mm_setzero_ps());

        let got = _mm_maskz_cvtph_ps(0b00001111, halves);
        let want = _mm_setr_ps(1., 1., 1., 1.);
        assert_eq_m128(got, want);
    }

    // Truncating f32 -> i32 conversion: the fractional part is dropped toward zero
    // (e.g. -1.5 -> -1, 9.5 -> 9).
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvtt_roundps_epi32() {
        let input = _mm512_setr_ps(
            0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
        );
        let want = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15);
        let got = _mm512_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(input);
        assert_eq_m512i(got, want);
    }

    // Write-masked truncating f32 -> i32 conversion: a clear mask must return `src`; with
    // mask 0b00000000_11111111 only the first eight `setr` positions are converted and the
    // rest keep the zero from `src`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvtt_roundps_epi32() {
        let input = _mm512_setr_ps(
            0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
        );
        let fallback = _mm512_set1_epi32(0);

        let untouched = _mm512_mask_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(fallback, 0, input);
        assert_eq_m512i(untouched, fallback);

        let low_half = 0b00000000_11111111;
        let got = _mm512_mask_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(fallback, low_half, input);
        let want = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m512i(got, want);
    }

    // Zero-masked truncating f32 -> i32 conversion: a clear mask must give zeros; with mask
    // 0b00000000_11111111 only the first eight `setr` positions are converted.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvtt_roundps_epi32() {
        let input = _mm512_setr_ps(
            0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
        );

        let zeroed = _mm512_maskz_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(0, input);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());

        let low_half = 0b00000000_11111111;
        let got = _mm512_maskz_cvtt_roundps_epi32::<_MM_FROUND_NO_EXC>(low_half, input);
        let want = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m512i(got, want);
    }

    // Truncating f32 -> u32 conversion: non-negative inputs are truncated, while every
    // negative input is expected to produce an all-ones element (written as -1 here).
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvtt_roundps_epu32() {
        let input = _mm512_setr_ps(
            0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
        );
        let want = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15);
        let got = _mm512_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(input);
        assert_eq_m512i(got, want);
    }

    // Write-masked truncating f32 -> u32 conversion: a clear mask must return `src`; with
    // mask 0b00000000_11111111 only the first eight `setr` positions are converted (negative
    // inputs giving all-ones) and the rest keep the zero from `src`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvtt_roundps_epu32() {
        let input = _mm512_setr_ps(
            0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
        );
        let fallback = _mm512_set1_epi32(0);

        let untouched = _mm512_mask_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(fallback, 0, input);
        assert_eq_m512i(untouched, fallback);

        let low_half = 0b00000000_11111111;
        let got = _mm512_mask_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(fallback, low_half, input);
        let want = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m512i(got, want);
    }

    // Zero-masked truncating f32 -> u32 conversion: a clear mask must give zeros; with mask
    // 0b00000000_11111111 only the first eight `setr` positions are converted (negative inputs
    // giving all-ones).
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvtt_roundps_epu32() {
        let input = _mm512_setr_ps(
            0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
        );

        let zeroed = _mm512_maskz_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(0, input);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());

        let low_half = 0b00000000_11111111;
        let got = _mm512_maskz_cvtt_roundps_epu32::<_MM_FROUND_NO_EXC>(low_half, input);
        let want = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m512i(got, want);
    }

    // Truncating f32 -> i32 conversion: the fractional part is dropped toward zero
    // (e.g. -1.5 -> -1, 9.5 -> 9).
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvttps_epi32() {
        let input = _mm512_setr_ps(
            0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
        );
        let want = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15);
        let got = _mm512_cvttps_epi32(input);
        assert_eq_m512i(got, want);
    }

    // Write-masked truncating f32 -> i32 conversion: a clear mask must return `src`; with
    // mask 0b00000000_11111111 only the first eight `setr` positions are converted and the
    // rest keep the zero from `src`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvttps_epi32() {
        let input = _mm512_setr_ps(
            0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
        );
        let fallback = _mm512_set1_epi32(0);

        let untouched = _mm512_mask_cvttps_epi32(fallback, 0, input);
        assert_eq_m512i(untouched, fallback);

        let got = _mm512_mask_cvttps_epi32(fallback, 0b00000000_11111111, input);
        let want = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m512i(got, want);
    }

    // Zero-masked truncating f32 -> i32 conversion: a clear mask must give zeros; with mask
    // 0b00000000_11111111 only the first eight `setr` positions are converted.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvttps_epi32() {
        let input = _mm512_setr_ps(
            0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
        );

        let zeroed = _mm512_maskz_cvttps_epi32(0, input);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());

        let got = _mm512_maskz_cvttps_epi32(0b00000000_11111111, input);
        let want = _mm512_setr_epi32(0, -1, 2, -3, 4, -5, 6, -7, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m512i(got, want);
    }

    // Write-masked truncating f32 -> i32 conversion (256-bit): a clear mask must return `src`,
    // a full mask must truncate every element (9.5 -> 9, 15.5 -> 15, ...).
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cvttps_epi32() {
        let input = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5);
        let fallback = _mm256_set1_epi32(0);

        let untouched = _mm256_mask_cvttps_epi32(fallback, 0, input);
        assert_eq_m256i(untouched, fallback);

        let got = _mm256_mask_cvttps_epi32(fallback, 0b11111111, input);
        let want = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m256i(got, want);
    }

    // Zero-masked truncating f32 -> i32 conversion (256-bit): a clear mask must give zeros,
    // a full mask must truncate every element.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_cvttps_epi32() {
        let input = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5);

        let zeroed = _mm256_maskz_cvttps_epi32(0, input);
        assert_eq_m256i(zeroed, _mm256_setzero_si256());

        let got = _mm256_maskz_cvttps_epi32(0b11111111, input);
        let want = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m256i(got, want);
    }

    // Write-masked truncating f32 -> i32 conversion (128-bit): a clear mask must return `src`,
    // mask 0b1111 must truncate all four elements.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cvttps_epi32() {
        let input = _mm_set_ps(12., 13.5, 14., 15.5);
        let fallback = _mm_set1_epi32(0);

        let untouched = _mm_mask_cvttps_epi32(fallback, 0, input);
        assert_eq_m128i(untouched, fallback);

        let got = _mm_mask_cvttps_epi32(fallback, 0b00001111, input);
        let want = _mm_set_epi32(12, 13, 14, 15);
        assert_eq_m128i(got, want);
    }

    // Zero-masked truncating f32 -> i32 conversion (128-bit): a clear mask must give zeros,
    // mask 0b1111 must truncate all four elements.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_cvttps_epi32() {
        let input = _mm_set_ps(12., 13.5, 14., 15.5);

        let zeroed = _mm_maskz_cvttps_epi32(0, input);
        assert_eq_m128i(zeroed, _mm_setzero_si128());

        let got = _mm_maskz_cvttps_epi32(0b00001111, input);
        let want = _mm_set_epi32(12, 13, 14, 15);
        assert_eq_m128i(got, want);
    }

    // Truncating f32 -> u32 conversion: non-negative inputs are truncated, while every
    // negative input is expected to produce an all-ones element (written as -1 here).
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvttps_epu32() {
        let input = _mm512_setr_ps(
            0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
        );
        let want = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 8, 9, 10, 11, 12, 13, 14, 15);
        let got = _mm512_cvttps_epu32(input);
        assert_eq_m512i(got, want);
    }

    // Write-masked truncating f32 -> u32 conversion: a clear mask must return `src`; with
    // mask 0b00000000_11111111 only the first eight `setr` positions are converted (negative
    // inputs giving all-ones) and the rest keep the zero from `src`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvttps_epu32() {
        let input = _mm512_setr_ps(
            0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
        );
        let fallback = _mm512_set1_epi32(0);

        let untouched = _mm512_mask_cvttps_epu32(fallback, 0, input);
        assert_eq_m512i(untouched, fallback);

        let got = _mm512_mask_cvttps_epu32(fallback, 0b00000000_11111111, input);
        let want = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m512i(got, want);
    }

    // Zero-masked truncating f32 -> u32 conversion: a clear mask must give zeros; with mask
    // 0b00000000_11111111 only the first eight `setr` positions are converted (negative inputs
    // giving all-ones).
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_cvttps_epu32() {
        let input = _mm512_setr_ps(
            0., -1.5, 2., -3.5, 4., -5.5, 6., -7.5, 8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5,
        );

        let zeroed = _mm512_maskz_cvttps_epu32(0, input);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());

        let got = _mm512_maskz_cvttps_epu32(0b00000000_11111111, input);
        let want = _mm512_setr_epi32(0, -1, 2, -1, 4, -1, 6, -1, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m512i(got, want);
    }

    // Truncating f32 -> u32 conversion (256-bit) on non-negative inputs: 9.5 -> 9, 15.5 -> 15.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_cvttps_epu32() {
        let input = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5);
        let want = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
        let got = _mm256_cvttps_epu32(input);
        assert_eq_m256i(got, want);
    }

    // Write-masked truncating f32 -> u32 conversion (256-bit): a clear mask must return `src`,
    // a full mask must truncate every element.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cvttps_epu32() {
        let input = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5);
        let fallback = _mm256_set1_epi32(0);

        let untouched = _mm256_mask_cvttps_epu32(fallback, 0, input);
        assert_eq_m256i(untouched, fallback);

        let got = _mm256_mask_cvttps_epu32(fallback, 0b11111111, input);
        let want = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m256i(got, want);
    }

    // Zero-masked truncating f32 -> u32 conversion (256-bit): a clear mask must give zeros,
    // a full mask must truncate every element.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_cvttps_epu32() {
        let input = _mm256_set_ps(8., 9.5, 10., 11.5, 12., 13.5, 14., 15.5);

        let zeroed = _mm256_maskz_cvttps_epu32(0, input);
        assert_eq_m256i(zeroed, _mm256_setzero_si256());

        let got = _mm256_maskz_cvttps_epu32(0b11111111, input);
        let want = _mm256_set_epi32(8, 9, 10, 11, 12, 13, 14, 15);
        assert_eq_m256i(got, want);
    }

    // Truncating f32 -> u32 conversion (128-bit) on non-negative inputs: 13.5 -> 13, 15.5 -> 15.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_cvttps_epu32() {
        let input = _mm_set_ps(12., 13.5, 14., 15.5);
        let want = _mm_set_epi32(12, 13, 14, 15);
        let got = _mm_cvttps_epu32(input);
        assert_eq_m128i(got, want);
    }

    // Write-masked truncating f32 -> u32 conversion (128-bit): a clear mask must return `src`,
    // mask 0b1111 must truncate all four elements.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cvttps_epu32() {
        let input = _mm_set_ps(12., 13.5, 14., 15.5);
        let fallback = _mm_set1_epi32(0);

        let untouched = _mm_mask_cvttps_epu32(fallback, 0, input);
        assert_eq_m128i(untouched, fallback);

        let got = _mm_mask_cvttps_epu32(fallback, 0b00001111, input);
        let want = _mm_set_epi32(12, 13, 14, 15);
        assert_eq_m128i(got, want);
    }

    // Zero-masked truncating f32 -> u32 conversion (128-bit): a clear mask must give zeros,
    // mask 0b1111 must truncate all four elements.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_cvttps_epu32() {
        let input = _mm_set_ps(12., 13.5, 14., 15.5);

        let zeroed = _mm_maskz_cvttps_epu32(0, input);
        assert_eq_m128i(zeroed, _mm_setzero_si128());

        let got = _mm_maskz_cvttps_epu32(0b00001111, input);
        let want = _mm_set_epi32(12, 13, 14, 15);
        assert_eq_m128i(got, want);
    }

    // Gathers sixteen f32 values from a 256-entry table in which entry `i` holds `i as f32`,
    // so each gathered value must equal its own index.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_i32gather_ps() {
        let mut table = [0f32; 256];
        for (i, slot) in table.iter_mut().enumerate() {
            *slot = i as f32;
        }
        // A multiplier of 4 is word-addressing
        #[rustfmt::skip]
        let offsets = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
                                        120, 128, 136, 144, 152, 160, 168, 176);
        let gathered = _mm512_i32gather_ps::<4>(offsets, table.as_ptr() as *const u8);
        #[rustfmt::skip]
        let want = _mm512_setr_ps(0., 16., 32., 48., 64., 80., 96., 112.,
                                  120., 128., 136., 144., 152., 160., 168., 176.);
        assert_eq_m512(gathered, want);
    }

    // Masked gather from a table in which entry `i` holds `i as f32`: with the alternating
    // mask 0b10101010_10101010 every second position is gathered and the others keep the
    // 2.0 supplied in `src`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_i32gather_ps() {
        let mut table = [0f32; 256];
        for (i, slot) in table.iter_mut().enumerate() {
            *slot = i as f32;
        }
        let fallback = _mm512_set1_ps(2.);
        let odd_lanes = 0b10101010_10101010;
        #[rustfmt::skip]
        let offsets = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
                                        120, 128, 136, 144, 152, 160, 168, 176);
        // A multiplier of 4 is word-addressing
        let base = table.as_ptr() as *const u8;
        let gathered = _mm512_mask_i32gather_ps::<4>(fallback, odd_lanes, offsets, base);
        #[rustfmt::skip]
        let want = _mm512_setr_ps(2., 16., 2., 48., 2., 80., 2., 112.,
                                  2., 128., 2., 144., 2., 160., 2., 176.);
        assert_eq_m512(gathered, want);
    }

    // Gathers sixteen i32 values from a 256-entry table in which entry `i` holds `i`, so the
    // gathered vector must equal the index vector itself.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_i32gather_epi32() {
        let mut table = [0i32; 256];
        for (i, slot) in table.iter_mut().enumerate() {
            *slot = i as i32;
        }
        // A multiplier of 4 is word-addressing
        #[rustfmt::skip]
        let offsets = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
                                        120, 128, 136, 144, 152, 160, 168, 176);
        let gathered = _mm512_i32gather_epi32::<4>(offsets, table.as_ptr() as *const u8);
        #[rustfmt::skip]
        let want = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
                                     120, 128, 136, 144, 152, 160, 168, 176);
        assert_eq_m512i(gathered, want);
    }

    // Masked gather from a table in which entry `i` holds `i`: with the alternating mask
    // 0b10101010_10101010 every second position is gathered and the others keep the 2
    // supplied in `src`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_i32gather_epi32() {
        let mut table = [0i32; 256];
        for (i, slot) in table.iter_mut().enumerate() {
            *slot = i as i32;
        }
        let fallback = _mm512_set1_epi32(2);
        let odd_lanes = 0b10101010_10101010;
        let offsets = _mm512_setr_epi32(
            0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
        );
        // A multiplier of 4 is word-addressing
        let base = table.as_ptr() as *const u8;
        let gathered = _mm512_mask_i32gather_epi32::<4>(fallback, odd_lanes, offsets, base);
        let want = _mm512_setr_epi32(2, 16, 2, 48, 2, 80, 2, 112, 2, 144, 2, 176, 2, 208, 2, 240);
        assert_eq_m512i(gathered, want);
    }

    // Scatters the values 1.0..=16.0 into a zeroed 256-entry array at indices 0, 16, ..., 240
    // and checks that exactly those entries were written.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_i32scatter_ps() {
        let mut target = [0f32; 256];
        #[rustfmt::skip]
        let offsets = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
                                        128, 144, 160, 176, 192, 208, 224, 240);
        let values = _mm512_setr_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        // A multiplier of 4 is word-addressing
        _mm512_i32scatter_ps::<4>(target.as_mut_ptr() as *mut u8, offsets, values);

        // Value k + 1 belongs at entry 16 * k; every other entry stays zero.
        let mut want = [0f32; 256];
        for (k, slot) in want.iter_mut().step_by(16).enumerate() {
            *slot = (k + 1) as f32;
        }
        assert_eq!(&target[..], &want[..]);
    }

    // Masked scatter with the alternating mask 0b10101010_10101010: only every second value
    // (2.0, 4.0, ..., 16.0) is stored, landing at entries 16, 48, ..., 240.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_i32scatter_ps() {
        let mut target = [0f32; 256];
        let odd_lanes = 0b10101010_10101010;
        #[rustfmt::skip]
        let offsets = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
                                        128, 144, 160, 176, 192, 208, 224, 240);
        let values = _mm512_setr_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        // A multiplier of 4 is word-addressing
        _mm512_mask_i32scatter_ps::<4>(target.as_mut_ptr() as *mut u8, odd_lanes, offsets, values);

        // Value 2 * (k + 1) belongs at entry 32 * k + 16; every other entry stays zero.
        let mut want = [0f32; 256];
        for (k, slot) in want.iter_mut().skip(16).step_by(32).enumerate() {
            *slot = 2. * (k + 1) as f32;
        }
        assert_eq!(&target[..], &want[..]);
    }

    // Scatters the values 1..=16 into a zeroed 256-entry array at indices 0, 16, ..., 240
    // and checks that exactly those entries were written.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_i32scatter_epi32() {
        let mut target = [0i32; 256];
        #[rustfmt::skip]
        let offsets = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
                                        128, 144, 160, 176, 192, 208, 224, 240);
        let values = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        // A multiplier of 4 is word-addressing
        _mm512_i32scatter_epi32::<4>(target.as_mut_ptr() as *mut u8, offsets, values);

        // Value k + 1 belongs at entry 16 * k; every other entry stays zero.
        let mut want = [0i32; 256];
        for (k, slot) in want.iter_mut().step_by(16).enumerate() {
            *slot = (k + 1) as i32;
        }
        assert_eq!(&target[..], &want[..]);
    }

    // Masked scatter with the alternating mask 0b10101010_10101010: only every second value
    // (2, 4, ..., 16) is stored, landing at entries 16, 48, ..., 240.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_i32scatter_epi32() {
        let mut target = [0i32; 256];
        let odd_lanes = 0b10101010_10101010;
        #[rustfmt::skip]
        let offsets = _mm512_setr_epi32(0, 16, 32, 48, 64, 80, 96, 112,
                                        128, 144, 160, 176, 192, 208, 224, 240);
        let values = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        // A multiplier of 4 is word-addressing
        let base = target.as_mut_ptr() as *mut u8;
        _mm512_mask_i32scatter_epi32::<4>(base, odd_lanes, offsets, values);

        // Value 2 * (k + 1) belongs at entry 32 * k + 16; every other entry stays zero.
        let mut want = [0i32; 256];
        for (k, slot) in want.iter_mut().skip(16).step_by(32).enumerate() {
            *slot = 2 * (k + 1) as i32;
        }
        assert_eq!(&target[..], &want[..]);
    }

    // `lhs < -1.0` holds only for f32::MIN and -100.0, so two bits per 8-element half are
    // expected; the NaN element must not compare as less-than.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cmplt_ps_mask() {
        #[rustfmt::skip]
        let lhs = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.,
                                0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.);
        let rhs = _mm512_set1_ps(-1.);
        let got = _mm512_cmplt_ps_mask(lhs, rhs);
        assert_eq!(got, 0b00000101_00000101);
    }

    // Masked `lhs < -1.0`: the unmasked result 0b00000101 per half is restricted by the mask
    // 0b01100110, leaving only bit 2 of each half.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmplt_ps_mask() {
        #[rustfmt::skip]
        let lhs = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.,
                                0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.);
        let rhs = _mm512_set1_ps(-1.);
        let selector = 0b01100110_01100110;
        let got = _mm512_mask_cmplt_ps_mask(selector, lhs, rhs);
        assert_eq!(got, 0b00000100_00000100);
    }

    // "Not less than" must be the exact bitwise complement of "less than" on the same inputs,
    // including the NaN elements.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cmpnlt_ps_mask() {
        #[rustfmt::skip]
        let lhs = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.,
                                0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.);
        let rhs = _mm512_set1_ps(-1.);
        let not_less = _mm512_cmpnlt_ps_mask(lhs, rhs);
        let less = _mm512_cmplt_ps_mask(lhs, rhs);
        assert_eq!(not_less, !less);
    }

    // Masked "not less than -1.0": every bit selected by the mask 0b01111010 belongs to an
    // element that is not less than -1.0 (or is NaN), so the result equals the mask itself.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmpnlt_ps_mask() {
        #[rustfmt::skip]
        let lhs = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.,
                                0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.);
        let rhs = _mm512_set1_ps(-1.);
        let selector = 0b01111010_01111010;
        let got = _mm512_mask_cmpnlt_ps_mask(selector, lhs, rhs);
        assert_eq!(got, 0b01111010_01111010);
    }

    // "Not less-or-equal" with the operands swapped (`-1.0` on the left): true where the
    // element is below -1.0 (f32::MIN, -100.0) or is NaN.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cmpnle_ps_mask() {
        #[rustfmt::skip]
        let values = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.,
                                   0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.);
        let minus_one = _mm512_set1_ps(-1.);
        let got = _mm512_cmpnle_ps_mask(minus_one, values);
        assert_eq!(got, 0b00001101_00001101);
    }

    // Masked "not less-or-equal" with `-1.0` on the left: the unmasked result 0b00001101 per
    // half is restricted by the mask 0b01100110, leaving only bit 2 of each half.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmpnle_ps_mask() {
        #[rustfmt::skip]
        let values = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.,
                                   0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.);
        let minus_one = _mm512_set1_ps(-1.);
        let selector = 0b01100110_01100110;
        let got = _mm512_mask_cmpnle_ps_mask(selector, minus_one, values);
        assert_eq!(got, 0b00000100_00000100);
    }

    // `lhs <= -1.0` holds for -1.0, f32::MIN and -100.0 in each 8-element half.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cmple_ps_mask() {
        #[rustfmt::skip]
        let lhs = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.,
                                0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.);
        let rhs = _mm512_set1_ps(-1.);
        let got = _mm512_cmple_ps_mask(lhs, rhs);
        assert_eq!(got, 0b00100101_00100101);
    }

    // Masked `lhs <= -1.0`: the unmasked result 0b00100101 per half is restricted by the mask
    // 0b01111010, leaving only bit 5 of each half.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmple_ps_mask() {
        #[rustfmt::skip]
        let lhs = _mm512_set_ps(0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.,
                                0., 1., -1., f32::MAX, f32::NAN, f32::MIN, 100., -100.);
        let rhs = _mm512_set1_ps(-1.);
        let selector = 0b01111010_01111010;
        let got = _mm512_mask_cmple_ps_mask(selector, lhs, rhs);
        assert_eq!(got, 0b00100000_00100000);
    }

    // Element-wise equality: the two vectors differ in two positions per half, and the
    // NaN/NaN pair must not compare equal, leaving five set bits per half.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cmpeq_ps_mask() {
        #[rustfmt::skip]
        let first = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.,
                                  0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.);
        #[rustfmt::skip]
        let second = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.,
                                   0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.);
        let got = _mm512_cmpeq_ps_mask(second, first);
        assert_eq!(got, 0b11001101_11001101);
    }

    // Masked element-wise equality: the unmasked result 0b11001101 per half is restricted by
    // the mask 0b01111010, leaving bits 6 and 3 of each half.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmpeq_ps_mask() {
        #[rustfmt::skip]
        let first = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.,
                                  0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.);
        #[rustfmt::skip]
        let second = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.,
                                   0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.);
        let selector = 0b01111010_01111010;
        let got = _mm512_mask_cmpeq_ps_mask(selector, second, first);
        assert_eq!(got, 0b01001000_01001000);
    }

    // Element-wise inequality: true for the two differing positions per half and for the
    // NaN/NaN pair.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cmpneq_ps_mask() {
        #[rustfmt::skip]
        let first = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.,
                                  0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.);
        #[rustfmt::skip]
        let second = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.,
                                   0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.);
        let got = _mm512_cmpneq_ps_mask(second, first);
        assert_eq!(got, 0b00110010_00110010);
    }

    // Masked element-wise inequality: every bit of the unmasked result 0b00110010 per half is
    // also selected by the mask 0b01111010, so the result is unchanged by masking.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmpneq_ps_mask() {
        #[rustfmt::skip]
        let first = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.,
                                  0., 1., -1., 13., f32::MAX, f32::MIN, f32::NAN, -100.);
        #[rustfmt::skip]
        let second = _mm512_set_ps(0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.,
                                   0., 1., 13., 42., f32::MAX, f32::MIN, f32::NAN, -100.);
        let selector = 0b01111010_01111010;
        let got = _mm512_mask_cmpneq_ps_mask(selector, second, first);
        assert_eq!(got, 0b00110010_00110010);
    }

    // Generic compare with the `_CMP_LT_OQ` predicate: `lhs < -1.0` holds only for f32::MIN
    // and -100.0 in each 8-element half.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cmp_ps_mask() {
        #[rustfmt::skip]
        let lhs = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.,
                                0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.);
        let rhs = _mm512_set1_ps(-1.);
        let got = _mm512_cmp_ps_mask::<_CMP_LT_OQ>(lhs, rhs);
        assert_eq!(got, 0b00000101_00000101);
    }

    // Masked generic compare with `_CMP_LT_OQ`: the unmasked result 0b00000101 per half is
    // restricted by the mask 0b01100110, leaving only bit 2 of each half.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmp_ps_mask() {
        #[rustfmt::skip]
        let lhs = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.,
                                0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.);
        let rhs = _mm512_set1_ps(-1.);
        let selector = 0b01100110_01100110;
        let got = _mm512_mask_cmp_ps_mask::<_CMP_LT_OQ>(selector, lhs, rhs);
        assert_eq!(got, 0b00000100_00000100);
    }

    // 256-bit generic compare with `_CMP_LT_OQ`: `lhs < -1.0` holds only for f32::MIN and
    // -100.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_cmp_ps_mask() {
        let lhs = _mm256_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.);
        let rhs = _mm256_set1_ps(-1.);
        let got = _mm256_cmp_ps_mask::<_CMP_LT_OQ>(lhs, rhs);
        assert_eq!(got, 0b00000101);
    }

    // 256-bit masked compare with `_CMP_LT_OQ`: the unmasked result 0b00000101 is restricted
    // by the mask 0b01100110, leaving only bit 2.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cmp_ps_mask() {
        let lhs = _mm256_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.);
        let rhs = _mm256_set1_ps(-1.);
        let selector = 0b01100110;
        let got = _mm256_mask_cmp_ps_mask::<_CMP_LT_OQ>(selector, lhs, rhs);
        assert_eq!(got, 0b00000100);
    }

    // 128-bit generic compare with `_CMP_LT_OQ`: of (0, 1, -1, 13) only 0 and -1 are below
    // 1.0, which are bits 3 and 1 in `set` order.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_cmp_ps_mask() {
        let lhs = _mm_set_ps(0., 1., -1., 13.);
        let rhs = _mm_set1_ps(1.);
        let got = _mm_cmp_ps_mask::<_CMP_LT_OQ>(lhs, rhs);
        assert_eq!(got, 0b00001010);
    }

    // 128-bit masked compare with `_CMP_LT_OQ` and an all-ones mask: the result must match
    // the unmasked comparison (bits 3 and 1).
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cmp_ps_mask() {
        let lhs = _mm_set_ps(0., 1., -1., 13.);
        let rhs = _mm_set1_ps(1.);
        let selector = 0b11111111;
        let got = _mm_mask_cmp_ps_mask::<_CMP_LT_OQ>(selector, lhs, rhs);
        assert_eq!(got, 0b00001010);
    }

    // Generic compare with `_CMP_LT_OQ` and `_MM_FROUND_CUR_DIRECTION`: `lhs < -1.0` holds
    // only for f32::MIN and -100.0 in each 8-element half.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cmp_round_ps_mask() {
        #[rustfmt::skip]
        let lhs = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.,
                                0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.);
        let rhs = _mm512_set1_ps(-1.);
        let got = _mm512_cmp_round_ps_mask::<_CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION>(lhs, rhs);
        assert_eq!(got, 0b00000101_00000101);
    }

    // Masked compare with `_CMP_LT_OQ` and `_MM_FROUND_CUR_DIRECTION`: the unmasked result
    // 0b00000101 per half is restricted by the mask 0b01100110, leaving only bit 2.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmp_round_ps_mask() {
        #[rustfmt::skip]
        let lhs = _mm512_set_ps(0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.,
                                0., 1., -1., 13., f32::MAX, f32::MIN, 100., -100.);
        let rhs = _mm512_set1_ps(-1.);
        let k = 0b01100110_01100110;
        let got = _mm512_mask_cmp_round_ps_mask::<_CMP_LT_OQ, _MM_FROUND_CUR_DIRECTION>(k, lhs, rhs);
        assert_eq!(got, 0b00000100_00000100);
    }

    // "Ordered" compare: a bit is expected only where neither operand is NaN, which is the
    // case for two positions (bits 2 and 0) in each 8-element half.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cmpord_ps_mask() {
        #[rustfmt::skip]
        let lhs = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0.,
                                f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.);
        #[rustfmt::skip]
        let rhs = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0.,
                                f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.);
        let got = _mm512_cmpord_ps_mask(lhs, rhs);
        assert_eq!(got, 0b00000101_00000101);
    }

    // Masked "ordered" compare: the unmasked result 0b00000101 per half is restricted by the
    // mask 0b11000011, leaving only bit 0 of each half.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmpord_ps_mask() {
        #[rustfmt::skip]
        let lhs = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0.,
                                f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.);
        #[rustfmt::skip]
        let rhs = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0.,
                                f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.);
        let selector = 0b11000011_11000011;
        let got = _mm512_mask_cmpord_ps_mask(selector, lhs, rhs);
        assert_eq!(got, 0b00000001_00000001);
    }

    // "Unordered" compare: a bit is expected wherever at least one operand is NaN, i.e.
    // everywhere except bits 2 and 0 of each 8-element half.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cmpunord_ps_mask() {
        #[rustfmt::skip]
        let lhs = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0.,
                                f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.);
        #[rustfmt::skip]
        let rhs = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0.,
                                f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.);
        let got = _mm512_cmpunord_ps_mask(lhs, rhs);
        assert_eq!(got, 0b11111010_11111010);
    }

    // Masked "unordered" compare: the unmasked result 0b11111010 per 8-element half is
    // restricted by the mask 0b00001111, leaving bits 3 and 1 of each half.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmpunord_ps_mask() {
        #[rustfmt::skip]
        let a = _mm512_set_ps(f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, -1., f32::NAN, 0.,
                              f32::NAN, f32::MAX, f32::NAN, f32::MIN, f32::NAN, 1., f32::NAN, 2.);
        #[rustfmt::skip]
        let b = _mm512_set_ps(f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 0.,
                              f32::NAN, f32::NAN, f32::NAN, f32::NAN, f32::MIN, f32::MAX, -1., 2.);
        let mask = 0b00001111_00001111;
        let m = _mm512_mask_cmpunord_ps_mask(mask, a, b);
        // Exactly eight binary digits per group, so the literal reads as a 16-bit mask.
        assert_eq!(m, 0b00001010_00001010);
    }

    // Scalar single-precision compare with `_CMP_GE_OS`: the first elements are 2.0 and 1.0,
    // so the comparison holds and the result is expected to be 1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cmp_ss_mask() {
        let lhs = _mm_setr_ps(2., 1., 1., 1.);
        let rhs = _mm_setr_ps(1., 2., 2., 2.);
        let got = _mm_cmp_ss_mask::<_CMP_GE_OS>(lhs, rhs);
        assert_eq!(got, 1);
    }

    // Masked scalar single-precision compare with `_CMP_GE_OS`: a mask without bit 0 must
    // give 0, a mask with bit 0 set must report the 2.0 >= 1.0 result as 1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_cmp_ss_mask() {
        let lhs = _mm_setr_ps(2., 1., 1., 1.);
        let rhs = _mm_setr_ps(1., 2., 2., 2.);

        let suppressed = _mm_mask_cmp_ss_mask::<_CMP_GE_OS>(0b10, lhs, rhs);
        assert_eq!(suppressed, 0);

        let enabled = _mm_mask_cmp_ss_mask::<_CMP_GE_OS>(0b1, lhs, rhs);
        assert_eq!(enabled, 1);
    }

    // Scalar f32 compare with `_CMP_GE_OS` and `_MM_FROUND_CUR_DIRECTION`:
    // the first `setr` elements are 2.0 and 1.0, and the expected mask is 1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cmp_round_ss_mask() {
        let lhs = _mm_setr_ps(2., 1., 1., 1.);
        let rhs = _mm_setr_ps(1., 2., 2., 2.);
        let got = _mm_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(lhs, rhs);
        assert_eq!(got, 1);
    }

    // Masked scalar f32 compare with `_CMP_GE_OS` and
    // `_MM_FROUND_CUR_DIRECTION`: mask `0b10` must give 0, mask `0b1` gives 1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_cmp_round_ss_mask() {
        let lhs = _mm_setr_ps(2., 1., 1., 1.);
        let rhs = _mm_setr_ps(1., 2., 2., 2.);
        let masked_out =
            _mm_mask_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b10, lhs, rhs);
        assert_eq!(masked_out, 0);
        let masked_in =
            _mm_mask_cmp_round_ss_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b1, lhs, rhs);
        assert_eq!(masked_in, 1);
    }

    // Scalar f64 compare with `_CMP_GE_OS`: the first `setr` elements are
    // 2.0 and 1.0, and the expected result mask is 1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cmp_sd_mask() {
        let lhs = _mm_setr_pd(2., 1.);
        let rhs = _mm_setr_pd(1., 2.);
        let got = _mm_cmp_sd_mask::<_CMP_GE_OS>(lhs, rhs);
        assert_eq!(got, 1);
    }

    // Masked scalar f64 compare with `_CMP_GE_OS`: a mask of `0b10` must give
    // 0, while a mask of `0b1` must give 1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_cmp_sd_mask() {
        let lhs = _mm_setr_pd(2., 1.);
        let rhs = _mm_setr_pd(1., 2.);
        let masked_out = _mm_mask_cmp_sd_mask::<_CMP_GE_OS>(0b10, lhs, rhs);
        assert_eq!(masked_out, 0);
        let masked_in = _mm_mask_cmp_sd_mask::<_CMP_GE_OS>(0b1, lhs, rhs);
        assert_eq!(masked_in, 1);
    }

    // Scalar f64 compare with `_CMP_GE_OS` and `_MM_FROUND_CUR_DIRECTION`:
    // the first `setr` elements are 2.0 and 1.0, and the expected mask is 1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cmp_round_sd_mask() {
        let lhs = _mm_setr_pd(2., 1.);
        let rhs = _mm_setr_pd(1., 2.);
        let got = _mm_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(lhs, rhs);
        assert_eq!(got, 1);
    }

    // Masked scalar f64 compare with `_CMP_GE_OS` and
    // `_MM_FROUND_CUR_DIRECTION`: mask `0b10` must give 0, mask `0b1` gives 1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_cmp_round_sd_mask() {
        let lhs = _mm_setr_pd(2., 1.);
        let rhs = _mm_setr_pd(1., 2.);
        let masked_out =
            _mm_mask_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b10, lhs, rhs);
        assert_eq!(masked_out, 0);
        let masked_in =
            _mm_mask_cmp_round_sd_mask::<_CMP_GE_OS, _MM_FROUND_CUR_DIRECTION>(0b1, lhs, rhs);
        assert_eq!(masked_in, 1);
    }

    // Unsigned `<` against an all-ones splat: every lane compares true except
    // the two per half that hold all-ones themselves.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cmplt_epu32_mask() {
        #[rustfmt::skip]
        let vals = _mm512_set_epi32(
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
        );
        let bound = _mm512_set1_epi32(-1);
        let got = _mm512_cmplt_epu32_mask(vals, bound);
        assert_eq!(got, 0b11001111_11001111);
    }

    // Masked unsigned `<` against an all-ones splat: the expected value is the
    // unmasked result (0b11001111 per byte) ANDed with the write mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmplt_epu32_mask() {
        #[rustfmt::skip]
        let vals = _mm512_set_epi32(
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
        );
        let bound = _mm512_set1_epi32(-1);
        let k = 0b01111010_01111010;
        let got = _mm512_mask_cmplt_epu32_mask(k, vals, bound);
        assert_eq!(got, 0b01001010_01001010);
    }

    // 256-bit unsigned `<` against a splat of 1: only the element equal to 0
    // qualifies, which is the top bit of the expected mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_cmplt_epu32_mask() {
        let vals = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 101, 100, 99);
        let bound = _mm256_set1_epi32(1);
        let got = _mm256_cmplt_epu32_mask(vals, bound);
        assert_eq!(got, 0b10000000);
    }

    // 256-bit masked unsigned `<` against a splat of 1 with an all-ones write
    // mask: the expected mask is the same as in the unmasked test.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cmplt_epu32_mask() {
        let vals = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 101, 100, 99);
        let bound = _mm256_set1_epi32(1);
        let got = _mm256_mask_cmplt_epu32_mask(0b11111111, vals, bound);
        assert_eq!(got, 0b10000000);
    }

    // 128-bit unsigned `<` against a splat of 1: only the element equal to 0
    // qualifies (bit 3 of the expected mask).
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_cmplt_epu32_mask() {
        let vals = _mm_set_epi32(0, 1, 2, u32::MAX as i32);
        let bound = _mm_set1_epi32(1);
        let got = _mm_cmplt_epu32_mask(vals, bound);
        assert_eq!(got, 0b00001000);
    }

    // 128-bit masked unsigned `<` against a splat of 1 with an all-ones write
    // mask: the expected mask is the same as in the unmasked test.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cmplt_epu32_mask() {
        let vals = _mm_set_epi32(0, 1, 2, u32::MAX as i32);
        let bound = _mm_set1_epi32(1);
        let got = _mm_mask_cmplt_epu32_mask(0b11111111, vals, bound);
        assert_eq!(got, 0b00001000);
    }

    // Unsigned `>` with the all-ones splat passed as the FIRST operand, so the
    // expected mask marks every lane of `vals` that is below all-ones.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cmpgt_epu32_mask() {
        #[rustfmt::skip]
        let vals = _mm512_set_epi32(
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
        );
        let bound = _mm512_set1_epi32(-1);
        let got = _mm512_cmpgt_epu32_mask(bound, vals);
        assert_eq!(got, 0b11001111_11001111);
    }

    // Masked unsigned `>` with the all-ones splat as the FIRST operand; the
    // expected value is 0b11001111 per byte ANDed with the write mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmpgt_epu32_mask() {
        #[rustfmt::skip]
        let vals = _mm512_set_epi32(
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
        );
        let bound = _mm512_set1_epi32(-1);
        let k = 0b01111010_01111010;
        let got = _mm512_mask_cmpgt_epu32_mask(k, bound, vals);
        assert_eq!(got, 0b01001010_01001010);
    }

    // 256-bit unsigned `>` against a splat of 1: every element except the
    // leading 0 and 1 qualifies.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_cmpgt_epu32_mask() {
        let vals = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 99, 100, 101);
        let bound = _mm256_set1_epi32(1);
        let got = _mm256_cmpgt_epu32_mask(vals, bound);
        assert_eq!(got, 0b00111111);
    }

    // 256-bit masked unsigned `>` against a splat of 1 with an all-ones write
    // mask: the expected mask is the same as in the unmasked test.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cmpgt_epu32_mask() {
        let vals = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 99, 100, 101);
        let bound = _mm256_set1_epi32(1);
        let got = _mm256_mask_cmpgt_epu32_mask(0b11111111, vals, bound);
        assert_eq!(got, 0b00111111);
    }

    // 128-bit unsigned `>` against a splat of 1: only the elements 2 and
    // all-ones qualify (the two low bits of the expected mask).
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_cmpgt_epu32_mask() {
        let vals = _mm_set_epi32(0, 1, 2, u32::MAX as i32);
        let bound = _mm_set1_epi32(1);
        let got = _mm_cmpgt_epu32_mask(vals, bound);
        assert_eq!(got, 0b00000011);
    }

    // 128-bit masked unsigned `>` against a splat of 1 with an all-ones write
    // mask: the expected mask is the same as in the unmasked test.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cmpgt_epu32_mask() {
        let vals = _mm_set_epi32(0, 1, 2, u32::MAX as i32);
        let bound = _mm_set1_epi32(1);
        let got = _mm_mask_cmpgt_epu32_mask(0b11111111, vals, bound);
        assert_eq!(got, 0b00000011);
    }

    // Unsigned `<=` is checked against the bitwise complement of unsigned `>`
    // on the same operands.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cmple_epu32_mask() {
        #[rustfmt::skip]
        let vals = _mm512_set_epi32(
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
        );
        let bound = _mm512_set1_epi32(-1);
        let le = _mm512_cmple_epu32_mask(vals, bound);
        let gt = _mm512_cmpgt_epu32_mask(vals, bound);
        assert_eq!(le, !gt);
    }

    // Masked unsigned `<=` against an all-ones splat: every lane satisfies the
    // predicate, so the result is expected to equal the write mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmple_epu32_mask() {
        #[rustfmt::skip]
        let vals = _mm512_set_epi32(
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
        );
        let bound = _mm512_set1_epi32(-1);
        let k = 0b01111010_01111010;
        let got = _mm512_mask_cmple_epu32_mask(k, vals, bound);
        assert_eq!(got, 0b01111010_01111010);
    }

    // 256-bit unsigned `<=` against a splat of 1: only the leading 0 and 1
    // qualify (the two top bits of the expected mask).
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_cmple_epu32_mask() {
        let vals = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 200, 100, 101);
        let bound = _mm256_set1_epi32(1);
        let got = _mm256_cmple_epu32_mask(vals, bound);
        assert_eq!(got, 0b11000000);
    }

    // 256-bit masked unsigned `<=` against a splat of 1 with an all-ones write
    // mask: the expected mask is the same as in the unmasked test.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cmple_epu32_mask() {
        let vals = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 200, 100, 101);
        let bound = _mm256_set1_epi32(1);
        let got = _mm256_mask_cmple_epu32_mask(0b11111111, vals, bound);
        assert_eq!(got, 0b11000000);
    }

    // 128-bit unsigned `<=` against a splat of 1: only the elements 0 and 1
    // qualify (bits 3 and 2 of the expected mask).
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_cmple_epu32_mask() {
        let vals = _mm_set_epi32(0, 1, 2, u32::MAX as i32);
        let bound = _mm_set1_epi32(1);
        let got = _mm_cmple_epu32_mask(vals, bound);
        assert_eq!(got, 0b00001100);
    }

    // 128-bit masked unsigned `<=` against a splat of 1 with an all-ones write
    // mask: the expected mask is the same as in the unmasked test.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cmple_epu32_mask() {
        let vals = _mm_set_epi32(0, 1, 2, u32::MAX as i32);
        let bound = _mm_set1_epi32(1);
        let got = _mm_mask_cmple_epu32_mask(0b11111111, vals, bound);
        assert_eq!(got, 0b00001100);
    }

    // Unsigned `>=` is checked against the bitwise complement of unsigned `<`
    // on the same operands.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cmpge_epu32_mask() {
        #[rustfmt::skip]
        let vals = _mm512_set_epi32(
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
        );
        let bound = _mm512_set1_epi32(-1);
        let ge = _mm512_cmpge_epu32_mask(vals, bound);
        let lt = _mm512_cmplt_epu32_mask(vals, bound);
        assert_eq!(ge, !lt);
    }

    // Masked unsigned `>=` against an all-ones splat. Only the two lanes per
    // half that themselves hold all-ones (`-1` and `u32::MAX as i32`) can
    // satisfy the predicate, i.e. 0b00110000 per byte, and both of those bits
    // are enabled in the write mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmpge_epu32_mask() {
        #[rustfmt::skip]
        let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
                                 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
        let b = _mm512_set1_epi32(-1);
        let mask = 0b01111010_01111010;
        // Written as two full 8-bit groups so every digit lines up with a lane.
        assert_eq!(_mm512_mask_cmpge_epu32_mask(mask, a, b), 0b00110000_00110000);
    }

    // 256-bit unsigned `>=` against a splat of 1: every element except the
    // leading 0 qualifies.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_cmpge_epu32_mask() {
        let vals = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 300, 100, 200);
        let bound = _mm256_set1_epi32(1);
        let got = _mm256_cmpge_epu32_mask(vals, bound);
        assert_eq!(got, 0b01111111);
    }

    // 256-bit masked unsigned `>=` against a splat of 1 with an all-ones write
    // mask: the expected mask is the same as in the unmasked test.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cmpge_epu32_mask() {
        let vals = _mm256_set_epi32(0, 1, 2, u32::MAX as i32, i32::MAX, 300, 100, 200);
        let bound = _mm256_set1_epi32(1);
        let got = _mm256_mask_cmpge_epu32_mask(0b11111111, vals, bound);
        assert_eq!(got, 0b01111111);
    }

    // 128-bit unsigned `>=` against a splat of 1: every element except the
    // leading 0 qualifies.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_cmpge_epu32_mask() {
        let vals = _mm_set_epi32(0, 1, 2, u32::MAX as i32);
        let bound = _mm_set1_epi32(1);
        let got = _mm_cmpge_epu32_mask(vals, bound);
        assert_eq!(got, 0b00000111);
    }

    // 128-bit masked unsigned `>=` against a splat of 1 with an all-ones write
    // mask: the expected mask is the same as in the unmasked test.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cmpge_epu32_mask() {
        let vals = _mm_set_epi32(0, 1, 2, u32::MAX as i32);
        let bound = _mm_set1_epi32(1);
        let got = _mm_mask_cmpge_epu32_mask(0b11111111, vals, bound);
        assert_eq!(got, 0b00000111);
    }

    // Equality on 32-bit lanes: the two vectors differ only in the third and
    // fourth argument of each row, so those two bits per byte are clear.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cmpeq_epu32_mask() {
        #[rustfmt::skip]
        let vals = _mm512_set_epi32(
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
        );
        #[rustfmt::skip]
        let other = _mm512_set_epi32(
            0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
            0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
        );
        let got = _mm512_cmpeq_epu32_mask(other, vals);
        assert_eq!(got, 0b11001111_11001111);
    }

    // Masked equality on 32-bit lanes: the expected value is the unmasked
    // result (0b11001111 per byte) ANDed with the write mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmpeq_epu32_mask() {
        #[rustfmt::skip]
        let vals = _mm512_set_epi32(
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
        );
        #[rustfmt::skip]
        let other = _mm512_set_epi32(
            0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
            0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
        );
        let k = 0b01111010_01111010;
        let got = _mm512_mask_cmpeq_epu32_mask(k, other, vals);
        assert_eq!(got, 0b01001010_01001010);
    }

    // 256-bit equality: the inputs differ only in their third and fourth
    // arguments, so bits 5 and 4 of the expected mask are clear.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_cmpeq_epu32_mask() {
        let vals = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
        let other = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
        let got = _mm256_cmpeq_epu32_mask(other, vals);
        assert_eq!(got, 0b11001111);
    }

    // 256-bit masked equality: the expected value is the unmasked result
    // (0b11001111) ANDed with the write mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cmpeq_epu32_mask() {
        let vals = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
        let other = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
        let got = _mm256_mask_cmpeq_epu32_mask(0b01111010, other, vals);
        assert_eq!(got, 0b01001010);
    }

    // 128-bit equality: only the first two arguments (0 and 1) agree, giving
    // bits 3 and 2 in the expected mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_cmpeq_epu32_mask() {
        let vals = _mm_set_epi32(0, 1, -1, u32::MAX as i32);
        let other = _mm_set_epi32(0, 1, 13, 42);
        let got = _mm_cmpeq_epu32_mask(other, vals);
        assert_eq!(got, 0b00001100);
    }

    // 128-bit masked equality with an all-ones write mask: the expected mask
    // is the same as in the unmasked test.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cmpeq_epu32_mask() {
        let vals = _mm_set_epi32(0, 1, -1, u32::MAX as i32);
        let other = _mm_set_epi32(0, 1, 13, 42);
        let got = _mm_mask_cmpeq_epu32_mask(0b11111111, other, vals);
        assert_eq!(got, 0b00001100);
    }

    // Inequality is checked against the bitwise complement of equality on the
    // same operands.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cmpneq_epu32_mask() {
        #[rustfmt::skip]
        let vals = _mm512_set_epi32(
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
        );
        #[rustfmt::skip]
        let other = _mm512_set_epi32(
            0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
            0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
        );
        let ne = _mm512_cmpneq_epu32_mask(other, vals);
        assert_eq!(ne, !_mm512_cmpeq_epu32_mask(other, vals));
    }

    // Masked inequality: per row the vectors differ in the third, fourth and
    // the last two arguments (0b00110011); the write mask then drops bit 0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmpneq_epu32_mask() {
        #[rustfmt::skip]
        let vals = _mm512_set_epi32(
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100,
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100,
        );
        #[rustfmt::skip]
        let other = _mm512_set_epi32(
            0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
            0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
        );
        let k = 0b01111010_01111010;
        let got = _mm512_mask_cmpneq_epu32_mask(k, other, vals);
        assert_eq!(got, 0b00110010_00110010);
    }

    // 256-bit inequality: the inputs differ only in their third and fourth
    // arguments, giving bits 5 and 4 in the expected mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_cmpneq_epu32_mask() {
        let vals = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100);
        let other = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, -100, 100);
        let got = _mm256_cmpneq_epu32_mask(other, vals);
        assert_eq!(got, 0b00110000);
    }

    // 256-bit masked inequality with an all-ones write mask: the expected mask
    // is the same as in the unmasked test.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cmpneq_epu32_mask() {
        let vals = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, -100, 100);
        let other = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, -100, 100);
        let got = _mm256_mask_cmpneq_epu32_mask(0b11111111, other, vals);
        assert_eq!(got, 0b00110000);
    }

    // 128-bit inequality: the last two arguments differ, giving the two low
    // bits in the expected mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_cmpneq_epu32_mask() {
        let vals = _mm_set_epi32(0, 1, -1, u32::MAX as i32);
        let other = _mm_set_epi32(0, 1, 13, 42);
        let got = _mm_cmpneq_epu32_mask(other, vals);
        assert_eq!(got, 0b00000011);
    }

    // 128-bit masked inequality with an all-ones write mask: the expected mask
    // is the same as in the unmasked test.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cmpneq_epu32_mask() {
        let vals = _mm_set_epi32(0, 1, -1, u32::MAX as i32);
        let other = _mm_set_epi32(0, 1, 13, 42);
        let got = _mm_mask_cmpneq_epu32_mask(0b11111111, other, vals);
        assert_eq!(got, 0b00000011);
    }

    // Generic unsigned compare with `_MM_CMPINT_LT` against an all-ones splat:
    // every lane qualifies except the two per half that hold all-ones.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cmp_epu32_mask() {
        #[rustfmt::skip]
        let vals = _mm512_set_epi32(
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
        );
        let bound = _mm512_set1_epi32(-1);
        let got = _mm512_cmp_epu32_mask::<_MM_CMPINT_LT>(vals, bound);
        assert_eq!(got, 0b11001111_11001111);
    }

    // Masked generic unsigned compare with `_MM_CMPINT_LT`: the expected value
    // is 0b11001111 per byte ANDed with the write mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmp_epu32_mask() {
        #[rustfmt::skip]
        let vals = _mm512_set_epi32(
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
        );
        let bound = _mm512_set1_epi32(-1);
        let k = 0b01111010_01111010;
        let got = _mm512_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(k, vals, bound);
        assert_eq!(got, 0b01001010_01001010);
    }

    // 256-bit generic unsigned compare with `_MM_CMPINT_LT` against an
    // all-ones splat: only the two all-ones elements fail the predicate.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_cmp_epu32_mask() {
        let vals = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
        let bound = _mm256_set1_epi32(-1);
        let got = _mm256_cmp_epu32_mask::<_MM_CMPINT_LT>(vals, bound);
        assert_eq!(got, 0b11001111);
    }

    // 256-bit masked generic unsigned compare with `_MM_CMPINT_LT` and an
    // all-ones write mask: the expected mask matches the unmasked test.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cmp_epu32_mask() {
        let vals = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
        let bound = _mm256_set1_epi32(-1);
        let got = _mm256_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(0b11111111, vals, bound);
        assert_eq!(got, 0b11001111);
    }

    // 128-bit generic unsigned compare with `_MM_CMPINT_LT` against a splat of
    // 1: only the element equal to 0 qualifies (bit 3).
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_cmp_epu32_mask() {
        let vals = _mm_set_epi32(0, 1, -1, i32::MAX);
        let bound = _mm_set1_epi32(1);
        let got = _mm_cmp_epu32_mask::<_MM_CMPINT_LT>(vals, bound);
        assert_eq!(got, 0b00001000);
    }

    // 128-bit masked generic unsigned compare with `_MM_CMPINT_LT` and an
    // all-ones write mask: the expected mask matches the unmasked test.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cmp_epu32_mask() {
        let vals = _mm_set_epi32(0, 1, -1, i32::MAX);
        let bound = _mm_set1_epi32(1);
        let got = _mm_mask_cmp_epu32_mask::<_MM_CMPINT_LT>(0b11111111, vals, bound);
        assert_eq!(got, 0b00001000);
    }

    // Signed `<` against a splat of -1: per row only `i32::MIN` and `-100`
    // qualify (bits 2 and 0).
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cmplt_epi32_mask() {
        #[rustfmt::skip]
        let vals = _mm512_set_epi32(
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
        );
        let bound = _mm512_set1_epi32(-1);
        let got = _mm512_cmplt_epi32_mask(vals, bound);
        assert_eq!(got, 0b00000101_00000101);
    }

    // Masked signed `<` against a splat of -1: the unmasked result is
    // 0b00000101 per byte and the write mask keeps only bit 2 of it.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmplt_epi32_mask() {
        #[rustfmt::skip]
        let vals = _mm512_set_epi32(
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
        );
        let bound = _mm512_set1_epi32(-1);
        let k = 0b01100110_01100110;
        let got = _mm512_mask_cmplt_epi32_mask(k, vals, bound);
        assert_eq!(got, 0b00000100_00000100);
    }

    // 256-bit signed `<` against a splat of -1: only `i32::MIN` and `-100`
    // qualify (bits 2 and 0).
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_cmplt_epi32_mask() {
        let vals = _mm256_set_epi32(0, 1, -1, 101, i32::MAX, i32::MIN, 100, -100);
        let bound = _mm256_set1_epi32(-1);
        let got = _mm256_cmplt_epi32_mask(vals, bound);
        assert_eq!(got, 0b00000101);
    }

    // 256-bit masked signed `<` against a splat of -1 with an all-ones write
    // mask: the expected mask is the same as in the unmasked test.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cmplt_epi32_mask() {
        let vals = _mm256_set_epi32(0, 1, -1, 101, i32::MAX, i32::MIN, 100, -100);
        let bound = _mm256_set1_epi32(-1);
        let got = _mm256_mask_cmplt_epi32_mask(0b11111111, vals, bound);
        assert_eq!(got, 0b00000101);
    }

    // 128-bit signed `<` against a splat of -1: only `i32::MIN` and `-100`
    // qualify (bits 2 and 0).
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_cmplt_epi32_mask() {
        let vals = _mm_set_epi32(i32::MAX, i32::MIN, 100, -100);
        let bound = _mm_set1_epi32(-1);
        let got = _mm_cmplt_epi32_mask(vals, bound);
        assert_eq!(got, 0b00000101);
    }

    // 128-bit masked signed `<` against a splat of -1 with an all-ones write
    // mask: the expected mask is the same as in the unmasked test.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cmplt_epi32_mask() {
        let vals = _mm_set_epi32(i32::MAX, i32::MIN, 100, -100);
        let bound = _mm_set1_epi32(-1);
        let got = _mm_mask_cmplt_epi32_mask(0b11111111, vals, bound);
        assert_eq!(got, 0b00000101);
    }

    // Signed `>` with the -1 splat passed as the FIRST operand, so the
    // expected mask marks the lanes of `vals` below -1 (`i32::MIN`, `-100`).
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cmpgt_epi32_mask() {
        #[rustfmt::skip]
        let vals = _mm512_set_epi32(
            0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
            0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
        );
        let bound = _mm512_set1_epi32(-1);
        let got = _mm512_cmpgt_epi32_mask(bound, vals);
        assert_eq!(got, 0b00000101_00000101);
    }

    // Masked signed `>` with the -1 splat as the FIRST operand: the unmasked
    // result is 0b00000101 per byte and the write mask keeps only bit 2.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmpgt_epi32_mask() {
        #[rustfmt::skip]
        let vals = _mm512_set_epi32(
            0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
            0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
        );
        let bound = _mm512_set1_epi32(-1);
        let k = 0b01100110_01100110;
        let got = _mm512_mask_cmpgt_epi32_mask(k, bound, vals);
        assert_eq!(got, 0b00000100_00000100);
    }

    // 256-bit signed `>` against a splat of -1: every non-negative element
    // qualifies; `-1`, `i32::MIN` and `-100` do not.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_cmpgt_epi32_mask() {
        let vals = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
        let bound = _mm256_set1_epi32(-1);
        let got = _mm256_cmpgt_epi32_mask(vals, bound);
        assert_eq!(got, 0b11011010);
    }

    // 256-bit masked signed `>` against a splat of -1 with an all-ones write
    // mask: the expected mask is the same as in the unmasked test.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cmpgt_epi32_mask() {
        let vals = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
        let bound = _mm256_set1_epi32(-1);
        let got = _mm256_mask_cmpgt_epi32_mask(0b11111111, vals, bound);
        assert_eq!(got, 0b11011010);
    }

    // 128-bit signed `>` against a splat of -1: every element except the `-1`
    // itself qualifies.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_cmpgt_epi32_mask() {
        let vals = _mm_set_epi32(0, 1, -1, 13);
        let bound = _mm_set1_epi32(-1);
        let got = _mm_cmpgt_epi32_mask(vals, bound);
        assert_eq!(got, 0b00001101);
    }

    // 128-bit masked signed `>` against a splat of -1 with an all-ones write
    // mask: the expected mask is the same as in the unmasked test.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cmpgt_epi32_mask() {
        let vals = _mm_set_epi32(0, 1, -1, 13);
        let bound = _mm_set1_epi32(-1);
        let got = _mm_mask_cmpgt_epi32_mask(0b11111111, vals, bound);
        assert_eq!(got, 0b00001101);
    }

    // Signed `<=` is checked against the bitwise complement of signed `>` on
    // the same operands.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cmple_epi32_mask() {
        #[rustfmt::skip]
        let vals = _mm512_set_epi32(
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
        );
        let bound = _mm512_set1_epi32(-1);
        let le = _mm512_cmple_epi32_mask(vals, bound);
        let gt = _mm512_cmpgt_epi32_mask(vals, bound);
        assert_eq!(le, !gt);
    }

    // Masked signed `<=` against a splat of -1. Per row the lanes holding
    // `-1`, `u32::MAX as i32` (also -1), `i32::MIN` and `-100` satisfy the
    // predicate (0b00110101); the write mask 0b01111010 keeps only bits 5
    // and 4 of that.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmple_epi32_mask() {
        #[rustfmt::skip]
        let a = _mm512_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
                                 0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
        let b = _mm512_set1_epi32(-1);
        let mask = 0b01111010_01111010;
        // Written as two full 8-bit groups so every digit lines up with a lane.
        assert_eq!(_mm512_mask_cmple_epi32_mask(mask, a, b), 0b00110000_00110000);
    }

    // 256-bit signed `<=` against a splat of -1: `-1`, `i32::MIN` and `-100`
    // qualify (bits 5, 2 and 0).
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_cmple_epi32_mask() {
        let vals = _mm256_set_epi32(0, 1, -1, 200, i32::MAX, i32::MIN, 100, -100);
        let bound = _mm256_set1_epi32(-1);
        let got = _mm256_cmple_epi32_mask(vals, bound);
        assert_eq!(got, 0b00100101);
    }

    // 256-bit masked signed `<=` against a splat of -1 with an all-ones write
    // mask: the expected mask is the same as in the unmasked test.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cmple_epi32_mask() {
        let vals = _mm256_set_epi32(0, 1, -1, 200, i32::MAX, i32::MIN, 100, -100);
        let bound = _mm256_set1_epi32(-1);
        let got = _mm256_mask_cmple_epi32_mask(0b11111111, vals, bound);
        assert_eq!(got, 0b00100101);
    }

    // 128-bit signed `<=` against a splat of -1: only the `-1` element
    // qualifies (bit 1).
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_cmple_epi32_mask() {
        let vals = _mm_set_epi32(0, 1, -1, 200);
        let bound = _mm_set1_epi32(-1);
        let got = _mm_cmple_epi32_mask(vals, bound);
        assert_eq!(got, 0b00000010);
    }

    // 128-bit masked signed `<=` against a splat of -1 with an all-ones write
    // mask: the expected mask is the same as in the unmasked test.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cmple_epi32_mask() {
        let vals = _mm_set_epi32(0, 1, -1, 200);
        let bound = _mm_set1_epi32(-1);
        let got = _mm_mask_cmple_epi32_mask(0b11111111, vals, bound);
        assert_eq!(got, 0b00000010);
    }

    // Signed `>=` is checked against the bitwise complement of signed `<` on
    // the same operands.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cmpge_epi32_mask() {
        #[rustfmt::skip]
        let vals = _mm512_set_epi32(
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
        );
        let bound = _mm512_set1_epi32(-1);
        let ge = _mm512_cmpge_epi32_mask(vals, bound);
        let lt = _mm512_cmplt_epi32_mask(vals, bound);
        assert_eq!(ge, !lt);
    }

    // Masked signed `>=` against a splat of -1: the lanes that fail the
    // predicate (`i32::MIN`, `-100`) are already disabled in the write mask,
    // so the result is expected to equal the mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmpge_epi32_mask() {
        #[rustfmt::skip]
        let vals = _mm512_set_epi32(
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
            0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100,
        );
        let bound = _mm512_set1_epi32(-1);
        let k = 0b01111010_01111010;
        let got = _mm512_mask_cmpge_epi32_mask(k, vals, bound);
        assert_eq!(got, 0b01111010_01111010);
    }

    // 256-bit signed `>=` against a splat of -1: every element except
    // `i32::MIN` and `-100` qualifies.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_cmpge_epi32_mask() {
        let vals = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
        let bound = _mm256_set1_epi32(-1);
        let got = _mm256_cmpge_epi32_mask(vals, bound);
        assert_eq!(got, 0b11111010);
    }

    // 256-bit masked signed `>=` against a splat of -1 with an all-ones write
    // mask: the expected mask is the same as in the unmasked test.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cmpge_epi32_mask() {
        let vals = _mm256_set_epi32(0, 1, -1, u32::MAX as i32, i32::MAX, i32::MIN, 100, -100);
        let bound = _mm256_set1_epi32(-1);
        let got = _mm256_mask_cmpge_epi32_mask(0b11111111, vals, bound);
        assert_eq!(got, 0b11111010);
    }

    // 128-bit signed `>=` against a splat of -1: all four elements qualify.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_cmpge_epi32_mask() {
        let vals = _mm_set_epi32(0, 1, -1, u32::MAX as i32);
        let bound = _mm_set1_epi32(-1);
        let got = _mm_cmpge_epi32_mask(vals, bound);
        assert_eq!(got, 0b00001111);
    }

    // 128-bit masked signed `>=` against a splat of -1 with an all-ones write
    // mask: the expected mask is the same as in the unmasked test.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cmpge_epi32_mask() {
        let vals = _mm_set_epi32(0, 1, -1, u32::MAX as i32);
        let bound = _mm_set1_epi32(-1);
        let got = _mm_mask_cmpge_epi32_mask(0b11111111, vals, bound);
        assert_eq!(got, 0b00001111);
    }

    // Equality on 32-bit lanes: the two vectors differ only in the third and
    // fourth argument of each row, so those two bits per byte are clear.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cmpeq_epi32_mask() {
        #[rustfmt::skip]
        let vals = _mm512_set_epi32(
            0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
            0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
        );
        #[rustfmt::skip]
        let other = _mm512_set_epi32(
            0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
            0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
        );
        let got = _mm512_cmpeq_epi32_mask(other, vals);
        assert_eq!(got, 0b11001111_11001111);
    }

    // Masked equality on 32-bit lanes: the expected value is the unmasked
    // result (0b11001111 per byte) ANDed with the write mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmpeq_epi32_mask() {
        #[rustfmt::skip]
        let vals = _mm512_set_epi32(
            0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
            0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
        );
        #[rustfmt::skip]
        let other = _mm512_set_epi32(
            0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
            0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
        );
        let k = 0b01111010_01111010;
        let got = _mm512_mask_cmpeq_epi32_mask(k, other, vals);
        assert_eq!(got, 0b01001010_01001010);
    }

    // 256-bit equality: the inputs differ only in their third and fourth
    // arguments, so bits 5 and 4 of the expected mask are clear.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_cmpeq_epi32_mask() {
        let vals = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
        let other = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
        let got = _mm256_cmpeq_epi32_mask(other, vals);
        assert_eq!(got, 0b11001111);
    }

    // 256-bit masked equality: the expected value is the unmasked result
    // (0b11001111) ANDed with the write mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cmpeq_epi32_mask() {
        let vals = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
        let other = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
        let got = _mm256_mask_cmpeq_epi32_mask(0b01111010, other, vals);
        assert_eq!(got, 0b01001010);
    }

    // 128-bit equality: only the first two arguments (0 and 1) agree, giving
    // bits 3 and 2 in the expected mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_cmpeq_epi32_mask() {
        let vals = _mm_set_epi32(0, 1, -1, 13);
        let other = _mm_set_epi32(0, 1, 13, 42);
        let got = _mm_cmpeq_epi32_mask(other, vals);
        assert_eq!(got, 0b00001100);
    }

    // 128-bit masked equality with an all-ones write mask: the expected mask
    // is the same as in the unmasked test.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cmpeq_epi32_mask() {
        let vals = _mm_set_epi32(0, 1, -1, 13);
        let other = _mm_set_epi32(0, 1, 13, 42);
        let got = _mm_mask_cmpeq_epi32_mask(0b11111111, other, vals);
        assert_eq!(got, 0b00001100);
    }

    // Inequality is checked against the bitwise complement of equality on the
    // same operands.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cmpneq_epi32_mask() {
        #[rustfmt::skip]
        let vals = _mm512_set_epi32(
            0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
            0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
        );
        #[rustfmt::skip]
        let other = _mm512_set_epi32(
            0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
            0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
        );
        let ne = _mm512_cmpneq_epi32_mask(other, vals);
        assert_eq!(ne, !_mm512_cmpeq_epi32_mask(other, vals));
    }

    // Masked inequality: per row the vectors differ in the third, fourth and
    // the last two arguments (0b00110011); the write mask then drops bit 0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmpneq_epi32_mask() {
        #[rustfmt::skip]
        let vals = _mm512_set_epi32(
            0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100,
            0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100,
        );
        #[rustfmt::skip]
        let other = _mm512_set_epi32(
            0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
            0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100,
        );
        let k = 0b01111010_01111010;
        let got = _mm512_mask_cmpneq_epi32_mask(k, other, vals);
        assert_eq!(got, 0b00110010_00110010);
    }

    // 256-bit inequality is checked against the bitwise complement of 256-bit
    // equality on the same operands.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_cmpneq_epi32_mask() {
        let vals = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
        let other = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
        let ne = _mm256_cmpneq_epi32_mask(other, vals);
        assert_eq!(ne, !_mm256_cmpeq_epi32_mask(other, vals));
    }

    // 256-bit masked inequality with an all-ones write mask: the inputs differ
    // in the third, fourth and the last two arguments (bits 5, 4, 1 and 0).
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cmpneq_epi32_mask() {
        let vals = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, -100, 100);
        let other = _mm256_set_epi32(0, 1, 13, 42, i32::MAX, i32::MIN, 100, -100);
        let got = _mm256_mask_cmpneq_epi32_mask(0b11111111, other, vals);
        assert_eq!(got, 0b00110011);
    }

    // 128-bit inequality: the last two arguments differ, giving the two low
    // bits in the expected mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_cmpneq_epi32_mask() {
        let vals = _mm_set_epi32(0, 1, -1, 13);
        let other = _mm_set_epi32(0, 1, 13, 42);
        let got = _mm_cmpneq_epi32_mask(other, vals);
        assert_eq!(got, 0b00000011);
    }

    // 128-bit masked inequality with an all-ones write mask: the expected mask
    // is the same as in the unmasked test.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cmpneq_epi32_mask() {
        let vals = _mm_set_epi32(0, 1, -1, 13);
        let other = _mm_set_epi32(0, 1, 13, 42);
        let got = _mm_mask_cmpneq_epi32_mask(0b11111111, other, vals);
        assert_eq!(got, 0b00000011);
    }

    // Generic signed compare with `_MM_CMPINT_LT` against a splat of -1: per
    // row only `i32::MIN` and `-100` qualify (bits 2 and 0).
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cmp_epi32_mask() {
        #[rustfmt::skip]
        let vals = _mm512_set_epi32(
            0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
            0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
        );
        let bound = _mm512_set1_epi32(-1);
        let got = _mm512_cmp_epi32_mask::<_MM_CMPINT_LT>(vals, bound);
        assert_eq!(got, 0b00000101_00000101);
    }

    // Same comparison as the unmasked test, but the writemask drops the `-100` lanes (bit 0
    // of each byte), leaving only the `i32::MIN` lanes.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cmp_epi32_mask() {
        let threshold = _mm512_set1_epi32(-1);
        #[rustfmt::skip]
        let values = _mm512_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100,
                                      0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
        let writemask = 0b01100110_01100110;
        let less_than = _mm512_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(writemask, values, threshold);
        assert_eq!(less_than, 0b00000100_00000100);
    }

    // Signed less-than against -1 on 256 bits: `i32::MIN` (bit 2) and `-100` (bit 0).
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_cmp_epi32_mask() {
        let threshold = _mm256_set1_epi32(-1);
        let values = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
        let less_than = _mm256_cmp_epi32_mask::<_MM_CMPINT_LT>(values, threshold);
        assert_eq!(less_than, 0b00000101);
    }

    // The writemask clears bit 0, so only the `i32::MIN` lane (bit 2) survives.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cmp_epi32_mask() {
        let threshold = _mm256_set1_epi32(-1);
        let values = _mm256_set_epi32(0, 1, -1, 13, i32::MAX, i32::MIN, 100, -100);
        let writemask = 0b01100110;
        let less_than = _mm256_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(writemask, values, threshold);
        assert_eq!(less_than, 0b00000100);
    }

    // Signed less-than against 1 on 128 bits: the `0` lane (bit 3) and `-1` lane (bit 1).
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_cmp_epi32_mask() {
        let threshold = _mm_set1_epi32(1);
        let values = _mm_set_epi32(0, 1, -1, 13);
        let less_than = _mm_cmp_epi32_mask::<_MM_CMPINT_LT>(values, threshold);
        assert_eq!(less_than, 0b00001010);
    }

    // An all-ones writemask leaves the less-than result untouched.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cmp_epi32_mask() {
        let threshold = _mm_set1_epi32(1);
        let values = _mm_set_epi32(0, 1, -1, 13);
        let less_than = _mm_mask_cmp_epi32_mask::<_MM_CMPINT_LT>(0b11111111, values, threshold);
        assert_eq!(less_than, 0b00001010);
    }

    // `_mm512_set_epi8` given 64 identical bytes must equal the `_mm512_set1_epi8` broadcast.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_set_epi8() {
        let explicit = _mm512_set_epi8(
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            2, 2, 2, 2, 2, 2,
        );
        assert_eq_m512i(_mm512_set1_epi8(2), explicit);
    }

    // `_mm512_set_epi16` given 32 identical words must equal the `_mm512_set1_epi16` broadcast.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_set_epi16() {
        let explicit = _mm512_set_epi16(
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            2, 2, 2,
        );
        assert_eq_m512i(_mm512_set1_epi16(2), explicit);
    }

    // `set` takes its arguments in the reverse order of `setr`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_set_epi32() {
        let via_setr = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let via_set = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
        assert_eq_m512i(via_setr, via_set);
    }

    // `setr` takes its arguments in the reverse order of `set`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_setr_epi32() {
        let via_set = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let via_setr = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
        assert_eq_m512i(via_set, via_setr);
    }

    // Broadcasting a byte must equal spelling out all 64 lanes by hand.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_set1_epi8() {
        let broadcast = _mm512_set1_epi8(2);
        assert_eq_m512i(
            _mm512_set_epi8(
                2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                2, 2, 2, 2, 2, 2, 2, 2,
            ),
            broadcast,
        );
    }

    // Broadcasting a word must equal spelling out all 32 lanes by hand.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_set1_epi16() {
        let broadcast = _mm512_set1_epi16(2);
        assert_eq_m512i(
            _mm512_set_epi16(
                2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                2, 2, 2, 2,
            ),
            broadcast,
        );
    }

    // Broadcasting a dword must equal spelling out all 16 lanes by hand.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_set1_epi32() {
        let broadcast = _mm512_set1_epi32(2);
        let explicit = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
        assert_eq_m512i(explicit, broadcast);
    }

    // `_mm512_setzero_si512` must match a broadcast of integer zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_setzero_si512() {
        let zeroed = _mm512_setzero_si512();
        let broadcast_zero = _mm512_set1_epi32(0);
        assert_eq_m512i(broadcast_zero, zeroed);
    }

    // `_mm512_setzero_epi32` must match a broadcast of integer zero.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_setzero_epi32() {
        let zeroed = _mm512_setzero_epi32();
        let broadcast_zero = _mm512_set1_epi32(0);
        assert_eq_m512i(broadcast_zero, zeroed);
    }

    // `set_ps` takes its arguments in the reverse order of `setr_ps`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_set_ps() {
        let via_setr = _mm512_setr_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let via_set = _mm512_set_ps(
            15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
        );
        assert_eq_m512(via_setr, via_set);
    }

    // `setr_ps` takes its arguments in the reverse order of `set_ps`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_setr_ps() {
        let via_set = _mm512_set_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let via_setr = _mm512_setr_ps(
            15., 14., 13., 12., 11., 10., 9., 8., 7., 6., 5., 4., 3., 2., 1., 0.,
        );
        assert_eq_m512(via_set, via_setr);
    }

    // Broadcasting a float must equal spelling out all 16 lanes by hand.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_set1_ps() {
        let broadcast = _mm512_set1_ps(2.);
        let explicit = _mm512_set_ps(
            2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
        );
        assert_eq_m512(explicit, broadcast);
    }

    // `set4` repeats its four arguments across the whole 512-bit vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_set4_epi32() {
        let repeated = _mm512_set4_epi32(4, 3, 2, 1);
        let explicit = _mm512_set_epi32(4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1);
        assert_eq_m512i(explicit, repeated);
    }

    // `set4_ps` repeats its four arguments across the whole 512-bit vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_set4_ps() {
        let repeated = _mm512_set4_ps(4., 3., 2., 1.);
        let explicit = _mm512_set_ps(
            4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1.,
        );
        assert_eq_m512(explicit, repeated);
    }

    // `setr4` is `set4` with the four arguments given in reverse order.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_setr4_epi32() {
        let repeated = _mm512_setr4_epi32(1, 2, 3, 4);
        let explicit = _mm512_set_epi32(4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1, 4, 3, 2, 1);
        assert_eq_m512i(explicit, repeated);
    }

    // `setr4_ps` is `set4_ps` with the four arguments given in reverse order.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_setr4_ps() {
        let repeated = _mm512_setr4_ps(1., 2., 3., 4.);
        let explicit = _mm512_set_ps(
            4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1., 4., 3., 2., 1.,
        );
        assert_eq_m512(explicit, repeated);
    }

    // `_mm512_setzero_ps` must match a broadcast of 0.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_setzero_ps() {
        let zeroed = _mm512_setzero_ps();
        let broadcast_zero = _mm512_set1_ps(0.);
        assert_eq_m512(zeroed, broadcast_zero);
    }

    // `_mm512_setzero` must match a broadcast of 0.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_setzero() {
        let zeroed = _mm512_setzero();
        let broadcast_zero = _mm512_set1_ps(0.);
        assert_eq_m512(zeroed, broadcast_zero);
    }

    // Unaligned load of eight doubles; memory order matches `setr` argument order.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_loadu_pd() {
        let memory = &[4., 3., 2., 5., 8., 9., 64., 50.];
        let expected = _mm512_setr_pd(4., 3., 2., 5., 8., 9., 64., 50.);
        // `black_box` hides the pointer's origin from the optimizer.
        let loaded = _mm512_loadu_pd(black_box(memory.as_ptr()));
        assert_eq_m512d(loaded, expected);
    }

    // Unaligned store of a full vector into another vector's storage, then compare.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_storeu_pd() {
        let nines = _mm512_set1_pd(9.);
        let mut dest = _mm512_undefined_pd();
        // View the destination vector as a raw `f64` buffer for the store.
        let dest_ptr = &mut dest as *mut _ as *mut f64;
        _mm512_storeu_pd(dest_ptr, nines);
        assert_eq_m512d(dest, nines);
    }

    // Unaligned load of sixteen floats; memory order matches `setr` argument order.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_loadu_ps() {
        let memory = &[
            4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50.,
        ];
        let expected = _mm512_setr_ps(
            4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50.,
        );
        // `black_box` hides the pointer's origin from the optimizer.
        let loaded = _mm512_loadu_ps(black_box(memory.as_ptr()));
        assert_eq_m512(loaded, expected);
    }

    // Unaligned store of a full vector into another vector's storage, then compare.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_storeu_ps() {
        let nines = _mm512_set1_ps(9.);
        let mut dest = _mm512_undefined_ps();
        // View the destination vector as a raw `f32` buffer for the store.
        let dest_ptr = &mut dest as *mut _ as *mut f32;
        _mm512_storeu_ps(dest_ptr, nines);
        assert_eq_m512(dest, nines);
    }

    // Masked unaligned load: lanes whose mask bit is clear keep the fallback value 42.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_loadu_epi32() {
        let memory = &[1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
        let fallback = _mm512_set1_epi32(42);
        let mask = 0b11101000_11001010;
        let loaded = _mm512_mask_loadu_epi32(fallback, mask, black_box(memory.as_ptr()));
        let expected =
            _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16);
        assert_eq_m512i(loaded, expected);
    }

    // Zero-masked unaligned load: lanes whose mask bit is clear become 0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_loadu_epi32() {
        let memory = &[1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
        let mask = 0b11101000_11001010;
        let loaded = _mm512_maskz_loadu_epi32(mask, black_box(memory.as_ptr()));
        let expected = _mm512_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16);
        assert_eq_m512i(loaded, expected);
    }

    // Masked aligned load: lanes whose mask bit is clear keep the fallback value 42.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_load_epi32() {
        // 16 x i32 = 64 bytes, forced onto a 64-byte boundary.
        #[repr(align(64))]
        struct Aligned64 {
            lanes: [i32; 16],
        }
        let memory = Aligned64 {
            lanes: [1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
        };
        let fallback = _mm512_set1_epi32(42);
        let mask = 0b11101000_11001010;
        let loaded = _mm512_mask_load_epi32(fallback, mask, black_box(memory.lanes.as_ptr()));
        let expected =
            _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16);
        assert_eq_m512i(loaded, expected);
    }

    // Zero-masked aligned load: lanes whose mask bit is clear become 0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_load_epi32() {
        // 16 x i32 = 64 bytes, forced onto a 64-byte boundary.
        #[repr(align(64))]
        struct Aligned64 {
            lanes: [i32; 16],
        }
        let memory = Aligned64 {
            lanes: [1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
        };
        let mask = 0b11101000_11001010;
        let loaded = _mm512_maskz_load_epi32(mask, black_box(memory.lanes.as_ptr()));
        let expected = _mm512_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8, 0, 0, 0, 12, 0, 14, 15, 16);
        assert_eq_m512i(loaded, expected);
    }

    // Masked unaligned store: slots whose mask bit is clear keep their prior value 42.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_storeu_epi32() {
        let values = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let mut memory = [42_i32; 16];
        _mm512_mask_storeu_epi32(memory.as_mut_ptr(), 0b11101000_11001010, values);
        let expected =
            _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16);
        let stored = _mm512_loadu_epi32(memory.as_ptr());
        assert_eq_m512i(stored, expected);
    }

    // Masked aligned store: slots whose mask bit is clear keep their prior value 42.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_store_epi32() {
        // 16 x i32 = 64 bytes, forced onto a 64-byte boundary.
        #[repr(align(64))]
        struct Aligned64 {
            lanes: [i32; 16],
        }
        let values = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let mut memory = Aligned64 { lanes: [42; 16] };
        _mm512_mask_store_epi32(memory.lanes.as_mut_ptr(), 0b11101000_11001010, values);
        let expected =
            _mm512_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8, 42, 42, 42, 12, 42, 14, 15, 16);
        let stored = _mm512_load_epi32(memory.lanes.as_ptr());
        assert_eq_m512i(stored, expected);
    }

    // Masked unaligned load: lanes whose mask bit is clear keep the fallback value 42.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_loadu_epi64() {
        let memory = &[1_i64, 2, 3, 4, 5, 6, 7, 8];
        let fallback = _mm512_set1_epi64(42);
        let mask = 0b11001010;
        let loaded = _mm512_mask_loadu_epi64(fallback, mask, black_box(memory.as_ptr()));
        let expected = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8);
        assert_eq_m512i(loaded, expected);
    }

    // Zero-masked unaligned load: lanes whose mask bit is clear become 0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_loadu_epi64() {
        let memory = &[1_i64, 2, 3, 4, 5, 6, 7, 8];
        let mask = 0b11001010;
        let loaded = _mm512_maskz_loadu_epi64(mask, black_box(memory.as_ptr()));
        let expected = _mm512_setr_epi64(0, 2, 0, 4, 0, 0, 7, 8);
        assert_eq_m512i(loaded, expected);
    }

    // Masked aligned load: lanes whose mask bit is clear keep the fallback value 42.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_load_epi64() {
        // 8 x i64 = 64 bytes, forced onto a 64-byte boundary.
        #[repr(align(64))]
        struct Aligned64 {
            lanes: [i64; 8],
        }
        let memory = Aligned64 {
            lanes: [1_i64, 2, 3, 4, 5, 6, 7, 8],
        };
        let fallback = _mm512_set1_epi64(42);
        let mask = 0b11001010;
        let loaded = _mm512_mask_load_epi64(fallback, mask, black_box(memory.lanes.as_ptr()));
        let expected = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8);
        assert_eq_m512i(loaded, expected);
    }

    // Zero-masked aligned load: lanes whose mask bit is clear become 0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_load_epi64() {
        // 8 x i64 = 64 bytes, forced onto a 64-byte boundary.
        #[repr(align(64))]
        struct Aligned64 {
            lanes: [i64; 8],
        }
        let memory = Aligned64 {
            lanes: [1_i64, 2, 3, 4, 5, 6, 7, 8],
        };
        let mask = 0b11001010;
        let loaded = _mm512_maskz_load_epi64(mask, black_box(memory.lanes.as_ptr()));
        let expected = _mm512_setr_epi64(0, 2, 0, 4, 0, 0, 7, 8);
        assert_eq_m512i(loaded, expected);
    }

    // Masked unaligned store: slots whose mask bit is clear keep their prior value 42.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_storeu_epi64() {
        let values = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let mut memory = [42_i64; 8];
        _mm512_mask_storeu_epi64(memory.as_mut_ptr(), 0b11001010, values);
        let expected = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8);
        let stored = _mm512_loadu_epi64(memory.as_ptr());
        assert_eq_m512i(stored, expected);
    }

    // Masked aligned store: slots whose mask bit is clear keep their prior value 42.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_store_epi64() {
        // 8 x i64 = 64 bytes, forced onto a 64-byte boundary.
        #[repr(align(64))]
        struct Aligned64 {
            lanes: [i64; 8],
        }
        let values = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let mut memory = Aligned64 { lanes: [42; 8] };
        _mm512_mask_store_epi64(memory.lanes.as_mut_ptr(), 0b11001010, values);
        let expected = _mm512_setr_epi64(42, 2, 42, 4, 42, 42, 7, 8);
        let stored = _mm512_load_epi64(memory.lanes.as_ptr());
        assert_eq_m512i(stored, expected);
    }

    // Masked unaligned load: lanes whose mask bit is clear keep the fallback value 42.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_loadu_ps() {
        let memory = &[
            1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
            16.0,
        ];
        let fallback = _mm512_set1_ps(42.0);
        let mask = 0b11101000_11001010;
        let loaded = _mm512_mask_loadu_ps(fallback, mask, black_box(memory.as_ptr()));
        let expected = _mm512_setr_ps(
            42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0,
            16.0,
        );
        assert_eq_m512(loaded, expected);
    }

    // Zero-masked unaligned load: lanes whose mask bit is clear become 0.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_loadu_ps() {
        let memory = &[
            1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
            16.0,
        ];
        let mask = 0b11101000_11001010;
        let loaded = _mm512_maskz_loadu_ps(mask, black_box(memory.as_ptr()));
        let expected = _mm512_setr_ps(
            0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0, 0.0, 0.0, 0.0, 12.0, 0.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m512(loaded, expected);
    }

    // Masked aligned load: lanes whose mask bit is clear keep the fallback value 42.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_load_ps() {
        // 16 x f32 = 64 bytes, forced onto a 64-byte boundary.
        #[repr(align(64))]
        struct Aligned64 {
            lanes: [f32; 16],
        }
        let memory = Aligned64 {
            lanes: [
                1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0,
                15.0, 16.0,
            ],
        };
        let fallback = _mm512_set1_ps(42.0);
        let mask = 0b11101000_11001010;
        let loaded = _mm512_mask_load_ps(fallback, mask, black_box(memory.lanes.as_ptr()));
        let expected = _mm512_setr_ps(
            42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0,
            16.0,
        );
        assert_eq_m512(loaded, expected);
    }

    // Zero-masked aligned load: lanes whose mask bit is clear become 0.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_load_ps() {
        // 16 x f32 = 64 bytes, forced onto a 64-byte boundary.
        #[repr(align(64))]
        struct Aligned64 {
            lanes: [f32; 16],
        }
        let memory = Aligned64 {
            lanes: [
                1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0,
                15.0, 16.0,
            ],
        };
        let mask = 0b11101000_11001010;
        let loaded = _mm512_maskz_load_ps(mask, black_box(memory.lanes.as_ptr()));
        let expected = _mm512_setr_ps(
            0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0, 0.0, 0.0, 0.0, 12.0, 0.0, 14.0, 15.0, 16.0,
        );
        assert_eq_m512(loaded, expected);
    }

    // Masked unaligned store: slots whose mask bit is clear keep their prior value 42.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_storeu_ps() {
        let values = _mm512_setr_ps(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let mut memory = [42_f32; 16];
        _mm512_mask_storeu_ps(memory.as_mut_ptr(), 0b11101000_11001010, values);
        let expected = _mm512_setr_ps(
            42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0,
            16.0,
        );
        let stored = _mm512_loadu_ps(memory.as_ptr());
        assert_eq_m512(stored, expected);
    }

    // Masked aligned store: slots whose mask bit is clear keep their prior value 42.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_store_ps() {
        // 16 x f32 = 64 bytes, forced onto a 64-byte boundary.
        #[repr(align(64))]
        struct Aligned64 {
            lanes: [f32; 16],
        }
        let values = _mm512_setr_ps(
            1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
        );
        let mut memory = Aligned64 { lanes: [42.0; 16] };
        _mm512_mask_store_ps(memory.lanes.as_mut_ptr(), 0b11101000_11001010, values);
        let expected = _mm512_setr_ps(
            42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0, 42.0, 42.0, 42.0, 12.0, 42.0, 14.0, 15.0,
            16.0,
        );
        let stored = _mm512_load_ps(memory.lanes.as_ptr());
        assert_eq_m512(stored, expected);
    }

    // Masked unaligned load: lanes whose mask bit is clear keep the fallback value 42.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_loadu_pd() {
        let memory = &[1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let fallback = _mm512_set1_pd(42.0);
        let mask = 0b11001010;
        let loaded = _mm512_mask_loadu_pd(fallback, mask, black_box(memory.as_ptr()));
        let expected = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0);
        assert_eq_m512d(loaded, expected);
    }

    // Zero-masked unaligned load: lanes whose mask bit is clear become 0.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_loadu_pd() {
        let memory = &[1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let mask = 0b11001010;
        let loaded = _mm512_maskz_loadu_pd(mask, black_box(memory.as_ptr()));
        let expected = _mm512_setr_pd(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0);
        assert_eq_m512d(loaded, expected);
    }

    // Masked aligned load: lanes whose mask bit is clear keep the fallback value 42.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_load_pd() {
        // 8 x f64 = 64 bytes, forced onto a 64-byte boundary.
        #[repr(align(64))]
        struct Aligned64 {
            lanes: [f64; 8],
        }
        let memory = Aligned64 {
            lanes: [1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
        };
        let fallback = _mm512_set1_pd(42.0);
        let mask = 0b11001010;
        let loaded = _mm512_mask_load_pd(fallback, mask, black_box(memory.lanes.as_ptr()));
        let expected = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0);
        assert_eq_m512d(loaded, expected);
    }

    // Zero-masked aligned load: lanes whose mask bit is clear become 0.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_load_pd() {
        // 8 x f64 = 64 bytes, forced onto a 64-byte boundary.
        #[repr(align(64))]
        struct Aligned64 {
            lanes: [f64; 8],
        }
        let memory = Aligned64 {
            lanes: [1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
        };
        let mask = 0b11001010;
        let loaded = _mm512_maskz_load_pd(mask, black_box(memory.lanes.as_ptr()));
        let expected = _mm512_setr_pd(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0);
        assert_eq_m512d(loaded, expected);
    }

    // Masked unaligned store: slots whose mask bit is clear keep their prior value 42.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_storeu_pd() {
        let values = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let mut memory = [42_f64; 8];
        _mm512_mask_storeu_pd(memory.as_mut_ptr(), 0b11001010, values);
        let expected = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0);
        let stored = _mm512_loadu_pd(memory.as_ptr());
        assert_eq_m512d(stored, expected);
    }

    // Masked aligned store: slots whose mask bit is clear keep their prior value 42.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_store_pd() {
        // 8 x f64 = 64 bytes, forced onto a 64-byte boundary.
        #[repr(align(64))]
        struct Aligned64 {
            lanes: [f64; 8],
        }
        let values = _mm512_setr_pd(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let mut memory = Aligned64 { lanes: [42.0; 8] };
        _mm512_mask_store_pd(memory.lanes.as_mut_ptr(), 0b11001010, values);
        let expected = _mm512_setr_pd(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0);
        let stored = _mm512_load_pd(memory.lanes.as_ptr());
        assert_eq_m512d(stored, expected);
    }

    // Masked unaligned load: lanes whose mask bit is clear keep the fallback value 42.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_loadu_epi32() {
        let memory = &[1_i32, 2, 3, 4, 5, 6, 7, 8];
        let fallback = _mm256_set1_epi32(42);
        let mask = 0b11001010;
        let loaded = _mm256_mask_loadu_epi32(fallback, mask, black_box(memory.as_ptr()));
        let expected = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8);
        assert_eq_m256i(loaded, expected);
    }

    // Zero-masked unaligned load: lanes whose mask bit is clear become 0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_loadu_epi32() {
        let memory = &[1_i32, 2, 3, 4, 5, 6, 7, 8];
        let mask = 0b11001010;
        let loaded = _mm256_maskz_loadu_epi32(mask, black_box(memory.as_ptr()));
        let expected = _mm256_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8);
        assert_eq_m256i(loaded, expected);
    }

    // Masked aligned load: lanes whose mask bit is clear keep the fallback value 42.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_load_epi32() {
        // 8 x i32 = 32 bytes, forced onto a 32-byte boundary.
        #[repr(align(32))]
        struct Aligned32 {
            lanes: [i32; 8],
        }
        let memory = Aligned32 {
            lanes: [1_i32, 2, 3, 4, 5, 6, 7, 8],
        };
        let fallback = _mm256_set1_epi32(42);
        let mask = 0b11001010;
        let loaded = _mm256_mask_load_epi32(fallback, mask, black_box(memory.lanes.as_ptr()));
        let expected = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8);
        assert_eq_m256i(loaded, expected);
    }

    // Zero-masked aligned load: lanes whose mask bit is clear become 0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_load_epi32() {
        // 8 x i32 = 32 bytes, forced onto a 32-byte boundary.
        #[repr(align(32))]
        struct Aligned32 {
            lanes: [i32; 8],
        }
        let memory = Aligned32 {
            lanes: [1_i32, 2, 3, 4, 5, 6, 7, 8],
        };
        let mask = 0b11001010;
        let loaded = _mm256_maskz_load_epi32(mask, black_box(memory.lanes.as_ptr()));
        let expected = _mm256_setr_epi32(0, 2, 0, 4, 0, 0, 7, 8);
        assert_eq_m256i(loaded, expected);
    }

    // Masked unaligned store: slots whose mask bit is clear keep their prior value 42.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_storeu_epi32() {
        let values = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let mut memory = [42_i32; 8];
        _mm256_mask_storeu_epi32(memory.as_mut_ptr(), 0b11001010, values);
        let expected = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8);
        let stored = _mm256_loadu_epi32(memory.as_ptr());
        assert_eq_m256i(stored, expected);
    }

    // Masked aligned store: slots whose mask bit is clear keep their prior value 42.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_store_epi32() {
        // 8 x i32 = 32 bytes. The buffer uses the same 32-byte alignment as the other
        // 256-bit aligned load/store tests rather than an over-aligned 64-byte boundary.
        #[repr(align(32))]
        struct Align {
            data: [i32; 8],
        }
        let mut r = Align { data: [42; 8] };
        let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let m = 0b11001010;
        _mm256_mask_store_epi32(r.data.as_mut_ptr(), m, a);
        let e = _mm256_setr_epi32(42, 2, 42, 4, 42, 42, 7, 8);
        assert_eq_m256i(_mm256_load_epi32(r.data.as_ptr()), e);
    }

    // Masked unaligned load: lanes whose mask bit is clear keep the fallback value 42.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_loadu_epi64() {
        let memory = &[1_i64, 2, 3, 4];
        let fallback = _mm256_set1_epi64x(42);
        let mask = 0b1010;
        let loaded = _mm256_mask_loadu_epi64(fallback, mask, black_box(memory.as_ptr()));
        let expected = _mm256_setr_epi64x(42, 2, 42, 4);
        assert_eq_m256i(loaded, expected);
    }

    // Zero-masked unaligned load: lanes whose mask bit is clear become 0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_loadu_epi64() {
        let memory = &[1_i64, 2, 3, 4];
        let mask = 0b1010;
        let loaded = _mm256_maskz_loadu_epi64(mask, black_box(memory.as_ptr()));
        let expected = _mm256_setr_epi64x(0, 2, 0, 4);
        assert_eq_m256i(loaded, expected);
    }

    // Masked aligned load: lanes whose mask bit is clear keep the fallback value 42.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_load_epi64() {
        // 4 x i64 = 32 bytes, forced onto a 32-byte boundary.
        #[repr(align(32))]
        struct Aligned32 {
            lanes: [i64; 4],
        }
        let memory = Aligned32 {
            lanes: [1_i64, 2, 3, 4],
        };
        let fallback = _mm256_set1_epi64x(42);
        let mask = 0b1010;
        let loaded = _mm256_mask_load_epi64(fallback, mask, black_box(memory.lanes.as_ptr()));
        let expected = _mm256_setr_epi64x(42, 2, 42, 4);
        assert_eq_m256i(loaded, expected);
    }

    // Zero-masked aligned load: lanes whose mask bit is clear become 0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_load_epi64() {
        // 4 x i64 = 32 bytes, forced onto a 32-byte boundary.
        #[repr(align(32))]
        struct Aligned32 {
            lanes: [i64; 4],
        }
        let memory = Aligned32 {
            lanes: [1_i64, 2, 3, 4],
        };
        let mask = 0b1010;
        let loaded = _mm256_maskz_load_epi64(mask, black_box(memory.lanes.as_ptr()));
        let expected = _mm256_setr_epi64x(0, 2, 0, 4);
        assert_eq_m256i(loaded, expected);
    }

    // Masked unaligned store: slots whose mask bit is clear keep their prior value 42.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_storeu_epi64() {
        let values = _mm256_setr_epi64x(1, 2, 3, 4);
        let mut memory = [42_i64; 4];
        _mm256_mask_storeu_epi64(memory.as_mut_ptr(), 0b1010, values);
        let expected = _mm256_setr_epi64x(42, 2, 42, 4);
        let stored = _mm256_loadu_epi64(memory.as_ptr());
        assert_eq_m256i(stored, expected);
    }

    // Masked aligned store: slots whose mask bit is clear keep their prior value 42.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_store_epi64() {
        // 4 x i64 = 32 bytes, forced onto a 32-byte boundary.
        #[repr(align(32))]
        struct Aligned32 {
            lanes: [i64; 4],
        }
        let values = _mm256_setr_epi64x(1, 2, 3, 4);
        let mut memory = Aligned32 { lanes: [42; 4] };
        _mm256_mask_store_epi64(memory.lanes.as_mut_ptr(), 0b1010, values);
        let expected = _mm256_setr_epi64x(42, 2, 42, 4);
        let stored = _mm256_load_epi64(memory.lanes.as_ptr());
        assert_eq_m256i(stored, expected);
    }

    // Masked unaligned load: lanes whose mask bit is clear keep the fallback value 42.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_loadu_ps() {
        let memory = &[1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let fallback = _mm256_set1_ps(42.0);
        let mask = 0b11001010;
        let loaded = _mm256_mask_loadu_ps(fallback, mask, black_box(memory.as_ptr()));
        let expected = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0);
        assert_eq_m256(loaded, expected);
    }

    // Zero-masked unaligned load: lanes whose mask bit is clear become 0.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_loadu_ps() {
        let memory = &[1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
        let mask = 0b11001010;
        let loaded = _mm256_maskz_loadu_ps(mask, black_box(memory.as_ptr()));
        let expected = _mm256_setr_ps(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0);
        assert_eq_m256(loaded, expected);
    }

    // Masked aligned load: lanes whose mask bit is clear keep the fallback value 42.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_load_ps() {
        // 8 x f32 = 32 bytes, forced onto a 32-byte boundary.
        #[repr(align(32))]
        struct Aligned32 {
            lanes: [f32; 8],
        }
        let memory = Aligned32 {
            lanes: [1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
        };
        let fallback = _mm256_set1_ps(42.0);
        let mask = 0b11001010;
        let loaded = _mm256_mask_load_ps(fallback, mask, black_box(memory.lanes.as_ptr()));
        let expected = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0);
        assert_eq_m256(loaded, expected);
    }

    // Zero-masked aligned load: lanes whose mask bit is clear become 0.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_load_ps() {
        // 8 x f32 = 32 bytes, forced onto a 32-byte boundary.
        #[repr(align(32))]
        struct Aligned32 {
            lanes: [f32; 8],
        }
        let memory = Aligned32 {
            lanes: [1.0_f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
        };
        let mask = 0b11001010;
        let loaded = _mm256_maskz_load_ps(mask, black_box(memory.lanes.as_ptr()));
        let expected = _mm256_setr_ps(0.0, 2.0, 0.0, 4.0, 0.0, 0.0, 7.0, 8.0);
        assert_eq_m256(loaded, expected);
    }

    // Masked unaligned store: slots whose mask bit is clear keep their prior value 42.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_storeu_ps() {
        let values = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let mut memory = [42_f32; 8];
        _mm256_mask_storeu_ps(memory.as_mut_ptr(), 0b11001010, values);
        let expected = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0);
        let stored = _mm256_loadu_ps(memory.as_ptr());
        assert_eq_m256(stored, expected);
    }

    // Masked aligned store: slots whose mask bit is clear keep their prior value 42.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_store_ps() {
        // 8 x f32 = 32 bytes, forced onto a 32-byte boundary.
        #[repr(align(32))]
        struct Aligned32 {
            lanes: [f32; 8],
        }
        let values = _mm256_setr_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0);
        let mut memory = Aligned32 { lanes: [42.0; 8] };
        _mm256_mask_store_ps(memory.lanes.as_mut_ptr(), 0b11001010, values);
        let expected = _mm256_setr_ps(42.0, 2.0, 42.0, 4.0, 42.0, 42.0, 7.0, 8.0);
        let stored = _mm256_load_ps(memory.lanes.as_ptr());
        assert_eq_m256(stored, expected);
    }

    // Masked unaligned load: lanes whose mask bit is clear keep the fallback value 42.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_loadu_pd() {
        let memory = &[1.0_f64, 2.0, 3.0, 4.0];
        let fallback = _mm256_set1_pd(42.0);
        let mask = 0b1010;
        let loaded = _mm256_mask_loadu_pd(fallback, mask, black_box(memory.as_ptr()));
        let expected = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0);
        assert_eq_m256d(loaded, expected);
    }

    // Zero-masked unaligned load: lanes whose mask bit is clear become 0.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_loadu_pd() {
        let memory = &[1.0_f64, 2.0, 3.0, 4.0];
        let mask = 0b1010;
        let loaded = _mm256_maskz_loadu_pd(mask, black_box(memory.as_ptr()));
        let expected = _mm256_setr_pd(0.0, 2.0, 0.0, 4.0);
        assert_eq_m256d(loaded, expected);
    }

    // Masked aligned load: lanes whose mask bit is clear keep the fallback value 42.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_load_pd() {
        // 4 x f64 = 32 bytes, forced onto a 32-byte boundary.
        #[repr(align(32))]
        struct Aligned32 {
            lanes: [f64; 4],
        }
        let memory = Aligned32 {
            lanes: [1.0_f64, 2.0, 3.0, 4.0],
        };
        let fallback = _mm256_set1_pd(42.0);
        let mask = 0b1010;
        let loaded = _mm256_mask_load_pd(fallback, mask, black_box(memory.lanes.as_ptr()));
        let expected = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0);
        assert_eq_m256d(loaded, expected);
    }

    // Zero-masked aligned load: lanes whose mask bit is clear become 0.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_load_pd() {
        // 4 x f64 = 32 bytes, forced onto a 32-byte boundary.
        #[repr(align(32))]
        struct Aligned32 {
            lanes: [f64; 4],
        }
        let memory = Aligned32 {
            lanes: [1.0_f64, 2.0, 3.0, 4.0],
        };
        let mask = 0b1010;
        let loaded = _mm256_maskz_load_pd(mask, black_box(memory.lanes.as_ptr()));
        let expected = _mm256_setr_pd(0.0, 2.0, 0.0, 4.0);
        assert_eq_m256d(loaded, expected);
    }

    // Masked unaligned store: slots whose mask bit is clear keep their prior value 42.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_storeu_pd() {
        let values = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
        let mut memory = [42_f64; 4];
        _mm256_mask_storeu_pd(memory.as_mut_ptr(), 0b1010, values);
        let expected = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0);
        let stored = _mm256_loadu_pd(memory.as_ptr());
        assert_eq_m256d(stored, expected);
    }

    // Masked aligned store: slots whose mask bit is clear keep their prior value 42.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_store_pd() {
        // 4 x f64 = 32 bytes, forced onto a 32-byte boundary.
        #[repr(align(32))]
        struct Aligned32 {
            lanes: [f64; 4],
        }
        let values = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
        let mut memory = Aligned32 { lanes: [42.0; 4] };
        _mm256_mask_store_pd(memory.lanes.as_mut_ptr(), 0b1010, values);
        let expected = _mm256_setr_pd(42.0, 2.0, 42.0, 4.0);
        let stored = _mm256_load_pd(memory.lanes.as_ptr());
        assert_eq_m256d(stored, expected);
    }

    // Masked unaligned load: lanes whose mask bit is clear keep the fallback value 42.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_loadu_epi32() {
        let memory = &[1_i32, 2, 3, 4];
        let fallback = _mm_set1_epi32(42);
        let mask = 0b1010;
        let loaded = _mm_mask_loadu_epi32(fallback, mask, black_box(memory.as_ptr()));
        let expected = _mm_setr_epi32(42, 2, 42, 4);
        assert_eq_m128i(loaded, expected);
    }

    // Zero-masked unaligned load: lanes whose mask bit is clear become 0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_loadu_epi32() {
        let memory = &[1_i32, 2, 3, 4];
        let mask = 0b1010;
        let loaded = _mm_maskz_loadu_epi32(mask, black_box(memory.as_ptr()));
        let expected = _mm_setr_epi32(0, 2, 0, 4);
        assert_eq_m128i(loaded, expected);
    }

    // Masked aligned load: lanes whose mask bit is clear keep the fallback value 42.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_load_epi32() {
        // 4 x i32 = 16 bytes, forced onto a 16-byte boundary.
        #[repr(align(16))]
        struct Aligned16 {
            lanes: [i32; 4],
        }
        let memory = Aligned16 {
            lanes: [1_i32, 2, 3, 4],
        };
        let fallback = _mm_set1_epi32(42);
        let mask = 0b1010;
        let loaded = _mm_mask_load_epi32(fallback, mask, black_box(memory.lanes.as_ptr()));
        let expected = _mm_setr_epi32(42, 2, 42, 4);
        assert_eq_m128i(loaded, expected);
    }

    // Zero-masked aligned load: lanes whose mask bit is clear become 0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_load_epi32() {
        // 4 x i32 = 16 bytes, forced onto a 16-byte boundary.
        #[repr(align(16))]
        struct Aligned16 {
            lanes: [i32; 4],
        }
        let memory = Aligned16 {
            lanes: [1_i32, 2, 3, 4],
        };
        let mask = 0b1010;
        let loaded = _mm_maskz_load_epi32(mask, black_box(memory.lanes.as_ptr()));
        let expected = _mm_setr_epi32(0, 2, 0, 4);
        assert_eq_m128i(loaded, expected);
    }

    // Unaligned masked store of i32 lanes: mask 0b1010 writes lanes 1 and 3
    // and leaves the 42 filler in lanes 0 and 2.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_storeu_epi32() {
        let mut dst = [42_i32; 4];
        let values = _mm_setr_epi32(1, 2, 3, 4);
        _mm_mask_storeu_epi32(dst.as_mut_ptr(), 0b1010, values);

        let written = _mm_loadu_epi32(dst.as_ptr());
        assert_eq_m128i(written, _mm_setr_epi32(42, 2, 42, 4));
    }

    // Aligned masked store of i32 lanes: mask 0b1010 writes lanes 1 and 3
    // and leaves the 42 filler in lanes 0 and 2.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_store_epi32() {
        // Destination buffer declared with 16-byte alignment for the aligned store/load.
        #[repr(align(16))]
        struct Aligned16([i32; 4]); // 4 x i32 = 16 bytes

        let mut dst = Aligned16([42; 4]);
        let values = _mm_setr_epi32(1, 2, 3, 4);
        _mm_mask_store_epi32(dst.0.as_mut_ptr(), 0b1010, values);

        let written = _mm_load_epi32(dst.0.as_ptr());
        assert_eq_m128i(written, _mm_setr_epi32(42, 2, 42, 4));
    }

    // Unaligned merge-masked load of i64 lanes: mask 0b10 reads lane 1 from
    // memory and keeps the 42 from `src` in lane 0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_loadu_epi64() {
        let memory = &[1_i64, 2];
        let fallback = _mm_set1_epi64x(42);
        let loaded = _mm_mask_loadu_epi64(fallback, 0b10, black_box(memory.as_ptr()));
        assert_eq_m128i(loaded, _mm_setr_epi64x(42, 2));
    }

    // Unaligned zero-masked load of i64 lanes: mask 0b10 reads lane 1 from
    // memory and zeroes lane 0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_loadu_epi64() {
        let memory = &[1_i64, 2];
        let loaded = _mm_maskz_loadu_epi64(0b10, black_box(memory.as_ptr()));
        assert_eq_m128i(loaded, _mm_setr_epi64x(0, 2));
    }

    // Aligned merge-masked load of i64 lanes: mask 0b10 reads lane 1 from
    // memory and keeps the 42 from `src` in lane 0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_load_epi64() {
        // Source buffer declared with 16-byte alignment for the aligned load.
        #[repr(align(16))]
        struct Aligned16([i64; 2]); // 2 x i64 = 16 bytes

        let memory = Aligned16([1_i64, 2]);
        let fallback = _mm_set1_epi64x(42);
        let loaded = _mm_mask_load_epi64(fallback, 0b10, black_box(memory.0.as_ptr()));
        assert_eq_m128i(loaded, _mm_setr_epi64x(42, 2));
    }

    // Aligned zero-masked load of i64 lanes: mask 0b10 reads lane 1 from
    // memory and zeroes lane 0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_load_epi64() {
        // Source buffer declared with 16-byte alignment for the aligned load.
        #[repr(align(16))]
        struct Aligned16([i64; 2]); // 2 x i64 = 16 bytes

        let memory = Aligned16([1_i64, 2]);
        let loaded = _mm_maskz_load_epi64(0b10, black_box(memory.0.as_ptr()));
        assert_eq_m128i(loaded, _mm_setr_epi64x(0, 2));
    }

    // Unaligned masked store of i64 lanes: mask 0b10 writes lane 1 and leaves
    // the 42 filler in lane 0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_storeu_epi64() {
        let mut dst = [42_i64; 2];
        let values = _mm_setr_epi64x(1, 2);
        _mm_mask_storeu_epi64(dst.as_mut_ptr(), 0b10, values);

        let written = _mm_loadu_epi64(dst.as_ptr());
        assert_eq_m128i(written, _mm_setr_epi64x(42, 2));
    }

    // Aligned masked store of i64 lanes: mask 0b10 writes lane 1 and leaves
    // the 42 filler in lane 0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_store_epi64() {
        // Destination buffer declared with 16-byte alignment for the aligned store/load.
        #[repr(align(16))]
        struct Aligned16([i64; 2]); // 2 x i64 = 16 bytes

        let mut dst = Aligned16([42; 2]);
        let values = _mm_setr_epi64x(1, 2);
        _mm_mask_store_epi64(dst.0.as_mut_ptr(), 0b10, values);

        let written = _mm_load_epi64(dst.0.as_ptr());
        assert_eq_m128i(written, _mm_setr_epi64x(42, 2));
    }

    // Unaligned merge-masked load of f32 lanes: mask 0b1010 reads lanes 1 and 3
    // from memory and keeps the 42.0 from `src` in lanes 0 and 2.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_loadu_ps() {
        let memory = &[1.0_f32, 2.0, 3.0, 4.0];
        let fallback = _mm_set1_ps(42.0);
        let loaded = _mm_mask_loadu_ps(fallback, 0b1010, black_box(memory.as_ptr()));
        assert_eq_m128(loaded, _mm_setr_ps(42.0, 2.0, 42.0, 4.0));
    }

    // Unaligned zero-masked load of f32 lanes: mask 0b1010 reads lanes 1 and 3
    // from memory and zeroes lanes 0 and 2.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_loadu_ps() {
        let memory = &[1.0_f32, 2.0, 3.0, 4.0];
        let loaded = _mm_maskz_loadu_ps(0b1010, black_box(memory.as_ptr()));
        assert_eq_m128(loaded, _mm_setr_ps(0.0, 2.0, 0.0, 4.0));
    }

    // Aligned merge-masked load of f32 lanes: mask 0b1010 reads lanes 1 and 3
    // from memory and keeps the 42.0 from `src` in lanes 0 and 2.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_load_ps() {
        // Source buffer declared with 16-byte alignment for the aligned load.
        #[repr(align(16))]
        struct Aligned16([f32; 4]); // 4 x f32 = 16 bytes

        let memory = Aligned16([1.0_f32, 2.0, 3.0, 4.0]);
        let fallback = _mm_set1_ps(42.0);
        let loaded = _mm_mask_load_ps(fallback, 0b1010, black_box(memory.0.as_ptr()));
        assert_eq_m128(loaded, _mm_setr_ps(42.0, 2.0, 42.0, 4.0));
    }

    // Aligned zero-masked load of f32 lanes: mask 0b1010 reads lanes 1 and 3
    // from memory and zeroes lanes 0 and 2.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_load_ps() {
        // Source buffer declared with 16-byte alignment for the aligned load.
        #[repr(align(16))]
        struct Aligned16([f32; 4]); // 4 x f32 = 16 bytes

        let memory = Aligned16([1.0_f32, 2.0, 3.0, 4.0]);
        let loaded = _mm_maskz_load_ps(0b1010, black_box(memory.0.as_ptr()));
        assert_eq_m128(loaded, _mm_setr_ps(0.0, 2.0, 0.0, 4.0));
    }

    // Unaligned masked store of f32 lanes: mask 0b1010 writes lanes 1 and 3
    // and leaves the 42.0 filler in lanes 0 and 2.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_storeu_ps() {
        let mut dst = [42_f32; 4];
        let values = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        _mm_mask_storeu_ps(dst.as_mut_ptr(), 0b1010, values);

        let written = _mm_loadu_ps(dst.as_ptr());
        assert_eq_m128(written, _mm_setr_ps(42.0, 2.0, 42.0, 4.0));
    }

    // Aligned masked store of f32 lanes: mask 0b1010 writes lanes 1 and 3
    // and leaves the 42.0 filler in lanes 0 and 2.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_store_ps() {
        // Destination buffer declared with 16-byte alignment for the aligned store/load.
        #[repr(align(16))]
        struct Aligned16([f32; 4]); // 4 x f32 = 16 bytes

        let mut dst = Aligned16([42.0; 4]);
        let values = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
        _mm_mask_store_ps(dst.0.as_mut_ptr(), 0b1010, values);

        let written = _mm_load_ps(dst.0.as_ptr());
        assert_eq_m128(written, _mm_setr_ps(42.0, 2.0, 42.0, 4.0));
    }

    // Unaligned merge-masked load of f64 lanes: mask 0b10 reads lane 1 from
    // memory and keeps the 42.0 from `src` in lane 0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_loadu_pd() {
        let memory = &[1.0_f64, 2.0];
        let fallback = _mm_set1_pd(42.0);
        let loaded = _mm_mask_loadu_pd(fallback, 0b10, black_box(memory.as_ptr()));
        assert_eq_m128d(loaded, _mm_setr_pd(42.0, 2.0));
    }

    // Unaligned zero-masked load of f64 lanes: mask 0b10 reads lane 1 from
    // memory and zeroes lane 0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_loadu_pd() {
        let memory = &[1.0_f64, 2.0];
        let loaded = _mm_maskz_loadu_pd(0b10, black_box(memory.as_ptr()));
        assert_eq_m128d(loaded, _mm_setr_pd(0.0, 2.0));
    }

    // Aligned merge-masked load of f64 lanes: mask 0b10 reads lane 1 from
    // memory and keeps the 42.0 from `src` in lane 0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_load_pd() {
        // Source buffer declared with 16-byte alignment for the aligned load.
        #[repr(align(16))]
        struct Aligned16([f64; 2]); // 2 x f64 = 16 bytes

        let memory = Aligned16([1.0_f64, 2.0]);
        let fallback = _mm_set1_pd(42.0);
        let loaded = _mm_mask_load_pd(fallback, 0b10, black_box(memory.0.as_ptr()));
        assert_eq_m128d(loaded, _mm_setr_pd(42.0, 2.0));
    }

    // Aligned zero-masked load of f64 lanes: mask 0b10 reads lane 1 from
    // memory and zeroes lane 0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_load_pd() {
        // Source buffer declared with 16-byte alignment for the aligned load.
        #[repr(align(16))]
        struct Aligned16([f64; 2]); // 2 x f64 = 16 bytes

        let memory = Aligned16([1.0_f64, 2.0]);
        let loaded = _mm_maskz_load_pd(0b10, black_box(memory.0.as_ptr()));
        assert_eq_m128d(loaded, _mm_setr_pd(0.0, 2.0));
    }

    // Unaligned masked store of f64 lanes: mask 0b10 writes lane 1 and leaves
    // the 42.0 filler in lane 0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_storeu_pd() {
        let mut dst = [42_f64; 2];
        let values = _mm_setr_pd(1.0, 2.0);
        _mm_mask_storeu_pd(dst.as_mut_ptr(), 0b10, values);

        let written = _mm_loadu_pd(dst.as_ptr());
        assert_eq_m128d(written, _mm_setr_pd(42.0, 2.0));
    }

    // Aligned masked store of f64 lanes: mask 0b10 writes lane 1 and leaves
    // the 42.0 filler in lane 0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_store_pd() {
        // Destination buffer declared with 16-byte alignment for the aligned store/load.
        #[repr(align(16))]
        struct Aligned16([f64; 2]); // 2 x f64 = 16 bytes

        let mut dst = Aligned16([42.0; 2]);
        let values = _mm_setr_pd(1.0, 2.0);
        _mm_mask_store_pd(dst.0.as_mut_ptr(), 0b10, values);

        let written = _mm_load_pd(dst.0.as_ptr());
        assert_eq_m128d(written, _mm_setr_pd(42.0, 2.0));
    }

    // `_mm512_set_pd` with ascending arguments must equal `_mm512_setr_pd`
    // with the same arguments in descending order.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_setr_pd() {
        let via_set = _mm512_set_pd(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let via_setr = _mm512_setr_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);
        assert_eq_m512d(via_set, via_setr);
    }

    // `_mm512_setr_pd` with ascending arguments must equal `_mm512_set_pd`
    // with the same arguments in descending order.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_set_pd() {
        let via_setr = _mm512_setr_pd(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0);
        let via_set = _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0);
        assert_eq_m512d(via_setr, via_set);
    }

    // Rotate left by 1: lanes holding 1 become 2 and `1 << 31` wraps around to `1 << 0`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_rol_epi32() {
        let input = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
        let expected = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
        let rotated = _mm512_rol_epi32::<1>(input);
        assert_eq_m512i(rotated, expected);
    }

    // Merge-masked rotate left by 1, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_rol_epi32() {
        let input = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
        let expected = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);

        // No mask bits set: the result is `src` (here `input`) unchanged.
        let kept = _mm512_mask_rol_epi32::<1>(input, 0, input);
        assert_eq_m512i(kept, input);

        // All sixteen mask bits set: every lane is rotated.
        let rotated = _mm512_mask_rol_epi32::<1>(input, 0b11111111_11111111, input);
        assert_eq_m512i(rotated, expected);
    }

    // Zero-masked rotate left by 1, checked with an empty and a half-set mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_rol_epi32() {
        let input = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31);
        let expected = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 1 << 0);

        // No mask bits set: every lane is zeroed.
        let zeroed = _mm512_maskz_rol_epi32::<1>(0, input);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());

        // Low eight mask bits set: those lanes are rotated, the rest are zeroed.
        let rotated = _mm512_maskz_rol_epi32::<1>(0b00000000_11111111, input);
        assert_eq_m512i(rotated, expected);
    }

    // Rotate left by 1: lanes holding 1 become 2 and `1 << 31` wraps around to `1 << 0`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_rol_epi32() {
        let input = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);
        let rotated = _mm256_rol_epi32::<1>(input);
        assert_eq_m256i(rotated, _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2));
    }

    // Merge-masked rotate left by 1, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_rol_epi32() {
        let input = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);

        // No mask bits set: the result is `src` (here `input`) unchanged.
        let kept = _mm256_mask_rol_epi32::<1>(input, 0, input);
        assert_eq_m256i(kept, input);

        // All eight mask bits set: every lane is rotated.
        let rotated = _mm256_mask_rol_epi32::<1>(input, 0b11111111, input);
        assert_eq_m256i(rotated, _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2));
    }

    // Zero-masked rotate left by 1, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_rol_epi32() {
        let input = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);

        // No mask bits set: every lane is zeroed.
        let zeroed = _mm256_maskz_rol_epi32::<1>(0, input);
        assert_eq_m256i(zeroed, _mm256_setzero_si256());

        // All eight mask bits set: every lane is rotated.
        let rotated = _mm256_maskz_rol_epi32::<1>(0b11111111, input);
        assert_eq_m256i(rotated, _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2));
    }

    // Rotate left by 1: lanes holding 1 become 2 and `1 << 31` wraps around to `1 << 0`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_rol_epi32() {
        let input = _mm_set_epi32(1 << 31, 1, 1, 1);
        let rotated = _mm_rol_epi32::<1>(input);
        assert_eq_m128i(rotated, _mm_set_epi32(1 << 0, 2, 2, 2));
    }

    // Merge-masked rotate left by 1, checked with an empty mask and the low four bits set.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_rol_epi32() {
        let input = _mm_set_epi32(1 << 31, 1, 1, 1);

        // No mask bits set: the result is `src` (here `input`) unchanged.
        let kept = _mm_mask_rol_epi32::<1>(input, 0, input);
        assert_eq_m128i(kept, input);

        // One mask bit per lane set: every lane is rotated.
        let rotated = _mm_mask_rol_epi32::<1>(input, 0b00001111, input);
        assert_eq_m128i(rotated, _mm_set_epi32(1 << 0, 2, 2, 2));
    }

    // Zero-masked rotate left by 1, checked with an empty mask and the low four bits set.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_rol_epi32() {
        let input = _mm_set_epi32(1 << 31, 1, 1, 1);

        // No mask bits set: every lane is zeroed.
        let zeroed = _mm_maskz_rol_epi32::<1>(0, input);
        assert_eq_m128i(zeroed, _mm_setzero_si128());

        // One mask bit per lane set: every lane is rotated.
        let rotated = _mm_maskz_rol_epi32::<1>(0b00001111, input);
        assert_eq_m128i(rotated, _mm_set_epi32(1 << 0, 2, 2, 2));
    }

    // Rotate right by 1: lanes holding 2 become 1 and `1 << 0` wraps around to `1 << 31`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_ror_epi32() {
        let input = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
        let expected = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
        let rotated = _mm512_ror_epi32::<1>(input);
        assert_eq_m512i(rotated, expected);
    }

    // Merge-masked rotate right by 1, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_ror_epi32() {
        let input = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
        let expected = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);

        // No mask bits set: the result is `src` (here `input`) unchanged.
        let kept = _mm512_mask_ror_epi32::<1>(input, 0, input);
        assert_eq_m512i(kept, input);

        // All sixteen mask bits set: every lane is rotated.
        let rotated = _mm512_mask_ror_epi32::<1>(input, 0b11111111_11111111, input);
        assert_eq_m512i(rotated, expected);
    }

    // Zero-masked rotate right by 1, checked with an empty and a half-set mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_ror_epi32() {
        let input = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 << 0);
        let expected = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 << 31);

        // No mask bits set: every lane is zeroed.
        let zeroed = _mm512_maskz_ror_epi32::<1>(0, input);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());

        // Low eight mask bits set: those lanes are rotated, the rest are zeroed.
        let rotated = _mm512_maskz_ror_epi32::<1>(0b00000000_11111111, input);
        assert_eq_m512i(rotated, expected);
    }

    // Rotate right by 1: lanes holding 2 become 1 and `1 << 0` wraps around to `1 << 31`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_ror_epi32() {
        let input = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2);
        let rotated = _mm256_ror_epi32::<1>(input);
        assert_eq_m256i(rotated, _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1));
    }

    // Merge-masked rotate right by 1, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_ror_epi32() {
        let input = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2);

        // No mask bits set: the result is `src` (here `input`) unchanged.
        let kept = _mm256_mask_ror_epi32::<1>(input, 0, input);
        assert_eq_m256i(kept, input);

        // All eight mask bits set: every lane is rotated.
        let rotated = _mm256_mask_ror_epi32::<1>(input, 0b11111111, input);
        assert_eq_m256i(rotated, _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1));
    }

    // Zero-masked rotate right by 1, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_ror_epi32() {
        let input = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2);

        // No mask bits set: every lane is zeroed.
        let zeroed = _mm256_maskz_ror_epi32::<1>(0, input);
        assert_eq_m256i(zeroed, _mm256_setzero_si256());

        // All eight mask bits set: every lane is rotated.
        let rotated = _mm256_maskz_ror_epi32::<1>(0b11111111, input);
        assert_eq_m256i(rotated, _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1));
    }

    // Rotate right by 1: lanes holding 2 become 1 and `1 << 0` wraps around to `1 << 31`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_ror_epi32() {
        let input = _mm_set_epi32(1 << 0, 2, 2, 2);
        let rotated = _mm_ror_epi32::<1>(input);
        assert_eq_m128i(rotated, _mm_set_epi32(1 << 31, 1, 1, 1));
    }

    // Merge-masked rotate right by 1, checked with an empty mask and the low four bits set.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_ror_epi32() {
        let input = _mm_set_epi32(1 << 0, 2, 2, 2);

        // No mask bits set: the result is `src` (here `input`) unchanged.
        let kept = _mm_mask_ror_epi32::<1>(input, 0, input);
        assert_eq_m128i(kept, input);

        // One mask bit per lane set: every lane is rotated.
        let rotated = _mm_mask_ror_epi32::<1>(input, 0b00001111, input);
        assert_eq_m128i(rotated, _mm_set_epi32(1 << 31, 1, 1, 1));
    }

    // Zero-masked rotate right by 1, checked with an empty mask and the low four bits set.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_ror_epi32() {
        let input = _mm_set_epi32(1 << 0, 2, 2, 2);

        // No mask bits set: every lane is zeroed.
        let zeroed = _mm_maskz_ror_epi32::<1>(0, input);
        assert_eq_m128i(zeroed, _mm_setzero_si128());

        // One mask bit per lane set: every lane is rotated.
        let rotated = _mm_maskz_ror_epi32::<1>(0b00001111, input);
        assert_eq_m128i(rotated, _mm_set_epi32(1 << 31, 1, 1, 1));
    }

    // Shift left by 1: lanes holding 1 become 2 and the `1 << 31` lane becomes 0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_slli_epi32() {
        let input = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
        let expected = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
        let shifted = _mm512_slli_epi32::<1>(input);
        assert_eq_m512i(shifted, expected);
    }

    // Merge-masked shift left by 1, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_slli_epi32() {
        let input = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
        let expected = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);

        // No mask bits set: the result is `src` (here `input`) unchanged.
        let kept = _mm512_mask_slli_epi32::<1>(input, 0, input);
        assert_eq_m512i(kept, input);

        // All sixteen mask bits set: every lane is shifted.
        let shifted = _mm512_mask_slli_epi32::<1>(input, 0b11111111_11111111, input);
        assert_eq_m512i(shifted, expected);
    }

    // Zero-masked shift left by 1, checked with an empty and a half-set mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_slli_epi32() {
        let input = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31);
        let expected = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0);

        // No mask bits set: every lane is zeroed.
        let zeroed = _mm512_maskz_slli_epi32::<1>(0, input);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());

        // Low eight mask bits set: those lanes are shifted, the rest are zeroed.
        let shifted = _mm512_maskz_slli_epi32::<1>(0b00000000_11111111, input);
        assert_eq_m512i(shifted, expected);
    }

    // Merge-masked shift left by 1, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_slli_epi32() {
        let input = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);

        // No mask bits set: the result is `src` (here `input`) unchanged.
        let kept = _mm256_mask_slli_epi32::<1>(input, 0, input);
        assert_eq_m256i(kept, input);

        // All eight mask bits set: every lane is shifted.
        let shifted = _mm256_mask_slli_epi32::<1>(input, 0b11111111, input);
        assert_eq_m256i(shifted, _mm256_set_epi32(0, 2, 2, 2, 2, 2, 2, 2));
    }

    // Zero-masked shift left by 1, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_slli_epi32() {
        let input = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);

        // No mask bits set: every lane is zeroed.
        let zeroed = _mm256_maskz_slli_epi32::<1>(0, input);
        assert_eq_m256i(zeroed, _mm256_setzero_si256());

        // All eight mask bits set: every lane is shifted.
        let shifted = _mm256_maskz_slli_epi32::<1>(0b11111111, input);
        assert_eq_m256i(shifted, _mm256_set_epi32(0, 2, 2, 2, 2, 2, 2, 2));
    }

    // Merge-masked shift left by 1, checked with an empty mask and the low four bits set.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_slli_epi32() {
        let input = _mm_set_epi32(1 << 31, 1, 1, 1);

        // No mask bits set: the result is `src` (here `input`) unchanged.
        let kept = _mm_mask_slli_epi32::<1>(input, 0, input);
        assert_eq_m128i(kept, input);

        // One mask bit per lane set: every lane is shifted.
        let shifted = _mm_mask_slli_epi32::<1>(input, 0b00001111, input);
        assert_eq_m128i(shifted, _mm_set_epi32(0, 2, 2, 2));
    }

    // Zero-masked shift left by 1, checked with an empty mask and the low four bits set.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_slli_epi32() {
        let input = _mm_set_epi32(1 << 31, 1, 1, 1);

        // No mask bits set: every lane is zeroed.
        let zeroed = _mm_maskz_slli_epi32::<1>(0, input);
        assert_eq_m128i(zeroed, _mm_setzero_si128());

        // One mask bit per lane set: every lane is shifted.
        let shifted = _mm_maskz_slli_epi32::<1>(0b00001111, input);
        assert_eq_m128i(shifted, _mm_set_epi32(0, 2, 2, 2));
    }

    // Shift right by 1: lanes holding 2 become 1 and the zero lane stays 0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_srli_epi32() {
        let input = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
        let expected = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
        let shifted = _mm512_srli_epi32::<1>(input);
        assert_eq_m512i(shifted, expected);
    }

    // Merge-masked shift right by 1, checked with an empty and a full mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_srli_epi32() {
        let input = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
        let expected = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);

        // No mask bits set: the result is `src` (here `input`) unchanged.
        let kept = _mm512_mask_srli_epi32::<1>(input, 0, input);
        assert_eq_m512i(kept, input);

        // All sixteen mask bits set: every lane is shifted.
        let shifted = _mm512_mask_srli_epi32::<1>(input, 0b11111111_11111111, input);
        assert_eq_m512i(shifted, expected);
    }

    // Zero-masked shift right by 1, checked with an empty and a half-set mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_srli_epi32() {
        let input = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0);
        let expected = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0);

        // No mask bits set: every lane is zeroed.
        let zeroed = _mm512_maskz_srli_epi32::<1>(0, input);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());

        // Low eight mask bits set: those lanes are shifted, the rest are zeroed.
        let shifted = _mm512_maskz_srli_epi32::<1>(0b00000000_11111111, input);
        assert_eq_m512i(shifted, expected);
    }

    // Merge-masked shift right by 1 (`1 << 5` becomes `1 << 4`), checked with
    // an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_srli_epi32() {
        let input = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0);

        // No mask bits set: the result is `src` (here `input`) unchanged.
        let kept = _mm256_mask_srli_epi32::<1>(input, 0, input);
        assert_eq_m256i(kept, input);

        // All eight mask bits set: every lane is shifted.
        let shifted = _mm256_mask_srli_epi32::<1>(input, 0b11111111, input);
        assert_eq_m256i(shifted, _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0));
    }

    // Zero-masked shift right by 1 (`1 << 5` becomes `1 << 4`), checked with
    // an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_srli_epi32() {
        let input = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0);

        // No mask bits set: every lane is zeroed.
        let zeroed = _mm256_maskz_srli_epi32::<1>(0, input);
        assert_eq_m256i(zeroed, _mm256_setzero_si256());

        // All eight mask bits set: every lane is shifted.
        let shifted = _mm256_maskz_srli_epi32::<1>(0b11111111, input);
        assert_eq_m256i(shifted, _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0));
    }

    // Merge-masked shift right by 1 (`1 << 5` becomes `1 << 4`), checked with
    // an empty mask and the low four bits set.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_srli_epi32() {
        let input = _mm_set_epi32(1 << 5, 0, 0, 0);

        // No mask bits set: the result is `src` (here `input`) unchanged.
        let kept = _mm_mask_srli_epi32::<1>(input, 0, input);
        assert_eq_m128i(kept, input);

        // One mask bit per lane set: every lane is shifted.
        let shifted = _mm_mask_srli_epi32::<1>(input, 0b00001111, input);
        assert_eq_m128i(shifted, _mm_set_epi32(1 << 4, 0, 0, 0));
    }

    // Zero-masked shift right by 1 (`1 << 5` becomes `1 << 4`), checked with
    // an empty mask and the low four bits set.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_srli_epi32() {
        let input = _mm_set_epi32(1 << 5, 0, 0, 0);

        // No mask bits set: every lane is zeroed.
        let zeroed = _mm_maskz_srli_epi32::<1>(0, input);
        assert_eq_m128i(zeroed, _mm_setzero_si128());

        // One mask bit per lane set: every lane is shifted.
        let shifted = _mm_maskz_srli_epi32::<1>(0b00001111, input);
        assert_eq_m128i(shifted, _mm_set_epi32(1 << 4, 0, 0, 0));
    }

    // Variable rotate left with a count of 1 in every lane: lanes holding 1
    // become 2 and `1 << 31` wraps around to `1 << 0`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_rolv_epi32() {
        let input = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
        let expected = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
        let amounts = _mm512_set1_epi32(1);
        let rotated = _mm512_rolv_epi32(input, amounts);
        assert_eq_m512i(rotated, expected);
    }

    // Merge-masked variable rotate left (count 1 in every lane), checked with
    // an empty and a full mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_rolv_epi32() {
        let input = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
        let expected = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
        let amounts = _mm512_set1_epi32(1);

        // No mask bits set: the result is `src` (here `input`) unchanged.
        let kept = _mm512_mask_rolv_epi32(input, 0, input, amounts);
        assert_eq_m512i(kept, input);

        // All sixteen mask bits set: every lane is rotated.
        let rotated = _mm512_mask_rolv_epi32(input, 0b11111111_11111111, input, amounts);
        assert_eq_m512i(rotated, expected);
    }

    // Zero-masked variable rotate left (count 1 in every lane), checked with
    // an empty and a half-set mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_rolv_epi32() {
        let input = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31);
        let expected = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 1 << 0);
        let amounts = _mm512_set1_epi32(1);

        // No mask bits set: every lane is zeroed.
        let zeroed = _mm512_maskz_rolv_epi32(0, input, amounts);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());

        // Low eight mask bits set: those lanes are rotated, the rest are zeroed.
        let rotated = _mm512_maskz_rolv_epi32(0b00000000_11111111, input, amounts);
        assert_eq_m512i(rotated, expected);
    }

    // Variable rotate left with a count of 1 in every lane: lanes holding 1
    // become 2 and `1 << 31` wraps around to `1 << 0`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_rolv_epi32() {
        let input = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);
        let amounts = _mm256_set1_epi32(1);
        let rotated = _mm256_rolv_epi32(input, amounts);
        assert_eq_m256i(rotated, _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2));
    }

    // Merge-masked variable rotate left (count 1 in every lane), checked with
    // an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_rolv_epi32() {
        let input = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);
        let amounts = _mm256_set1_epi32(1);

        // No mask bits set: the result is `src` (here `input`) unchanged.
        let kept = _mm256_mask_rolv_epi32(input, 0, input, amounts);
        assert_eq_m256i(kept, input);

        // All eight mask bits set: every lane is rotated.
        let rotated = _mm256_mask_rolv_epi32(input, 0b11111111, input, amounts);
        assert_eq_m256i(rotated, _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2));
    }

    // Zero-masked variable rotate left (count 1 in every lane), checked with
    // an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_rolv_epi32() {
        let input = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);
        let amounts = _mm256_set1_epi32(1);

        // No mask bits set: every lane is zeroed.
        let zeroed = _mm256_maskz_rolv_epi32(0, input, amounts);
        assert_eq_m256i(zeroed, _mm256_setzero_si256());

        // All eight mask bits set: every lane is rotated.
        let rotated = _mm256_maskz_rolv_epi32(0b11111111, input, amounts);
        assert_eq_m256i(rotated, _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2));
    }

    // Variable rotate left with a count of 1 in every lane: lanes holding 1
    // become 2 and `1 << 31` wraps around to `1 << 0`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_rolv_epi32() {
        let input = _mm_set_epi32(1 << 31, 1, 1, 1);
        let amounts = _mm_set1_epi32(1);
        let rotated = _mm_rolv_epi32(input, amounts);
        assert_eq_m128i(rotated, _mm_set_epi32(1 << 0, 2, 2, 2));
    }

    // Merge-masked variable rotate left (count 1 in every lane), checked with
    // an empty mask and the low four bits set.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_rolv_epi32() {
        let input = _mm_set_epi32(1 << 31, 1, 1, 1);
        let amounts = _mm_set1_epi32(1);

        // No mask bits set: the result is `src` (here `input`) unchanged.
        let kept = _mm_mask_rolv_epi32(input, 0, input, amounts);
        assert_eq_m128i(kept, input);

        // One mask bit per lane set: every lane is rotated.
        let rotated = _mm_mask_rolv_epi32(input, 0b00001111, input, amounts);
        assert_eq_m128i(rotated, _mm_set_epi32(1 << 0, 2, 2, 2));
    }

    // Zero-masked variable rotate left (count 1 in every lane), checked with
    // an empty mask and the low four bits set.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_rolv_epi32() {
        let input = _mm_set_epi32(1 << 31, 1, 1, 1);
        let amounts = _mm_set1_epi32(1);

        // No mask bits set: every lane is zeroed.
        let zeroed = _mm_maskz_rolv_epi32(0, input, amounts);
        assert_eq_m128i(zeroed, _mm_setzero_si128());

        // One mask bit per lane set: every lane is rotated.
        let rotated = _mm_maskz_rolv_epi32(0b00001111, input, amounts);
        assert_eq_m128i(rotated, _mm_set_epi32(1 << 0, 2, 2, 2));
    }

    // Variable rotate right with a count of 1 in every lane: lanes holding 2
    // become 1 and `1 << 0` wraps around to `1 << 31`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_rorv_epi32() {
        let input = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
        let expected = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
        let amounts = _mm512_set1_epi32(1);
        let rotated = _mm512_rorv_epi32(input, amounts);
        assert_eq_m512i(rotated, expected);
    }

    // Merge-masked variable rotate right (count 1 in every lane), checked with
    // an empty and a full mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_rorv_epi32() {
        let input = _mm512_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
        let expected = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
        let amounts = _mm512_set1_epi32(1);

        // No mask bits set: the result is `src` (here `input`) unchanged.
        let kept = _mm512_mask_rorv_epi32(input, 0, input, amounts);
        assert_eq_m512i(kept, input);

        // All sixteen mask bits set: every lane is rotated.
        let rotated = _mm512_mask_rorv_epi32(input, 0b11111111_11111111, input, amounts);
        assert_eq_m512i(rotated, expected);
    }

    // Zero-masked variable rotate right (count 1 in every lane), checked with
    // an empty and a half-set mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_rorv_epi32() {
        let input = _mm512_set_epi32(3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 << 0);
        let expected = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 << 31);
        let amounts = _mm512_set1_epi32(1);

        // No mask bits set: every lane is zeroed.
        let zeroed = _mm512_maskz_rorv_epi32(0, input, amounts);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());

        // Low eight mask bits set: those lanes are rotated, the rest are zeroed.
        let rotated = _mm512_maskz_rorv_epi32(0b00000000_11111111, input, amounts);
        assert_eq_m512i(rotated, expected);
    }

    // Variable rotate right with a count of 1 in every lane: lanes holding 2
    // become 1 and `1 << 0` wraps around to `1 << 31`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_rorv_epi32() {
        let input = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2);
        let amounts = _mm256_set1_epi32(1);
        let rotated = _mm256_rorv_epi32(input, amounts);
        assert_eq_m256i(rotated, _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1));
    }

    // Merge-masked variable rotate right (count 1 in every lane), checked with
    // an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_rorv_epi32() {
        let input = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2);
        let amounts = _mm256_set1_epi32(1);

        // No mask bits set: the result is `src` (here `input`) unchanged.
        let kept = _mm256_mask_rorv_epi32(input, 0, input, amounts);
        assert_eq_m256i(kept, input);

        // All eight mask bits set: every lane is rotated.
        let rotated = _mm256_mask_rorv_epi32(input, 0b11111111, input, amounts);
        assert_eq_m256i(rotated, _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1));
    }

    // Zero-masked variable rotate right (count 1 in every lane), checked with
    // an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_rorv_epi32() {
        let input = _mm256_set_epi32(1 << 0, 2, 2, 2, 2, 2, 2, 2);
        let amounts = _mm256_set1_epi32(1);

        // No mask bits set: every lane is zeroed.
        let zeroed = _mm256_maskz_rorv_epi32(0, input, amounts);
        assert_eq_m256i(zeroed, _mm256_setzero_si256());

        // All eight mask bits set: every lane is rotated.
        let rotated = _mm256_maskz_rorv_epi32(0b11111111, input, amounts);
        assert_eq_m256i(rotated, _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1));
    }

    // Variable rotate right with a count of 1 in every lane: lanes holding 2
    // become 1 and `1 << 0` wraps around to `1 << 31`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_rorv_epi32() {
        let input = _mm_set_epi32(1 << 0, 2, 2, 2);
        let amounts = _mm_set1_epi32(1);
        let rotated = _mm_rorv_epi32(input, amounts);
        assert_eq_m128i(rotated, _mm_set_epi32(1 << 31, 1, 1, 1));
    }

    // Merge-masked variable rotate right (count 1 in every lane), checked with
    // an empty mask and the low four bits set.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_rorv_epi32() {
        let input = _mm_set_epi32(1 << 0, 2, 2, 2);
        let amounts = _mm_set1_epi32(1);

        // No mask bits set: the result is `src` (here `input`) unchanged.
        let kept = _mm_mask_rorv_epi32(input, 0, input, amounts);
        assert_eq_m128i(kept, input);

        // One mask bit per lane set: every lane is rotated.
        let rotated = _mm_mask_rorv_epi32(input, 0b00001111, input, amounts);
        assert_eq_m128i(rotated, _mm_set_epi32(1 << 31, 1, 1, 1));
    }

    // Zero-masked variable rotate right (count 1 in every lane), checked with
    // an empty mask and the low four bits set.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_rorv_epi32() {
        let input = _mm_set_epi32(1 << 0, 2, 2, 2);
        let amounts = _mm_set1_epi32(1);

        // No mask bits set: every lane is zeroed.
        let zeroed = _mm_maskz_rorv_epi32(0, input, amounts);
        assert_eq_m128i(zeroed, _mm_setzero_si128());

        // One mask bit per lane set: every lane is rotated.
        let rotated = _mm_maskz_rorv_epi32(0b00001111, input, amounts);
        assert_eq_m128i(rotated, _mm_set_epi32(1 << 31, 1, 1, 1));
    }

    // Variable shift left with a count of 1 in every lane: lanes holding 1
    // become 2 and the `1 << 31` lane becomes 0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_sllv_epi32() {
        let input = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
        let expected = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
        let amounts = _mm512_set1_epi32(1);
        let shifted = _mm512_sllv_epi32(input, amounts);
        assert_eq_m512i(shifted, expected);
    }

    // Merge-masked variable shift left (count 1 in every lane), checked with
    // an empty and a full mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_sllv_epi32() {
        let input = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
        let expected = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
        let amounts = _mm512_set1_epi32(1);

        // No mask bits set: the result is `src` (here `input`) unchanged.
        let kept = _mm512_mask_sllv_epi32(input, 0, input, amounts);
        assert_eq_m512i(kept, input);

        // All sixteen mask bits set: every lane is shifted.
        let shifted = _mm512_mask_sllv_epi32(input, 0b11111111_11111111, input, amounts);
        assert_eq_m512i(shifted, expected);
    }

    // Zero-masked variable shift left, checked with an empty and a half-set
    // mask; the lanes selected by the half-set mask all have a count of 1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_sllv_epi32() {
        let input = _mm512_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 << 31);
        let amounts = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
        let expected = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0);

        // No mask bits set: every lane is zeroed.
        let zeroed = _mm512_maskz_sllv_epi32(0, input, amounts);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());

        // Low eight mask bits set: those lanes are shifted, the rest are zeroed.
        let shifted = _mm512_maskz_sllv_epi32(0b00000000_11111111, input, amounts);
        assert_eq_m512i(shifted, expected);
    }

    // Merge-masked variable shift left (count 1 in every lane), checked with
    // an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_sllv_epi32() {
        let input = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);
        let amounts = _mm256_set1_epi32(1);

        // No mask bits set: the result is `src` (here `input`) unchanged.
        let kept = _mm256_mask_sllv_epi32(input, 0, input, amounts);
        assert_eq_m256i(kept, input);

        // All eight mask bits set: every lane is shifted.
        let shifted = _mm256_mask_sllv_epi32(input, 0b11111111, input, amounts);
        assert_eq_m256i(shifted, _mm256_set_epi32(0, 2, 2, 2, 2, 2, 2, 2));
    }

    // Zero-masked variable shift left (count 1 in every lane), checked with
    // an empty and a full mask.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_sllv_epi32() {
        let input = _mm256_set_epi32(1 << 31, 1, 1, 1, 1, 1, 1, 1);
        let amounts = _mm256_set1_epi32(1);

        // No mask bits set: every lane is zeroed.
        let zeroed = _mm256_maskz_sllv_epi32(0, input, amounts);
        assert_eq_m256i(zeroed, _mm256_setzero_si256());

        // All eight mask bits set: every lane is shifted.
        let shifted = _mm256_maskz_sllv_epi32(0b11111111, input, amounts);
        assert_eq_m256i(shifted, _mm256_set_epi32(0, 2, 2, 2, 2, 2, 2, 2));
    }

    // Merge-masked variable shift left (count 1 in every lane), checked with
    // an empty mask and the low four bits set.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_sllv_epi32() {
        let input = _mm_set_epi32(1 << 31, 1, 1, 1);
        let amounts = _mm_set1_epi32(1);

        // No mask bits set: the result is `src` (here `input`) unchanged.
        let kept = _mm_mask_sllv_epi32(input, 0, input, amounts);
        assert_eq_m128i(kept, input);

        // One mask bit per lane set: every lane is shifted.
        let shifted = _mm_mask_sllv_epi32(input, 0b00001111, input, amounts);
        assert_eq_m128i(shifted, _mm_set_epi32(0, 2, 2, 2));
    }

    // Zero-masked variable shift left (count 1 in every lane), checked with
    // an empty mask and the low four bits set.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_sllv_epi32() {
        let input = _mm_set_epi32(1 << 31, 1, 1, 1);
        let amounts = _mm_set1_epi32(1);

        // No mask bits set: every lane is zeroed.
        let zeroed = _mm_maskz_sllv_epi32(0, input, amounts);
        assert_eq_m128i(zeroed, _mm_setzero_si128());

        // One mask bit per lane set: every lane is shifted.
        let shifted = _mm_maskz_sllv_epi32(0b00001111, input, amounts);
        assert_eq_m128i(shifted, _mm_set_epi32(0, 2, 2, 2));
    }

    // Variable shift right with a count of 1 in every lane: lanes holding 2
    // become 1 and the zero lane stays 0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_srlv_epi32() {
        let input = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
        let expected = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
        let amounts = _mm512_set1_epi32(1);
        let shifted = _mm512_srlv_epi32(input, amounts);
        assert_eq_m512i(shifted, expected);
    }

    // Merge-masked variable shift right (count 1 in every lane), checked with
    // an empty and a full mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_srlv_epi32() {
        let input = _mm512_set_epi32(0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2);
        let expected = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
        let amounts = _mm512_set1_epi32(1);

        // No mask bits set: the result is `src` (here `input`) unchanged.
        let kept = _mm512_mask_srlv_epi32(input, 0, input, amounts);
        assert_eq_m512i(kept, input);

        // All sixteen mask bits set: every lane is shifted.
        let shifted = _mm512_mask_srlv_epi32(input, 0b11111111_11111111, input, amounts);
        assert_eq_m512i(shifted, expected);
    }

    // Zero-masked per-lane logical right shift on a 512-bit vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_srlv_epi32() {
        let input = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0);
        let shift = _mm512_set_epi32(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm512_maskz_srlv_epi32(0, input, shift);
        assert_eq_m512i(cleared, _mm512_setzero_si512());

        // Only the low eight mask bits are set: the upper eight lanes are zeroed.
        let shifted = _mm512_maskz_srlv_epi32(0b00000000_11111111, input, shift);
        let expected = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0);
        assert_eq_m512i(shifted, expected);
    }

    // Write-masked per-lane logical right shift on a 256-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_srlv_epi32() {
        let input = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0);
        let shift = _mm256_set1_epi32(1);

        // Zero mask: the result must equal `input`, which is also the first argument.
        let untouched = _mm256_mask_srlv_epi32(input, 0, input, shift);
        assert_eq_m256i(untouched, input);

        // Full mask: `1 << 5` shifted right by one is `1 << 4`.
        let shifted = _mm256_mask_srlv_epi32(input, 0b11111111, input, shift);
        let expected = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m256i(shifted, expected);
    }

    // Zero-masked per-lane logical right shift on a 256-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_srlv_epi32() {
        let input = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0);
        let shift = _mm256_set1_epi32(1);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm256_maskz_srlv_epi32(0, input, shift);
        assert_eq_m256i(cleared, _mm256_setzero_si256());

        // Full mask: `1 << 5` shifted right by one is `1 << 4`.
        let shifted = _mm256_maskz_srlv_epi32(0b11111111, input, shift);
        let expected = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m256i(shifted, expected);
    }

    // Write-masked per-lane logical right shift on a 128-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_srlv_epi32() {
        let input = _mm_set_epi32(1 << 5, 0, 0, 0);
        let shift = _mm_set1_epi32(1);

        // Zero mask: the result must equal `input`, which is also the first argument.
        let untouched = _mm_mask_srlv_epi32(input, 0, input, shift);
        assert_eq_m128i(untouched, input);

        // All four mask bits set: `1 << 5` shifted right by one is `1 << 4`.
        let shifted = _mm_mask_srlv_epi32(input, 0b00001111, input, shift);
        let expected = _mm_set_epi32(1 << 4, 0, 0, 0);
        assert_eq_m128i(shifted, expected);
    }

    // Zero-masked per-lane logical right shift on a 128-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_srlv_epi32() {
        let input = _mm_set_epi32(1 << 5, 0, 0, 0);
        let shift = _mm_set1_epi32(1);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm_maskz_srlv_epi32(0, input, shift);
        assert_eq_m128i(cleared, _mm_setzero_si128());

        // All four mask bits set: `1 << 5` shifted right by one is `1 << 4`.
        let shifted = _mm_maskz_srlv_epi32(0b00001111, input, shift);
        let expected = _mm_set_epi32(1 << 4, 0, 0, 0);
        assert_eq_m128i(shifted, expected);
    }

    // Unmasked left shift of a 512-bit vector by a count held in a 128-bit vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_sll_epi32() {
        #[rustfmt::skip]
        let input = _mm512_set_epi32(
            1 << 31, 1 << 0, 1 << 1, 1 << 2,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
        );
        let shift = _mm_set_epi32(0, 0, 0, 2);

        // The expected lanes correspond to a left shift by two bits; the top bit falls off.
        #[rustfmt::skip]
        let expected = _mm512_set_epi32(
            0, 1 << 2, 1 << 3, 1 << 4,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
        );
        assert_eq_m512i(_mm512_sll_epi32(input, shift), expected);
    }

    // Write-masked left shift of a 512-bit vector by a count held in a 128-bit vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_sll_epi32() {
        #[rustfmt::skip]
        let input = _mm512_set_epi32(
            1 << 31, 1 << 0, 1 << 1, 1 << 2,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
        );
        let shift = _mm_set_epi32(0, 0, 0, 2);

        // Zero mask: the result must equal `input`, which is also the first argument.
        let untouched = _mm512_mask_sll_epi32(input, 0, input, shift);
        assert_eq_m512i(untouched, input);

        // Full mask: the expected lanes correspond to a left shift by two bits.
        let shifted = _mm512_mask_sll_epi32(input, 0b11111111_11111111, input, shift);
        #[rustfmt::skip]
        let expected = _mm512_set_epi32(
            0, 1 << 2, 1 << 3, 1 << 4,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
        );
        assert_eq_m512i(shifted, expected);
    }

    // Zero-masked left shift of a 512-bit vector by a count held in a 128-bit vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_sll_epi32() {
        #[rustfmt::skip]
        let input = _mm512_set_epi32(
            1 << 31, 1 << 0, 1 << 1, 1 << 2,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 31,
        );
        let shift = _mm_set_epi32(2, 0, 0, 2);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm512_maskz_sll_epi32(0, input, shift);
        assert_eq_m512i(cleared, _mm512_setzero_si512());

        // Low eight mask bits set: the only non-zero low lane (`1 << 31`) is expected
        // to shift out, and the upper lanes are zeroed, so every lane ends up 0.
        let shifted = _mm512_maskz_sll_epi32(0b00000000_11111111, input, shift);
        let expected = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m512i(shifted, expected);
    }

    // Write-masked left shift of a 256-bit vector by a count held in a 128-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_sll_epi32() {
        let input = _mm256_set_epi32(1 << 13, 0, 0, 0, 0, 0, 0, 0);
        let shift = _mm_set_epi32(0, 0, 0, 1);

        // Zero mask: the result must equal `input`, which is also the first argument.
        let untouched = _mm256_mask_sll_epi32(input, 0, input, shift);
        assert_eq_m256i(untouched, input);

        // Full mask: `1 << 13` is expected to become `1 << 14`.
        let shifted = _mm256_mask_sll_epi32(input, 0b11111111, input, shift);
        let expected = _mm256_set_epi32(1 << 14, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m256i(shifted, expected);
    }

    // Zero-masked left shift of a 256-bit vector by a count held in a 128-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_sll_epi32() {
        let input = _mm256_set_epi32(1 << 13, 0, 0, 0, 0, 0, 0, 0);
        let shift = _mm_set_epi32(0, 0, 0, 1);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm256_maskz_sll_epi32(0, input, shift);
        assert_eq_m256i(cleared, _mm256_setzero_si256());

        // Full mask: `1 << 13` is expected to become `1 << 14`.
        let shifted = _mm256_maskz_sll_epi32(0b11111111, input, shift);
        let expected = _mm256_set_epi32(1 << 14, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m256i(shifted, expected);
    }

    // Write-masked left shift of a 128-bit vector by a count held in a 128-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_sll_epi32() {
        let input = _mm_set_epi32(1 << 13, 0, 0, 0);
        let shift = _mm_set_epi32(0, 0, 0, 1);

        // Zero mask: the result must equal `input`, which is also the first argument.
        let untouched = _mm_mask_sll_epi32(input, 0, input, shift);
        assert_eq_m128i(untouched, input);

        // All four mask bits set: `1 << 13` is expected to become `1 << 14`.
        let shifted = _mm_mask_sll_epi32(input, 0b00001111, input, shift);
        let expected = _mm_set_epi32(1 << 14, 0, 0, 0);
        assert_eq_m128i(shifted, expected);
    }

    // Zero-masked left shift of a 128-bit vector by a count held in a 128-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_sll_epi32() {
        let input = _mm_set_epi32(1 << 13, 0, 0, 0);
        let shift = _mm_set_epi32(0, 0, 0, 1);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm_maskz_sll_epi32(0, input, shift);
        assert_eq_m128i(cleared, _mm_setzero_si128());

        // All four mask bits set: `1 << 13` is expected to become `1 << 14`.
        let shifted = _mm_maskz_sll_epi32(0b00001111, input, shift);
        let expected = _mm_set_epi32(1 << 14, 0, 0, 0);
        assert_eq_m128i(shifted, expected);
    }

    // Unmasked logical right shift of a 512-bit vector by a count held in a 128-bit vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_srl_epi32() {
        #[rustfmt::skip]
        let input = _mm512_set_epi32(
            1 << 31, 1 << 0, 1 << 1, 1 << 2,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
        );
        let shift = _mm_set_epi32(0, 0, 0, 2);

        // The expected lanes correspond to a right shift by two bits with zero fill.
        #[rustfmt::skip]
        let expected = _mm512_set_epi32(
            1 << 29, 0, 0, 1 << 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
        );
        assert_eq_m512i(_mm512_srl_epi32(input, shift), expected);
    }

    // Write-masked logical right shift of a 512-bit vector by a 128-bit count vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_srl_epi32() {
        #[rustfmt::skip]
        let input = _mm512_set_epi32(
            1 << 31, 1 << 0, 1 << 1, 1 << 2,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
        );
        let shift = _mm_set_epi32(0, 0, 0, 2);

        // Zero mask: the result must equal `input`, which is also the first argument.
        let untouched = _mm512_mask_srl_epi32(input, 0, input, shift);
        assert_eq_m512i(untouched, input);

        // Full mask: the expected lanes correspond to a right shift by two bits.
        let shifted = _mm512_mask_srl_epi32(input, 0b11111111_11111111, input, shift);
        #[rustfmt::skip]
        let expected = _mm512_set_epi32(
            1 << 29, 0, 0, 1 << 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
        );
        assert_eq_m512i(shifted, expected);
    }

    // Zero-masked logical right shift of a 512-bit vector by a 128-bit count vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_srl_epi32() {
        #[rustfmt::skip]
        let input = _mm512_set_epi32(
            1 << 31, 1 << 0, 1 << 1, 1 << 2,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 31,
        );
        let shift = _mm_set_epi32(2, 0, 0, 2);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm512_maskz_srl_epi32(0, input, shift);
        assert_eq_m512i(cleared, _mm512_setzero_si512());

        // Low eight mask bits set: only the lowest lane survives, as `1 << 29`.
        let shifted = _mm512_maskz_srl_epi32(0b00000000_11111111, input, shift);
        let expected = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 29);
        assert_eq_m512i(shifted, expected);
    }

    // Write-masked logical right shift of a 256-bit vector by a 128-bit count vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_srl_epi32() {
        let input = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0);
        let shift = _mm_set_epi32(0, 0, 0, 1);

        // Zero mask: the result must equal `input`, which is also the first argument.
        let untouched = _mm256_mask_srl_epi32(input, 0, input, shift);
        assert_eq_m256i(untouched, input);

        // Full mask: `1 << 5` is expected to become `1 << 4`.
        let shifted = _mm256_mask_srl_epi32(input, 0b11111111, input, shift);
        let expected = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m256i(shifted, expected);
    }

    // Zero-masked logical right shift of a 256-bit vector by a 128-bit count vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_srl_epi32() {
        let input = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0);
        let shift = _mm_set_epi32(0, 0, 0, 1);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm256_maskz_srl_epi32(0, input, shift);
        assert_eq_m256i(cleared, _mm256_setzero_si256());

        // Full mask: `1 << 5` is expected to become `1 << 4`.
        let shifted = _mm256_maskz_srl_epi32(0b11111111, input, shift);
        let expected = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m256i(shifted, expected);
    }

    // Write-masked logical right shift of a 128-bit vector by a 128-bit count vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_srl_epi32() {
        let input = _mm_set_epi32(1 << 5, 0, 0, 0);
        let shift = _mm_set_epi32(0, 0, 0, 1);

        // Zero mask: the result must equal `input`, which is also the first argument.
        let untouched = _mm_mask_srl_epi32(input, 0, input, shift);
        assert_eq_m128i(untouched, input);

        // All four mask bits set: `1 << 5` is expected to become `1 << 4`.
        let shifted = _mm_mask_srl_epi32(input, 0b00001111, input, shift);
        let expected = _mm_set_epi32(1 << 4, 0, 0, 0);
        assert_eq_m128i(shifted, expected);
    }

    // Zero-masked logical right shift of a 128-bit vector by a 128-bit count vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_srl_epi32() {
        let input = _mm_set_epi32(1 << 5, 0, 0, 0);
        let shift = _mm_set_epi32(0, 0, 0, 1);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm_maskz_srl_epi32(0, input, shift);
        assert_eq_m128i(cleared, _mm_setzero_si128());

        // All four mask bits set: `1 << 5` is expected to become `1 << 4`.
        let shifted = _mm_maskz_srl_epi32(0b00001111, input, shift);
        let expected = _mm_set_epi32(1 << 4, 0, 0, 0);
        assert_eq_m128i(shifted, expected);
    }

    // Unmasked arithmetic right shift of a 512-bit vector by a 128-bit count vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_sra_epi32() {
        let input = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
        let shift = _mm_set_epi32(1, 0, 0, 2);

        // The expected lanes match a sign-preserving shift by two (e.g. -15 -> -4, 1 -> 0).
        let shifted = _mm512_sra_epi32(input, shift);
        let expected = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m512i(shifted, expected);
    }

    // Write-masked arithmetic right shift of a 512-bit vector by a 128-bit count vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_sra_epi32() {
        let input = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16);
        let shift = _mm_set_epi32(0, 0, 0, 2);

        // Zero mask: the result must equal `input`, which is also the first argument.
        let untouched = _mm512_mask_sra_epi32(input, 0, input, shift);
        assert_eq_m512i(untouched, input);

        // Full mask: the expected lanes match a sign-preserving shift by two.
        let shifted = _mm512_mask_sra_epi32(input, 0b11111111_11111111, input, shift);
        let expected = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4);
        assert_eq_m512i(shifted, expected);
    }

    // Zero-masked arithmetic right shift of a 512-bit vector by a 128-bit count vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_sra_epi32() {
        let input = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -15, -14);
        let shift = _mm_set_epi32(2, 0, 0, 2);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm512_maskz_sra_epi32(0, input, shift);
        assert_eq_m512i(cleared, _mm512_setzero_si512());

        // Low eight mask bits set: the upper lanes are zeroed; -15 and -14 both become -4.
        let shifted = _mm512_maskz_sra_epi32(0b00000000_11111111, input, shift);
        let expected = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -4);
        assert_eq_m512i(shifted, expected);
    }

    // Write-masked arithmetic right shift of a 256-bit vector by a 128-bit count vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_sra_epi32() {
        let input = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0);
        let shift = _mm_set_epi32(0, 0, 0, 1);

        // Zero mask: the result must equal `input`, which is also the first argument.
        let untouched = _mm256_mask_sra_epi32(input, 0, input, shift);
        assert_eq_m256i(untouched, input);

        // Full mask: `1 << 5` is expected to become `1 << 4`.
        let shifted = _mm256_mask_sra_epi32(input, 0b11111111, input, shift);
        let expected = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m256i(shifted, expected);
    }

    // Zero-masked arithmetic right shift of a 256-bit vector by a 128-bit count vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_sra_epi32() {
        let input = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0);
        let shift = _mm_set_epi32(0, 0, 0, 1);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm256_maskz_sra_epi32(0, input, shift);
        assert_eq_m256i(cleared, _mm256_setzero_si256());

        // Full mask: `1 << 5` is expected to become `1 << 4`.
        let shifted = _mm256_maskz_sra_epi32(0b11111111, input, shift);
        let expected = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m256i(shifted, expected);
    }

    // Write-masked arithmetic right shift of a 128-bit vector by a 128-bit count vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_sra_epi32() {
        let input = _mm_set_epi32(1 << 5, 0, 0, 0);
        let shift = _mm_set_epi32(0, 0, 0, 1);

        // Zero mask: the result must equal `input`, which is also the first argument.
        let untouched = _mm_mask_sra_epi32(input, 0, input, shift);
        assert_eq_m128i(untouched, input);

        // All four mask bits set: `1 << 5` is expected to become `1 << 4`.
        let shifted = _mm_mask_sra_epi32(input, 0b00001111, input, shift);
        let expected = _mm_set_epi32(1 << 4, 0, 0, 0);
        assert_eq_m128i(shifted, expected);
    }

    // Zero-masked arithmetic right shift of a 128-bit vector by a 128-bit count vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_sra_epi32() {
        let input = _mm_set_epi32(1 << 5, 0, 0, 0);
        let shift = _mm_set_epi32(0, 0, 0, 1);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm_maskz_sra_epi32(0, input, shift);
        assert_eq_m128i(cleared, _mm_setzero_si128());

        // All four mask bits set: `1 << 5` is expected to become `1 << 4`.
        let shifted = _mm_maskz_sra_epi32(0b00001111, input, shift);
        let expected = _mm_set_epi32(1 << 4, 0, 0, 0);
        assert_eq_m128i(shifted, expected);
    }

    // Unmasked per-lane arithmetic right shift on a 512-bit vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_srav_epi32() {
        let input = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
        let shift = _mm512_set_epi32(2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

        // Top four lanes shift by two; the lowest lane has a zero count and stays 1.
        let shifted = _mm512_srav_epi32(input, shift);
        let expected = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);
        assert_eq_m512i(shifted, expected);
    }

    // Write-masked per-lane arithmetic right shift on a 512-bit vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_srav_epi32() {
        let input = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16);
        let shift = _mm512_set_epi32(2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1);

        // Zero mask: the result must equal `input`, which is also the first argument.
        let untouched = _mm512_mask_srav_epi32(input, 0, input, shift);
        assert_eq_m512i(untouched, input);

        // Full mask: top four lanes shift by two, the lowest lane shifts by one (16 -> 8).
        let shifted = _mm512_mask_srav_epi32(input, 0b11111111_11111111, input, shift);
        let expected = _mm512_set_epi32(2, -2, 4, -4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8);
        assert_eq_m512i(shifted, expected);
    }

    // Zero-masked per-lane arithmetic right shift on a 512-bit vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_srav_epi32() {
        let input = _mm512_set_epi32(8, -8, 16, -15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -15, -14);
        let shift = _mm512_set_epi32(2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm512_maskz_srav_epi32(0, input, shift);
        assert_eq_m512i(cleared, _mm512_setzero_si512());

        // Low eight mask bits set: the upper lanes are zeroed; -15 and -14 both become -4.
        let shifted = _mm512_maskz_srav_epi32(0b00000000_11111111, input, shift);
        let expected = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4, -4);
        assert_eq_m512i(shifted, expected);
    }

    // Write-masked per-lane arithmetic right shift on a 256-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_srav_epi32() {
        let input = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0);
        let shift = _mm256_set1_epi32(1);

        // Zero mask: the result must equal `input`, which is also the first argument.
        let untouched = _mm256_mask_srav_epi32(input, 0, input, shift);
        assert_eq_m256i(untouched, input);

        // Full mask: `1 << 5` shifted right by one is `1 << 4`.
        let shifted = _mm256_mask_srav_epi32(input, 0b11111111, input, shift);
        let expected = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m256i(shifted, expected);
    }

    // Zero-masked per-lane arithmetic right shift on a 256-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_srav_epi32() {
        let input = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0);
        let shift = _mm256_set1_epi32(1);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm256_maskz_srav_epi32(0, input, shift);
        assert_eq_m256i(cleared, _mm256_setzero_si256());

        // Full mask: `1 << 5` shifted right by one is `1 << 4`.
        let shifted = _mm256_maskz_srav_epi32(0b11111111, input, shift);
        let expected = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m256i(shifted, expected);
    }

    // Write-masked per-lane arithmetic right shift on a 128-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_srav_epi32() {
        let input = _mm_set_epi32(1 << 5, 0, 0, 0);
        let shift = _mm_set1_epi32(1);

        // Zero mask: the result must equal `input`, which is also the first argument.
        let untouched = _mm_mask_srav_epi32(input, 0, input, shift);
        assert_eq_m128i(untouched, input);

        // All four mask bits set: `1 << 5` shifted right by one is `1 << 4`.
        let shifted = _mm_mask_srav_epi32(input, 0b00001111, input, shift);
        let expected = _mm_set_epi32(1 << 4, 0, 0, 0);
        assert_eq_m128i(shifted, expected);
    }

    // Zero-masked per-lane arithmetic right shift on a 128-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_srav_epi32() {
        let input = _mm_set_epi32(1 << 5, 0, 0, 0);
        let shift = _mm_set1_epi32(1);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm_maskz_srav_epi32(0, input, shift);
        assert_eq_m128i(cleared, _mm_setzero_si128());

        // All four mask bits set: `1 << 5` shifted right by one is `1 << 4`.
        let shifted = _mm_maskz_srav_epi32(0b00001111, input, shift);
        let expected = _mm_set_epi32(1 << 4, 0, 0, 0);
        assert_eq_m128i(shifted, expected);
    }

    // Unmasked arithmetic right shift by the const-generic immediate 2 on a 512-bit vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_srai_epi32() {
        let input = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, -15);

        // Sign-preserving shift by two: 8 -> 2, -8 -> -2, 16 -> 4, -15 -> -4.
        let shifted = _mm512_srai_epi32::<2>(input);
        let expected = _mm512_set_epi32(2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, -4);
        assert_eq_m512i(shifted, expected);
    }

    // Write-masked arithmetic right shift by the immediate 2 on a 512-bit vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_srai_epi32() {
        let input = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, -15);

        // Zero mask: the result must equal `input`, which is also the first argument.
        let untouched = _mm512_mask_srai_epi32::<2>(input, 0, input);
        assert_eq_m512i(untouched, input);

        // Full mask: sign-preserving shift by two (15 -> 3, -15 -> -4).
        let shifted = _mm512_mask_srai_epi32::<2>(input, 0b11111111_11111111, input);
        let expected = _mm512_set_epi32(2, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, -4);
        assert_eq_m512i(shifted, expected);
    }

    // Zero-masked arithmetic right shift by the immediate 2 on a 512-bit vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_srai_epi32() {
        let input = _mm512_set_epi32(8, -8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, -15);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm512_maskz_srai_epi32::<2>(0, input);
        assert_eq_m512i(cleared, _mm512_setzero_si512());

        // Low eight mask bits set: the upper lanes (8, -8) are zeroed; 15 -> 3, -15 -> -4.
        let shifted = _mm512_maskz_srai_epi32::<2>(0b00000000_11111111, input);
        let expected = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, -4);
        assert_eq_m512i(shifted, expected);
    }

    // Write-masked arithmetic right shift by the immediate 1 on a 256-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_srai_epi32() {
        let input = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0);

        // Zero mask: the result must equal `input`, which is also the first argument.
        let untouched = _mm256_mask_srai_epi32::<1>(input, 0, input);
        assert_eq_m256i(untouched, input);

        // Full mask: `1 << 5` shifted right by one is `1 << 4`.
        let shifted = _mm256_mask_srai_epi32::<1>(input, 0b11111111, input);
        let expected = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m256i(shifted, expected);
    }

    // Zero-masked arithmetic right shift by the immediate 1 on a 256-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_srai_epi32() {
        let input = _mm256_set_epi32(1 << 5, 0, 0, 0, 0, 0, 0, 0);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm256_maskz_srai_epi32::<1>(0, input);
        assert_eq_m256i(cleared, _mm256_setzero_si256());

        // Full mask: `1 << 5` shifted right by one is `1 << 4`.
        let shifted = _mm256_maskz_srai_epi32::<1>(0b11111111, input);
        let expected = _mm256_set_epi32(1 << 4, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m256i(shifted, expected);
    }

    // Write-masked arithmetic right shift by the immediate 1 on a 128-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_srai_epi32() {
        let input = _mm_set_epi32(1 << 5, 0, 0, 0);

        // Zero mask: the result must equal `input`, which is also the first argument.
        let untouched = _mm_mask_srai_epi32::<1>(input, 0, input);
        assert_eq_m128i(untouched, input);

        // All four mask bits set: `1 << 5` shifted right by one is `1 << 4`.
        let shifted = _mm_mask_srai_epi32::<1>(input, 0b00001111, input);
        let expected = _mm_set_epi32(1 << 4, 0, 0, 0);
        assert_eq_m128i(shifted, expected);
    }

    // Zero-masked arithmetic right shift by the immediate 1 on a 128-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_srai_epi32() {
        let input = _mm_set_epi32(1 << 5, 0, 0, 0);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm_maskz_srai_epi32::<1>(0, input);
        assert_eq_m128i(cleared, _mm_setzero_si128());

        // All four mask bits set: `1 << 5` shifted right by one is `1 << 4`.
        let shifted = _mm_maskz_srai_epi32::<1>(0b00001111, input);
        let expected = _mm_set_epi32(1 << 4, 0, 0, 0);
        assert_eq_m128i(shifted, expected);
    }

    // Unmasked immediate-controlled permute of f32 lanes on a 512-bit vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_permute_ps() {
        #[rustfmt::skip]
        let input = _mm512_setr_ps(
            0., 1., 2., 3.,
            4., 5., 6., 7.,
            8., 9., 10., 11.,
            12., 13., 14., 15.,
        );

        // With control 0b11_11_11_11 each group of four is filled with its last element.
        let permuted = _mm512_permute_ps::<0b11_11_11_11>(input);
        #[rustfmt::skip]
        let expected = _mm512_setr_ps(
            3., 3., 3., 3.,
            7., 7., 7., 7.,
            11., 11., 11., 11.,
            15., 15., 15., 15.,
        );
        assert_eq_m512(permuted, expected);
    }

    // Write-masked immediate-controlled permute of f32 lanes on a 512-bit vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_permute_ps() {
        #[rustfmt::skip]
        let input = _mm512_setr_ps(
            0., 1., 2., 3.,
            4., 5., 6., 7.,
            8., 9., 10., 11.,
            12., 13., 14., 15.,
        );

        // Zero mask: the result must equal `input`, which is also the first argument.
        let untouched = _mm512_mask_permute_ps::<0b11_11_11_11>(input, 0, input);
        assert_eq_m512(untouched, input);

        // Full mask: each group of four is filled with its last element.
        let permuted = _mm512_mask_permute_ps::<0b11_11_11_11>(input, 0b11111111_11111111, input);
        #[rustfmt::skip]
        let expected = _mm512_setr_ps(
            3., 3., 3., 3.,
            7., 7., 7., 7.,
            11., 11., 11., 11.,
            15., 15., 15., 15.,
        );
        assert_eq_m512(permuted, expected);
    }

    // Zero-masked immediate-controlled permute of f32 lanes on a 512-bit vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_permute_ps() {
        #[rustfmt::skip]
        let input = _mm512_setr_ps(
            0., 1., 2., 3.,
            4., 5., 6., 7.,
            8., 9., 10., 11.,
            12., 13., 14., 15.,
        );

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm512_maskz_permute_ps::<0b11_11_11_11>(0, input);
        assert_eq_m512(cleared, _mm512_setzero_ps());

        // Full mask: each group of four is filled with its last element.
        let permuted = _mm512_maskz_permute_ps::<0b11_11_11_11>(0b11111111_11111111, input);
        #[rustfmt::skip]
        let expected = _mm512_setr_ps(
            3., 3., 3., 3.,
            7., 7., 7., 7.,
            11., 11., 11., 11.,
            15., 15., 15., 15.,
        );
        assert_eq_m512(permuted, expected);
    }

    // Write-masked immediate-controlled permute of f32 lanes on a 256-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_permute_ps() {
        let input = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);

        // Zero mask: the result must equal `input`, which is also the first argument.
        let untouched = _mm256_mask_permute_ps::<0b11_11_11_11>(input, 0, input);
        assert_eq_m256(untouched, input);

        // Full mask: each half is filled with a single value taken from that half.
        let permuted = _mm256_mask_permute_ps::<0b11_11_11_11>(input, 0b11111111, input);
        let expected = _mm256_set_ps(0., 0., 0., 0., 4., 4., 4., 4.);
        assert_eq_m256(permuted, expected);
    }

    // Zero-masked immediate-controlled permute of f32 lanes on a 256-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_permute_ps() {
        let input = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm256_maskz_permute_ps::<0b11_11_11_11>(0, input);
        assert_eq_m256(cleared, _mm256_setzero_ps());

        // Full mask: each half is filled with a single value taken from that half.
        let permuted = _mm256_maskz_permute_ps::<0b11_11_11_11>(0b11111111, input);
        let expected = _mm256_set_ps(0., 0., 0., 0., 4., 4., 4., 4.);
        assert_eq_m256(permuted, expected);
    }

    // Write-masked immediate-controlled permute of f32 lanes on a 128-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_permute_ps() {
        let input = _mm_set_ps(0., 1., 2., 3.);

        // Zero mask: the result must equal `input`, which is also the first argument.
        let untouched = _mm_mask_permute_ps::<0b11_11_11_11>(input, 0, input);
        assert_eq_m128(untouched, input);

        // All four mask bits set: every lane receives the same element, whose value is 0.
        let permuted = _mm_mask_permute_ps::<0b11_11_11_11>(input, 0b00001111, input);
        let expected = _mm_set_ps(0., 0., 0., 0.);
        assert_eq_m128(permuted, expected);
    }

    // Zero-masked immediate-controlled permute of f32 lanes on a 128-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_permute_ps() {
        let input = _mm_set_ps(0., 1., 2., 3.);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm_maskz_permute_ps::<0b11_11_11_11>(0, input);
        assert_eq_m128(cleared, _mm_setzero_ps());

        // All four mask bits set: every lane receives the same element, whose value is 0.
        let permuted = _mm_maskz_permute_ps::<0b11_11_11_11>(0b00001111, input);
        let expected = _mm_set_ps(0., 0., 0., 0.);
        assert_eq_m128(permuted, expected);
    }

    // Unmasked index-driven permute of i32 lanes on a 512-bit vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_permutevar_epi32() {
        let index = _mm512_set1_epi32(1);
        let table = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

        // Every index is 1, so each lane is expected to hold the value 14.
        let permuted = _mm512_permutevar_epi32(index, table);
        assert_eq_m512i(permuted, _mm512_set1_epi32(14));
    }

    // Write-masked index-driven permute of i32 lanes on a 512-bit vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_permutevar_epi32() {
        let index = _mm512_set1_epi32(1);
        let table = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

        // Zero mask: the result must equal `table`, which is also the first argument.
        let untouched = _mm512_mask_permutevar_epi32(table, 0, index, table);
        assert_eq_m512i(untouched, table);

        // Full mask: every index is 1, so each lane is expected to hold 14.
        let permuted = _mm512_mask_permutevar_epi32(table, 0b11111111_11111111, index, table);
        assert_eq_m512i(permuted, _mm512_set1_epi32(14));
    }

    // Unmasked permute of f32 lanes on a 512-bit vector, driven by a control vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_permutevar_ps() {
        #[rustfmt::skip]
        let input = _mm512_set_ps(
            0., 1., 2., 3.,
            4., 5., 6., 7.,
            8., 9., 10., 11.,
            12., 13., 14., 15.,
        );
        let control = _mm512_set1_epi32(0b01);

        // Each group of four is expected to be filled with one value from that group.
        let permuted = _mm512_permutevar_ps(input, control);
        #[rustfmt::skip]
        let expected = _mm512_set_ps(
            2., 2., 2., 2.,
            6., 6., 6., 6.,
            10., 10., 10., 10.,
            14., 14., 14., 14.,
        );
        assert_eq_m512(permuted, expected);
    }

    // Write-masked permute of f32 lanes on a 512-bit vector, driven by a control vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_permutevar_ps() {
        #[rustfmt::skip]
        let input = _mm512_set_ps(
            0., 1., 2., 3.,
            4., 5., 6., 7.,
            8., 9., 10., 11.,
            12., 13., 14., 15.,
        );
        let control = _mm512_set1_epi32(0b01);

        // Zero mask: the result must equal `input`, which is also the first argument.
        let untouched = _mm512_mask_permutevar_ps(input, 0, input, control);
        assert_eq_m512(untouched, input);

        // Full mask: each group of four is filled with one value from that group.
        let permuted = _mm512_mask_permutevar_ps(input, 0b11111111_11111111, input, control);
        #[rustfmt::skip]
        let expected = _mm512_set_ps(
            2., 2., 2., 2.,
            6., 6., 6., 6.,
            10., 10., 10., 10.,
            14., 14., 14., 14.,
        );
        assert_eq_m512(permuted, expected);
    }

    // Zero-masked permute of f32 lanes on a 512-bit vector, driven by a control vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_permutevar_ps() {
        #[rustfmt::skip]
        let input = _mm512_set_ps(
            0., 1., 2., 3.,
            4., 5., 6., 7.,
            8., 9., 10., 11.,
            12., 13., 14., 15.,
        );
        let control = _mm512_set1_epi32(0b01);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm512_maskz_permutevar_ps(0, input, control);
        assert_eq_m512(cleared, _mm512_setzero_ps());

        // Low eight mask bits set: the upper eight lanes are zeroed.
        let permuted = _mm512_maskz_permutevar_ps(0b00000000_11111111, input, control);
        #[rustfmt::skip]
        let expected = _mm512_set_ps(
            0., 0., 0., 0.,
            0., 0., 0., 0.,
            10., 10., 10., 10.,
            14., 14., 14., 14.,
        );
        assert_eq_m512(permuted, expected);
    }

    // Write-masked permute of f32 lanes on a 256-bit vector, driven by a control vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_permutevar_ps() {
        let input = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let control = _mm256_set1_epi32(0b01);

        // Zero mask: the result must equal `input`, which is also the first argument.
        let untouched = _mm256_mask_permutevar_ps(input, 0, input, control);
        assert_eq_m256(untouched, input);

        // Full mask: each half is filled with one value taken from that half.
        let permuted = _mm256_mask_permutevar_ps(input, 0b11111111, input, control);
        let expected = _mm256_set_ps(2., 2., 2., 2., 6., 6., 6., 6.);
        assert_eq_m256(permuted, expected);
    }

    // Zero-masked permute of f32 lanes on a 256-bit vector, driven by a control vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_permutevar_ps() {
        let input = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let control = _mm256_set1_epi32(0b01);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm256_maskz_permutevar_ps(0, input, control);
        assert_eq_m256(cleared, _mm256_setzero_ps());

        // Full mask: each half is filled with one value taken from that half.
        let permuted = _mm256_maskz_permutevar_ps(0b11111111, input, control);
        let expected = _mm256_set_ps(2., 2., 2., 2., 6., 6., 6., 6.);
        assert_eq_m256(permuted, expected);
    }

    // Write-masked permute of f32 lanes on a 128-bit vector, driven by a control vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_permutevar_ps() {
        let input = _mm_set_ps(0., 1., 2., 3.);
        let control = _mm_set1_epi32(0b01);

        // Zero mask: the result must equal `input`, which is also the first argument.
        let untouched = _mm_mask_permutevar_ps(input, 0, input, control);
        assert_eq_m128(untouched, input);

        // All four mask bits set: every lane is expected to hold 2.
        let permuted = _mm_mask_permutevar_ps(input, 0b00001111, input, control);
        let expected = _mm_set_ps(2., 2., 2., 2.);
        assert_eq_m128(permuted, expected);
    }

    // Zero-masked permute of f32 lanes on a 128-bit vector, driven by a control vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_permutevar_ps() {
        let input = _mm_set_ps(0., 1., 2., 3.);
        let control = _mm_set1_epi32(0b01);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm_maskz_permutevar_ps(0, input, control);
        assert_eq_m128(cleared, _mm_setzero_ps());

        // All four mask bits set: every lane is expected to hold 2.
        let permuted = _mm_maskz_permutevar_ps(0b00001111, input, control);
        let expected = _mm_set_ps(2., 2., 2., 2.);
        assert_eq_m128(permuted, expected);
    }

    // Unmasked cross-lane index-driven permute of i32 lanes on a 512-bit vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_permutexvar_epi32() {
        let index = _mm512_set1_epi32(1);
        let table = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

        // Every index is 1, so each lane is expected to hold the value 14.
        let permuted = _mm512_permutexvar_epi32(index, table);
        assert_eq_m512i(permuted, _mm512_set1_epi32(14));
    }

    // Write-masked cross-lane index-driven permute of i32 lanes on a 512-bit vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_permutexvar_epi32() {
        let index = _mm512_set1_epi32(1);
        let table = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

        // Zero mask: the result must equal `table`, which is also the first argument.
        let untouched = _mm512_mask_permutexvar_epi32(table, 0, index, table);
        assert_eq_m512i(untouched, table);

        // Full mask: every index is 1, so each lane is expected to hold 14.
        let permuted = _mm512_mask_permutexvar_epi32(table, 0b11111111_11111111, index, table);
        assert_eq_m512i(permuted, _mm512_set1_epi32(14));
    }

    // Zero-masked cross-lane index-driven permute of i32 lanes on a 512-bit vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_permutexvar_epi32() {
        let index = _mm512_set1_epi32(1);
        let table = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm512_maskz_permutexvar_epi32(0, index, table);
        assert_eq_m512i(cleared, _mm512_setzero_si512());

        // Low eight mask bits set: the low lanes hold 14, the upper lanes are zeroed.
        let permuted = _mm512_maskz_permutexvar_epi32(0b00000000_11111111, index, table);
        let expected = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 14, 14, 14, 14, 14, 14, 14, 14);
        assert_eq_m512i(permuted, expected);
    }

    // Unmasked cross-lane index-driven permute of i32 lanes on a 256-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_permutexvar_epi32() {
        let index = _mm256_set1_epi32(1);
        let table = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);

        // Every index is 1, so each lane is expected to hold the value 6.
        let permuted = _mm256_permutexvar_epi32(index, table);
        assert_eq_m256i(permuted, _mm256_set1_epi32(6));
    }

    // Write-masked cross-lane index-driven permute of i32 lanes on a 256-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_permutexvar_epi32() {
        let index = _mm256_set1_epi32(1);
        let table = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);

        // Zero mask: the result must equal `table`, which is also the first argument.
        let untouched = _mm256_mask_permutexvar_epi32(table, 0, index, table);
        assert_eq_m256i(untouched, table);

        // Full mask: every index is 1, so each lane is expected to hold 6.
        let permuted = _mm256_mask_permutexvar_epi32(table, 0b11111111, index, table);
        assert_eq_m256i(permuted, _mm256_set1_epi32(6));
    }

    // Zero-masked cross-lane index-driven permute of i32 lanes on a 256-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_permutexvar_epi32() {
        let index = _mm256_set1_epi32(1);
        let table = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm256_maskz_permutexvar_epi32(0, index, table);
        assert_eq_m256i(cleared, _mm256_setzero_si256());

        // Full mask: every index is 1, so each lane is expected to hold 6.
        let permuted = _mm256_maskz_permutexvar_epi32(0b11111111, index, table);
        assert_eq_m256i(permuted, _mm256_set1_epi32(6));
    }

    // Unmasked cross-lane index-driven permute of f32 lanes on a 512-bit vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_permutexvar_ps() {
        let index = _mm512_set1_epi32(1);
        #[rustfmt::skip]
        let table = _mm512_set_ps(
            0., 1., 2., 3.,
            4., 5., 6., 7.,
            8., 9., 10., 11.,
            12., 13., 14., 15.,
        );

        // Every index is 1, so each lane is expected to hold the value 14.
        let permuted = _mm512_permutexvar_ps(index, table);
        assert_eq_m512(permuted, _mm512_set1_ps(14.));
    }

    // Write-masked cross-lane index-driven permute of f32 lanes on a 512-bit vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_permutexvar_ps() {
        let index = _mm512_set1_epi32(1);
        #[rustfmt::skip]
        let table = _mm512_set_ps(
            0., 1., 2., 3.,
            4., 5., 6., 7.,
            8., 9., 10., 11.,
            12., 13., 14., 15.,
        );

        // Zero mask: the result must equal `table`, which is also the first argument.
        let untouched = _mm512_mask_permutexvar_ps(table, 0, index, table);
        assert_eq_m512(untouched, table);

        // Full mask: every index is 1, so each lane is expected to hold 14.
        let permuted = _mm512_mask_permutexvar_ps(table, 0b11111111_11111111, index, table);
        assert_eq_m512(permuted, _mm512_set1_ps(14.));
    }

    // Zero-masked cross-lane index-driven permute of f32 lanes on a 512-bit vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_permutexvar_ps() {
        let index = _mm512_set1_epi32(1);
        #[rustfmt::skip]
        let table = _mm512_set_ps(
            0., 1., 2., 3.,
            4., 5., 6., 7.,
            8., 9., 10., 11.,
            12., 13., 14., 15.,
        );

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm512_maskz_permutexvar_ps(0, index, table);
        assert_eq_m512(cleared, _mm512_setzero_ps());

        // Low eight mask bits set: the low lanes hold 14, the upper lanes are zeroed.
        let permuted = _mm512_maskz_permutexvar_ps(0b00000000_11111111, index, table);
        #[rustfmt::skip]
        let expected = _mm512_set_ps(
            0., 0., 0., 0.,
            0., 0., 0., 0.,
            14., 14., 14., 14.,
            14., 14., 14., 14.,
        );
        assert_eq_m512(permuted, expected);
    }

    // Unmasked cross-lane index-driven permute of f32 lanes on a 256-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_permutexvar_ps() {
        let index = _mm256_set1_epi32(1);
        let table = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);

        // Every index is 1, so each lane is expected to hold the value 6.
        let permuted = _mm256_permutexvar_ps(index, table);
        assert_eq_m256(permuted, _mm256_set1_ps(6.));
    }

    // Write-masked cross-lane index-driven permute of f32 lanes on a 256-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_permutexvar_ps() {
        let index = _mm256_set1_epi32(1);
        let table = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);

        // Zero mask: the result must equal `table`, which is also the first argument.
        let untouched = _mm256_mask_permutexvar_ps(table, 0, index, table);
        assert_eq_m256(untouched, table);

        // Full mask: every index is 1, so each lane is expected to hold 6.
        let permuted = _mm256_mask_permutexvar_ps(table, 0b11111111, index, table);
        assert_eq_m256(permuted, _mm256_set1_ps(6.));
    }

    // Zero-masked cross-lane index-driven permute of f32 lanes on a 256-bit vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_permutexvar_ps() {
        let index = _mm256_set1_epi32(1);
        let table = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);

        // Zero mask: the whole result is expected to be zero.
        let cleared = _mm256_maskz_permutexvar_ps(0, index, table);
        assert_eq_m256(cleared, _mm256_setzero_ps());

        // Full mask: every index is 1, so each lane is expected to hold 6.
        let permuted = _mm256_maskz_permutexvar_ps(0b11111111, index, table);
        assert_eq_m256(permuted, _mm256_set1_ps(6.));
    }

    // Unmasked two-source index-driven permute of i32 lanes on 512-bit vectors.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_permutex2var_epi32() {
        let first = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        // Indices alternate between small values and `1 << 4`.
        #[rustfmt::skip]
        let selector = _mm512_set_epi32(
            1, 1 << 4, 2, 1 << 4,
            3, 1 << 4, 4, 1 << 4,
            5, 1 << 4, 6, 1 << 4,
            7, 1 << 4, 8, 1 << 4,
        );
        let second = _mm512_set1_epi32(100);

        // Lanes whose index is `1 << 4` are expected to hold 100 (the value in `second`);
        // the other lanes hold values found in `first`.
        let permuted = _mm512_permutex2var_epi32(first, selector, second);
        #[rustfmt::skip]
        let expected = _mm512_set_epi32(
            14, 100, 13, 100,
            12, 100, 11, 100,
            10, 100, 9, 100,
            8, 100, 7, 100,
        );
        assert_eq_m512i(permuted, expected);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_permutex2var_epi32() {
        // Index value with bit 4 set; the expected vector holds `b`'s 100 in those lanes.
        let sel_b: i32 = 1 << 4;
        let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let b = _mm512_set1_epi32(100);
        #[rustfmt::skip]
        let idx = _mm512_set_epi32(
            1, sel_b, 2, sel_b,
            3, sel_b, 4, sel_b,
            5, sel_b, 6, sel_b,
            7, sel_b, 8, sel_b,
        );

        // Zero writemask: the result must equal `a`.
        let kept = _mm512_mask_permutex2var_epi32(a, 0, idx, b);
        assert_eq_m512i(kept, a);

        // All sixteen mask bits set.
        let out = _mm512_mask_permutex2var_epi32(a, 0b11111111_11111111, idx, b);
        let expected = _mm512_set_epi32(
            14, 100, 13, 100, 12, 100, 11, 100, 10, 100, 9, 100, 8, 100, 7, 100,
        );
        assert_eq_m512i(out, expected);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_permutex2var_epi32() {
        // Index value with bit 4 set; the expected vector holds `b`'s 100 in those lanes.
        let sel_b: i32 = 1 << 4;
        let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let b = _mm512_set1_epi32(100);
        #[rustfmt::skip]
        let idx = _mm512_set_epi32(
            1, sel_b, 2, sel_b,
            3, sel_b, 4, sel_b,
            5, sel_b, 6, sel_b,
            7, sel_b, 8, sel_b,
        );

        // Zero zeromask: the result must be all zeros.
        let zeroed = _mm512_maskz_permutex2var_epi32(0, a, idx, b);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());

        // Only the low eight mask bits are set.
        let out = _mm512_maskz_permutex2var_epi32(0b00000000_11111111, a, idx, b);
        let expected = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 10, 100, 9, 100, 8, 100, 7, 100);
        assert_eq_m512i(out, expected);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask2_permutex2var_epi32() {
        // Index value with bit 4 set; the expected vector holds `b`'s 100 in the active lanes.
        let sel_b: i32 = 1 << 4;
        let a = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let b = _mm512_set1_epi32(100);
        #[rustfmt::skip]
        let idx = _mm512_set_epi32(
            1000, sel_b, 2000, sel_b,
            3000, sel_b, 4000, sel_b,
            5, sel_b, 6, sel_b,
            7, sel_b, 8, sel_b,
        );

        // Zero mask: the result must equal `idx`.
        let kept = _mm512_mask2_permutex2var_epi32(a, idx, 0, b);
        assert_eq_m512i(kept, idx);

        // Only the low eight mask bits are set; the other lanes are expected to match `idx`.
        let out = _mm512_mask2_permutex2var_epi32(a, idx, 0b00000000_11111111, b);
        #[rustfmt::skip]
        let expected = _mm512_set_epi32(
            1000, sel_b, 2000, sel_b,
            3000, sel_b, 4000, sel_b,
            10, 100, 9, 100,
            8, 100, 7, 100,
        );
        assert_eq_m512i(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_permutex2var_epi32() {
        // Index value with bit 3 set; the expected vector holds `b`'s 100 in those lanes.
        let sel_b: i32 = 1 << 3;
        let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm256_set1_epi32(100);
        let idx = _mm256_set_epi32(1, sel_b, 2, sel_b, 3, sel_b, 4, sel_b);
        let expected = _mm256_set_epi32(6, 100, 5, 100, 4, 100, 3, 100);
        let out = _mm256_permutex2var_epi32(a, idx, b);
        assert_eq_m256i(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_permutex2var_epi32() {
        // Index value with bit 3 set; the expected vector holds `b`'s 100 in those lanes.
        let sel_b: i32 = 1 << 3;
        let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm256_set1_epi32(100);
        let idx = _mm256_set_epi32(1, sel_b, 2, sel_b, 3, sel_b, 4, sel_b);

        // Zero writemask: the result must equal `a`.
        let kept = _mm256_mask_permutex2var_epi32(a, 0, idx, b);
        assert_eq_m256i(kept, a);

        // All eight mask bits set.
        let out = _mm256_mask_permutex2var_epi32(a, 0b11111111, idx, b);
        let expected = _mm256_set_epi32(6, 100, 5, 100, 4, 100, 3, 100);
        assert_eq_m256i(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_permutex2var_epi32() {
        // Index value with bit 3 set; the expected vector holds `b`'s 100 in those lanes.
        let sel_b: i32 = 1 << 3;
        let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm256_set1_epi32(100);
        let idx = _mm256_set_epi32(1, sel_b, 2, sel_b, 3, sel_b, 4, sel_b);

        // Zero zeromask: the result must be all zeros.
        let zeroed = _mm256_maskz_permutex2var_epi32(0, a, idx, b);
        assert_eq_m256i(zeroed, _mm256_setzero_si256());

        // All eight mask bits set.
        let out = _mm256_maskz_permutex2var_epi32(0b11111111, a, idx, b);
        let expected = _mm256_set_epi32(6, 100, 5, 100, 4, 100, 3, 100);
        assert_eq_m256i(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask2_permutex2var_epi32() {
        // Index value with bit 3 set; the expected vector holds `b`'s 100 in those lanes.
        let sel_b: i32 = 1 << 3;
        let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let b = _mm256_set1_epi32(100);
        let idx = _mm256_set_epi32(1, sel_b, 2, sel_b, 3, sel_b, 4, sel_b);

        // Zero mask: the result must equal `idx`.
        let kept = _mm256_mask2_permutex2var_epi32(a, idx, 0, b);
        assert_eq_m256i(kept, idx);

        // All eight mask bits set.
        let out = _mm256_mask2_permutex2var_epi32(a, idx, 0b11111111, b);
        let expected = _mm256_set_epi32(6, 100, 5, 100, 4, 100, 3, 100);
        assert_eq_m256i(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_permutex2var_epi32() {
        // Index value with bit 2 set; the expected vector holds `b`'s 100 in those lanes.
        let sel_b: i32 = 1 << 2;
        let a = _mm_set_epi32(0, 1, 2, 3);
        let b = _mm_set1_epi32(100);
        let idx = _mm_set_epi32(1, sel_b, 2, sel_b);
        let expected = _mm_set_epi32(2, 100, 1, 100);
        let out = _mm_permutex2var_epi32(a, idx, b);
        assert_eq_m128i(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_permutex2var_epi32() {
        // Index value with bit 2 set; the expected vector holds `b`'s 100 in those lanes.
        let sel_b: i32 = 1 << 2;
        let a = _mm_set_epi32(0, 1, 2, 3);
        let b = _mm_set1_epi32(100);
        let idx = _mm_set_epi32(1, sel_b, 2, sel_b);

        // Zero writemask: the result must equal `a`.
        let kept = _mm_mask_permutex2var_epi32(a, 0, idx, b);
        assert_eq_m128i(kept, a);

        // Low four mask bits set.
        let out = _mm_mask_permutex2var_epi32(a, 0b00001111, idx, b);
        let expected = _mm_set_epi32(2, 100, 1, 100);
        assert_eq_m128i(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_permutex2var_epi32() {
        // Index value with bit 2 set; the expected vector holds `b`'s 100 in those lanes.
        let sel_b: i32 = 1 << 2;
        let a = _mm_set_epi32(0, 1, 2, 3);
        let b = _mm_set1_epi32(100);
        let idx = _mm_set_epi32(1, sel_b, 2, sel_b);

        // Zero zeromask: the result must be all zeros.
        let zeroed = _mm_maskz_permutex2var_epi32(0, a, idx, b);
        assert_eq_m128i(zeroed, _mm_setzero_si128());

        // Low four mask bits set.
        let out = _mm_maskz_permutex2var_epi32(0b00001111, a, idx, b);
        let expected = _mm_set_epi32(2, 100, 1, 100);
        assert_eq_m128i(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask2_permutex2var_epi32() {
        // Index value with bit 2 set; the expected vector holds `b`'s 100 in those lanes.
        let sel_b: i32 = 1 << 2;
        let a = _mm_set_epi32(0, 1, 2, 3);
        let b = _mm_set1_epi32(100);
        let idx = _mm_set_epi32(1, sel_b, 2, sel_b);

        // Zero mask: the result must equal `idx`.
        let kept = _mm_mask2_permutex2var_epi32(a, idx, 0, b);
        assert_eq_m128i(kept, idx);

        // Low four mask bits set.
        let out = _mm_mask2_permutex2var_epi32(a, idx, 0b00001111, b);
        let expected = _mm_set_epi32(2, 100, 1, 100);
        assert_eq_m128i(out, expected);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_permutex2var_ps() {
        // Index value with bit 4 set; the expected vector holds `b`'s 100. in those lanes.
        let sel_b: i32 = 1 << 4;
        let a = _mm512_set_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let b = _mm512_set1_ps(100.);
        #[rustfmt::skip]
        let idx = _mm512_set_epi32(
            1, sel_b, 2, sel_b,
            3, sel_b, 4, sel_b,
            5, sel_b, 6, sel_b,
            7, sel_b, 8, sel_b,
        );
        let expected = _mm512_set_ps(
            14., 100., 13., 100., 12., 100., 11., 100., 10., 100., 9., 100., 8., 100., 7., 100.,
        );
        let out = _mm512_permutex2var_ps(a, idx, b);
        assert_eq_m512(out, expected);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_permutex2var_ps() {
        // Index value with bit 4 set; the expected vector holds `b`'s 100. in those lanes.
        let sel_b: i32 = 1 << 4;
        let a = _mm512_set_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let b = _mm512_set1_ps(100.);
        #[rustfmt::skip]
        let idx = _mm512_set_epi32(
            1, sel_b, 2, sel_b,
            3, sel_b, 4, sel_b,
            5, sel_b, 6, sel_b,
            7, sel_b, 8, sel_b,
        );

        // Zero writemask: the result must equal `a`.
        let kept = _mm512_mask_permutex2var_ps(a, 0, idx, b);
        assert_eq_m512(kept, a);

        // All sixteen mask bits set.
        let out = _mm512_mask_permutex2var_ps(a, 0b11111111_11111111, idx, b);
        let expected = _mm512_set_ps(
            14., 100., 13., 100., 12., 100., 11., 100., 10., 100., 9., 100., 8., 100., 7., 100.,
        );
        assert_eq_m512(out, expected);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_permutex2var_ps() {
        // Index value with bit 4 set; the expected vector holds `b`'s 100. in the active lanes.
        let sel_b: i32 = 1 << 4;
        let a = _mm512_set_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let b = _mm512_set1_ps(100.);
        #[rustfmt::skip]
        let idx = _mm512_set_epi32(
            1, sel_b, 2, sel_b,
            3, sel_b, 4, sel_b,
            5, sel_b, 6, sel_b,
            7, sel_b, 8, sel_b,
        );

        // Zero zeromask: the result must be all zeros.
        let zeroed = _mm512_maskz_permutex2var_ps(0, a, idx, b);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        // Only the low eight mask bits are set.
        let out = _mm512_maskz_permutex2var_ps(0b00000000_11111111, a, idx, b);
        let expected = _mm512_set_ps(
            0., 0., 0., 0., 0., 0., 0., 0., 10., 100., 9., 100., 8., 100., 7., 100.,
        );
        assert_eq_m512(out, expected);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask2_permutex2var_ps() {
        // Index value with bit 4 set; the expected vector holds `b`'s 100. in those lanes.
        let sel_b: i32 = 1 << 4;
        let a = _mm512_set_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let b = _mm512_set1_ps(100.);
        #[rustfmt::skip]
        let idx = _mm512_set_epi32(
            1, sel_b, 2, sel_b,
            3, sel_b, 4, sel_b,
            5, sel_b, 6, sel_b,
            7, sel_b, 8, sel_b,
        );

        // Zero mask: the result must equal `idx` reinterpreted as packed floats.
        let kept = _mm512_mask2_permutex2var_ps(a, idx, 0, b);
        assert_eq_m512(kept, _mm512_castsi512_ps(idx));

        // All sixteen mask bits set.
        let out = _mm512_mask2_permutex2var_ps(a, idx, 0b11111111_11111111, b);
        let expected = _mm512_set_ps(
            14., 100., 13., 100., 12., 100., 11., 100., 10., 100., 9., 100., 8., 100., 7., 100.,
        );
        assert_eq_m512(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_permutex2var_ps() {
        // Index value with bit 3 set; the expected vector holds `b`'s 100. in those lanes.
        let sel_b: i32 = 1 << 3;
        let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let b = _mm256_set1_ps(100.);
        let idx = _mm256_set_epi32(1, sel_b, 2, sel_b, 3, sel_b, 4, sel_b);
        let expected = _mm256_set_ps(6., 100., 5., 100., 4., 100., 3., 100.);
        let out = _mm256_permutex2var_ps(a, idx, b);
        assert_eq_m256(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_permutex2var_ps() {
        // Index value with bit 3 set; the expected vector holds `b`'s 100. in those lanes.
        let sel_b: i32 = 1 << 3;
        let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let b = _mm256_set1_ps(100.);
        let idx = _mm256_set_epi32(1, sel_b, 2, sel_b, 3, sel_b, 4, sel_b);

        // Zero writemask: the result must equal `a`.
        let kept = _mm256_mask_permutex2var_ps(a, 0, idx, b);
        assert_eq_m256(kept, a);

        // All eight mask bits set.
        let out = _mm256_mask_permutex2var_ps(a, 0b11111111, idx, b);
        let expected = _mm256_set_ps(6., 100., 5., 100., 4., 100., 3., 100.);
        assert_eq_m256(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_permutex2var_ps() {
        // Index value with bit 3 set; the expected vector holds `b`'s 100. in those lanes.
        let sel_b: i32 = 1 << 3;
        let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let b = _mm256_set1_ps(100.);
        let idx = _mm256_set_epi32(1, sel_b, 2, sel_b, 3, sel_b, 4, sel_b);

        // Zero zeromask: the result must be all zeros.
        let zeroed = _mm256_maskz_permutex2var_ps(0, a, idx, b);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        // All eight mask bits set.
        let out = _mm256_maskz_permutex2var_ps(0b11111111, a, idx, b);
        let expected = _mm256_set_ps(6., 100., 5., 100., 4., 100., 3., 100.);
        assert_eq_m256(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask2_permutex2var_ps() {
        // Index value with bit 3 set; the expected vector holds `b`'s 100. in those lanes.
        let sel_b: i32 = 1 << 3;
        let a = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let b = _mm256_set1_ps(100.);
        let idx = _mm256_set_epi32(1, sel_b, 2, sel_b, 3, sel_b, 4, sel_b);

        // Zero mask: the result must equal `idx` reinterpreted as packed floats.
        let kept = _mm256_mask2_permutex2var_ps(a, idx, 0, b);
        assert_eq_m256(kept, _mm256_castsi256_ps(idx));

        // All eight mask bits set.
        let out = _mm256_mask2_permutex2var_ps(a, idx, 0b11111111, b);
        let expected = _mm256_set_ps(6., 100., 5., 100., 4., 100., 3., 100.);
        assert_eq_m256(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_permutex2var_ps() {
        // Index value with bit 2 set; the expected vector holds `b`'s 100. in those lanes.
        let sel_b: i32 = 1 << 2;
        let a = _mm_set_ps(0., 1., 2., 3.);
        let b = _mm_set1_ps(100.);
        let idx = _mm_set_epi32(1, sel_b, 2, sel_b);
        let expected = _mm_set_ps(2., 100., 1., 100.);
        let out = _mm_permutex2var_ps(a, idx, b);
        assert_eq_m128(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_permutex2var_ps() {
        // Index value with bit 2 set; the expected vector holds `b`'s 100. in those lanes.
        let sel_b: i32 = 1 << 2;
        let a = _mm_set_ps(0., 1., 2., 3.);
        let b = _mm_set1_ps(100.);
        let idx = _mm_set_epi32(1, sel_b, 2, sel_b);

        // Zero writemask: the result must equal `a`.
        let kept = _mm_mask_permutex2var_ps(a, 0, idx, b);
        assert_eq_m128(kept, a);

        // Low four mask bits set.
        let out = _mm_mask_permutex2var_ps(a, 0b00001111, idx, b);
        let expected = _mm_set_ps(2., 100., 1., 100.);
        assert_eq_m128(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_permutex2var_ps() {
        // Index value with bit 2 set; the expected vector holds `b`'s 100. in those lanes.
        let sel_b: i32 = 1 << 2;
        let a = _mm_set_ps(0., 1., 2., 3.);
        let b = _mm_set1_ps(100.);
        let idx = _mm_set_epi32(1, sel_b, 2, sel_b);

        // Zero zeromask: the result must be all zeros.
        let zeroed = _mm_maskz_permutex2var_ps(0, a, idx, b);
        assert_eq_m128(zeroed, _mm_setzero_ps());

        // Low four mask bits set.
        let out = _mm_maskz_permutex2var_ps(0b00001111, a, idx, b);
        let expected = _mm_set_ps(2., 100., 1., 100.);
        assert_eq_m128(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask2_permutex2var_ps() {
        // Index value with bit 2 set; the expected vector holds `b`'s 100. in those lanes.
        let sel_b: i32 = 1 << 2;
        let a = _mm_set_ps(0., 1., 2., 3.);
        let b = _mm_set1_ps(100.);
        let idx = _mm_set_epi32(1, sel_b, 2, sel_b);

        // Zero mask: the result must equal `idx` reinterpreted as packed floats.
        let kept = _mm_mask2_permutex2var_ps(a, idx, 0, b);
        assert_eq_m128(kept, _mm_castsi128_ps(idx));

        // Low four mask bits set.
        let out = _mm_mask2_permutex2var_ps(a, idx, 0b00001111, b);
        let expected = _mm_set_ps(2., 100., 1., 100.);
        assert_eq_m128(out, expected);
    }

    // Shuffles with the `_MM_PERM_AADD` control and compares against a fixed vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_shuffle_epi32() {
        let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16);
        let expected = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 8, 8, 1, 1, 16, 16, 9, 9);
        let out = _mm512_shuffle_epi32::<_MM_PERM_AADD>(a);
        assert_eq_m512i(out, expected);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_shuffle_epi32() {
        let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16);

        // Zero writemask: the result must equal `src` (here `a`).
        let kept = _mm512_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0, a);
        assert_eq_m512i(kept, a);

        // All sixteen mask bits set.
        let out = _mm512_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0b11111111_11111111, a);
        let expected = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 8, 8, 1, 1, 16, 16, 9, 9);
        assert_eq_m512i(out, expected);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_shuffle_epi32() {
        let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16);

        // Zero zeromask: the result must be all zeros.
        let zeroed = _mm512_maskz_shuffle_epi32::<_MM_PERM_AADD>(0, a);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());

        // Only the low eight mask bits are set.
        let out = _mm512_maskz_shuffle_epi32::<_MM_PERM_AADD>(0b00000000_11111111, a);
        let expected = _mm512_setr_epi32(8, 8, 1, 1, 16, 16, 9, 9, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m512i(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_shuffle_epi32() {
        let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16);

        // Zero writemask: the result must equal `src` (here `a`).
        let kept = _mm256_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0, a);
        assert_eq_m256i(kept, a);

        // All eight mask bits set.
        let out = _mm256_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0b11111111, a);
        let expected = _mm256_set_epi32(8, 8, 1, 1, 16, 16, 9, 9);
        assert_eq_m256i(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_shuffle_epi32() {
        let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16);

        // Zero zeromask: the result must be all zeros.
        let zeroed = _mm256_maskz_shuffle_epi32::<_MM_PERM_AADD>(0, a);
        assert_eq_m256i(zeroed, _mm256_setzero_si256());

        // All eight mask bits set.
        let out = _mm256_maskz_shuffle_epi32::<_MM_PERM_AADD>(0b11111111, a);
        let expected = _mm256_set_epi32(8, 8, 1, 1, 16, 16, 9, 9);
        assert_eq_m256i(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_shuffle_epi32() {
        let a = _mm_set_epi32(1, 4, 5, 8);

        // Zero writemask: the result must equal `src` (here `a`).
        let kept = _mm_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0, a);
        assert_eq_m128i(kept, a);

        // Low four mask bits set.
        let out = _mm_mask_shuffle_epi32::<_MM_PERM_AADD>(a, 0b00001111, a);
        let expected = _mm_set_epi32(8, 8, 1, 1);
        assert_eq_m128i(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_shuffle_epi32() {
        let a = _mm_set_epi32(1, 4, 5, 8);

        // Zero zeromask: the result must be all zeros.
        let zeroed = _mm_maskz_shuffle_epi32::<_MM_PERM_AADD>(0, a);
        assert_eq_m128i(zeroed, _mm_setzero_si128());

        // Low four mask bits set.
        let out = _mm_maskz_shuffle_epi32::<_MM_PERM_AADD>(0b00001111, a);
        let expected = _mm_set_epi32(8, 8, 1, 1);
        assert_eq_m128i(out, expected);
    }

    // Shuffles `a` and `b` with control 0b00_00_11_11 and compares against a fixed vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_shuffle_ps() {
        let a = _mm512_setr_ps(
            1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16.,
        );
        let b = _mm512_setr_ps(
            2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15.,
        );
        let expected = _mm512_setr_ps(
            8., 8., 2., 2., 16., 16., 10., 10., 8., 8., 2., 2., 16., 16., 10., 10.,
        );
        let out = _mm512_shuffle_ps::<0b00_00_11_11>(a, b);
        assert_eq_m512(out, expected);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_shuffle_ps() {
        let a = _mm512_setr_ps(
            1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16.,
        );
        let b = _mm512_setr_ps(
            2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15.,
        );

        // Zero writemask: the result must equal `src` (here `a`).
        let kept = _mm512_mask_shuffle_ps::<0b00_00_11_11>(a, 0, a, b);
        assert_eq_m512(kept, a);

        // All sixteen mask bits set.
        let out = _mm512_mask_shuffle_ps::<0b00_00_11_11>(a, 0b11111111_11111111, a, b);
        let expected = _mm512_setr_ps(
            8., 8., 2., 2., 16., 16., 10., 10., 8., 8., 2., 2., 16., 16., 10., 10.,
        );
        assert_eq_m512(out, expected);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_shuffle_ps() {
        let a = _mm512_setr_ps(
            1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16.,
        );
        let b = _mm512_setr_ps(
            2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15.,
        );

        // Zero zeromask: the result must be all zeros.
        let zeroed = _mm512_maskz_shuffle_ps::<0b00_00_11_11>(0, a, b);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        // Only the low eight mask bits are set.
        let out = _mm512_maskz_shuffle_ps::<0b00_00_11_11>(0b00000000_11111111, a, b);
        let expected = _mm512_setr_ps(
            8., 8., 2., 2., 16., 16., 10., 10., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        assert_eq_m512(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_shuffle_ps() {
        let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.);
        let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.);

        // Zero writemask (control 0b11_11_11_11): the result must equal `src` (here `a`).
        let kept = _mm256_mask_shuffle_ps::<0b11_11_11_11>(a, 0, a, b);
        assert_eq_m256(kept, a);

        // All eight mask bits set, control 0b00_00_11_11.
        let out = _mm256_mask_shuffle_ps::<0b00_00_11_11>(a, 0b11111111, a, b);
        let expected = _mm256_set_ps(7., 7., 1., 1., 15., 15., 9., 9.);
        assert_eq_m256(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_shuffle_ps() {
        let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.);
        let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.);

        // Zero zeromask (control 0b11_11_11_11): the result must be all zeros.
        let zeroed = _mm256_maskz_shuffle_ps::<0b11_11_11_11>(0, a, b);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        // All eight mask bits set, control 0b00_00_11_11.
        let out = _mm256_maskz_shuffle_ps::<0b00_00_11_11>(0b11111111, a, b);
        let expected = _mm256_set_ps(7., 7., 1., 1., 15., 15., 9., 9.);
        assert_eq_m256(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_shuffle_ps() {
        let a = _mm_set_ps(1., 4., 5., 8.);
        let b = _mm_set_ps(2., 3., 6., 7.);

        // Zero writemask (control 0b11_11_11_11): the result must equal `src` (here `a`).
        let kept = _mm_mask_shuffle_ps::<0b11_11_11_11>(a, 0, a, b);
        assert_eq_m128(kept, a);

        // Low four mask bits set, control 0b00_00_11_11.
        let out = _mm_mask_shuffle_ps::<0b00_00_11_11>(a, 0b00001111, a, b);
        let expected = _mm_set_ps(7., 7., 1., 1.);
        assert_eq_m128(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_shuffle_ps() {
        let a = _mm_set_ps(1., 4., 5., 8.);
        let b = _mm_set_ps(2., 3., 6., 7.);

        // Zero zeromask (control 0b11_11_11_11): the result must be all zeros.
        let zeroed = _mm_maskz_shuffle_ps::<0b11_11_11_11>(0, a, b);
        assert_eq_m128(zeroed, _mm_setzero_ps());

        // Low four mask bits set, control 0b00_00_11_11.
        let out = _mm_maskz_shuffle_ps::<0b00_00_11_11>(0b00001111, a, b);
        let expected = _mm_set_ps(7., 7., 1., 1.);
        assert_eq_m128(out, expected);
    }

    // Shuffles `a` and `b` with an all-zero control and compares against a fixed vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_shuffle_i32x4() {
        let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16);
        let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15);
        let expected = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 2, 3, 6, 7, 2, 3, 6, 7);
        let out = _mm512_shuffle_i32x4::<0b00_00_00_00>(a, b);
        assert_eq_m512i(out, expected);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_shuffle_i32x4() {
        let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16);
        let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15);

        // Zero writemask: the result must equal `src` (here `a`).
        let kept = _mm512_mask_shuffle_i32x4::<0b00_00_00_00>(a, 0, a, b);
        assert_eq_m512i(kept, a);

        // All sixteen mask bits set.
        let out = _mm512_mask_shuffle_i32x4::<0b00_00_00_00>(a, 0b11111111_11111111, a, b);
        let expected = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 2, 3, 6, 7, 2, 3, 6, 7);
        assert_eq_m512i(out, expected);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_shuffle_i32x4() {
        let a = _mm512_setr_epi32(1, 4, 5, 8, 9, 12, 13, 16, 1, 4, 5, 8, 9, 12, 13, 16);
        let b = _mm512_setr_epi32(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15);

        // Zero zeromask: the result must be all zeros.
        let zeroed = _mm512_maskz_shuffle_i32x4::<0b00_00_00_00>(0, a, b);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());

        // Only the low eight mask bits are set.
        let out = _mm512_maskz_shuffle_i32x4::<0b00_00_00_00>(0b00000000_11111111, a, b);
        let expected = _mm512_setr_epi32(1, 4, 5, 8, 1, 4, 5, 8, 0, 0, 0, 0, 0, 0, 0, 0);
        assert_eq_m512i(out, expected);
    }

    // Shuffles `a` and `b` with control 0b00 and compares against a fixed vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_shuffle_i32x4() {
        let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16);
        let b = _mm256_set_epi32(2, 3, 6, 7, 10, 11, 14, 15);
        let expected = _mm256_set_epi32(10, 11, 14, 15, 9, 12, 13, 16);
        let out = _mm256_shuffle_i32x4::<0b00>(a, b);
        assert_eq_m256i(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_shuffle_i32x4() {
        let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16);
        let b = _mm256_set_epi32(2, 3, 6, 7, 10, 11, 14, 15);

        // Zero writemask: the result must equal `src` (here `a`).
        let kept = _mm256_mask_shuffle_i32x4::<0b00>(a, 0, a, b);
        assert_eq_m256i(kept, a);

        // All eight mask bits set.
        let out = _mm256_mask_shuffle_i32x4::<0b00>(a, 0b11111111, a, b);
        let expected = _mm256_set_epi32(10, 11, 14, 15, 9, 12, 13, 16);
        assert_eq_m256i(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_shuffle_i32x4() {
        let a = _mm256_set_epi32(1, 4, 5, 8, 9, 12, 13, 16);
        let b = _mm256_set_epi32(2, 3, 6, 7, 10, 11, 14, 15);

        // Zero zeromask: the result must be all zeros.
        let zeroed = _mm256_maskz_shuffle_i32x4::<0b00>(0, a, b);
        assert_eq_m256i(zeroed, _mm256_setzero_si256());

        // All eight mask bits set.
        let out = _mm256_maskz_shuffle_i32x4::<0b00>(0b11111111, a, b);
        let expected = _mm256_set_epi32(10, 11, 14, 15, 9, 12, 13, 16);
        assert_eq_m256i(out, expected);
    }

    // Shuffles `a` and `b` with an all-zero control and compares against a fixed vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_shuffle_f32x4() {
        let a = _mm512_setr_ps(
            1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16.,
        );
        let b = _mm512_setr_ps(
            2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15.,
        );
        let expected = _mm512_setr_ps(
            1., 4., 5., 8., 1., 4., 5., 8., 2., 3., 6., 7., 2., 3., 6., 7.,
        );
        let out = _mm512_shuffle_f32x4::<0b00_00_00_00>(a, b);
        assert_eq_m512(out, expected);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_shuffle_f32x4() {
        let a = _mm512_setr_ps(
            1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16.,
        );
        let b = _mm512_setr_ps(
            2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15.,
        );

        // Zero writemask: the result must equal `src` (here `a`).
        let kept = _mm512_mask_shuffle_f32x4::<0b00_00_00_00>(a, 0, a, b);
        assert_eq_m512(kept, a);

        // All sixteen mask bits set.
        let out = _mm512_mask_shuffle_f32x4::<0b00_00_00_00>(a, 0b11111111_11111111, a, b);
        let expected = _mm512_setr_ps(
            1., 4., 5., 8., 1., 4., 5., 8., 2., 3., 6., 7., 2., 3., 6., 7.,
        );
        assert_eq_m512(out, expected);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_shuffle_f32x4() {
        let a = _mm512_setr_ps(
            1., 4., 5., 8., 9., 12., 13., 16., 1., 4., 5., 8., 9., 12., 13., 16.,
        );
        let b = _mm512_setr_ps(
            2., 3., 6., 7., 10., 11., 14., 15., 2., 3., 6., 7., 10., 11., 14., 15.,
        );

        // Zero zeromask: the result must be all zeros.
        let zeroed = _mm512_maskz_shuffle_f32x4::<0b00_00_00_00>(0, a, b);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        // Only the low eight mask bits are set.
        let out = _mm512_maskz_shuffle_f32x4::<0b00_00_00_00>(0b00000000_11111111, a, b);
        let expected = _mm512_setr_ps(
            1., 4., 5., 8., 1., 4., 5., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        assert_eq_m512(out, expected);
    }

    // Shuffles `a` and `b` with control 0b00 and compares against a fixed vector.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_shuffle_f32x4() {
        let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.);
        let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.);
        let expected = _mm256_set_ps(10., 11., 14., 15., 9., 12., 13., 16.);
        let out = _mm256_shuffle_f32x4::<0b00>(a, b);
        assert_eq_m256(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_shuffle_f32x4() {
        let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.);
        let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.);

        // Zero writemask: the result must equal `src` (here `a`).
        let kept = _mm256_mask_shuffle_f32x4::<0b00>(a, 0, a, b);
        assert_eq_m256(kept, a);

        // All eight mask bits set.
        let out = _mm256_mask_shuffle_f32x4::<0b00>(a, 0b11111111, a, b);
        let expected = _mm256_set_ps(10., 11., 14., 15., 9., 12., 13., 16.);
        assert_eq_m256(out, expected);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_shuffle_f32x4() {
        let a = _mm256_set_ps(1., 4., 5., 8., 9., 12., 13., 16.);
        let b = _mm256_set_ps(2., 3., 6., 7., 10., 11., 14., 15.);

        // Zero zeromask: the result must be all zeros.
        let zeroed = _mm256_maskz_shuffle_f32x4::<0b00>(0, a, b);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        // All eight mask bits set.
        let out = _mm256_maskz_shuffle_f32x4::<0b00>(0b11111111, a, b);
        let expected = _mm256_set_ps(10., 11., 14., 15., 9., 12., 13., 16.);
        assert_eq_m256(out, expected);
    }

    // Extracting with immediate 1 is expected to yield (5., 6., 7., 8.).
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_extractf32x4_ps() {
        let a = _mm512_setr_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let extracted = _mm512_extractf32x4_ps::<1>(a);
        assert_eq_m128(extracted, _mm_setr_ps(5., 6., 7., 8.));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_extractf32x4_ps() {
        let src = _mm_set1_ps(100.);
        let a = _mm512_setr_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );

        // Zero writemask: the result must equal `src`.
        let kept = _mm512_mask_extractf32x4_ps::<1>(src, 0, a);
        assert_eq_m128(kept, src);

        // Mask 0b11111111: the extracted values are expected in every lane.
        let extracted = _mm512_mask_extractf32x4_ps::<1>(src, 0b11111111, a);
        assert_eq_m128(extracted, _mm_setr_ps(5., 6., 7., 8.));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_extractf32x4_ps() {
        let a = _mm512_setr_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );

        // Zero zeromask: the result must be all zeros.
        let zeroed = _mm512_maskz_extractf32x4_ps::<1>(0, a);
        assert_eq_m128(zeroed, _mm_setzero_ps());

        // Only mask bit 0 set: just the first lane (5.) is expected to survive.
        let extracted = _mm512_maskz_extractf32x4_ps::<1>(0b00000001, a);
        assert_eq_m128(extracted, _mm_setr_ps(5., 0., 0., 0.));
    }

    // Extracting with immediate 1 is expected to yield `_mm_set_ps(1., 2., 3., 4.)`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_extractf32x4_ps() {
        let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let extracted = _mm256_extractf32x4_ps::<1>(a);
        assert_eq_m128(extracted, _mm_set_ps(1., 2., 3., 4.));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_extractf32x4_ps() {
        let src = _mm_set1_ps(100.);
        let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);

        // Zero writemask: the result must equal `src`.
        let kept = _mm256_mask_extractf32x4_ps::<1>(src, 0, a);
        assert_eq_m128(kept, src);

        // Low four mask bits set.
        let extracted = _mm256_mask_extractf32x4_ps::<1>(src, 0b00001111, a);
        assert_eq_m128(extracted, _mm_set_ps(1., 2., 3., 4.));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_extractf32x4_ps() {
        let a = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);

        // Zero zeromask: the result must be all zeros.
        let zeroed = _mm256_maskz_extractf32x4_ps::<1>(0, a);
        assert_eq_m128(zeroed, _mm_setzero_ps());

        // Low four mask bits set.
        let extracted = _mm256_maskz_extractf32x4_ps::<1>(0b00001111, a);
        assert_eq_m128(extracted, _mm_set_ps(1., 2., 3., 4.));
    }

    // Extracting with immediate 1 is expected to yield (5, 6, 7, 8).
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_extracti32x4_epi32() {
        let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let extracted = _mm512_extracti32x4_epi32::<1>(a);
        assert_eq_m128i(extracted, _mm_setr_epi32(5, 6, 7, 8));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_extracti32x4_epi32() {
        let src = _mm_set1_epi32(100);
        let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);

        // Zero writemask: the result must equal `src`.
        let kept = _mm512_mask_extracti32x4_epi32::<1>(src, 0, a);
        assert_eq_m128i(kept, src);

        // Mask 0b11111111: the extracted values are expected in every lane.
        let extracted = _mm512_mask_extracti32x4_epi32::<1>(src, 0b11111111, a);
        assert_eq_m128i(extracted, _mm_setr_epi32(5, 6, 7, 8));
    }

    // Checks `_mm512_maskz_extracti32x4_epi32::<1>`: a zero mask must give an all-zero
    // vector, and mask 0b00000001 is expected to give (5, 0, 0, 0).
    //
    // Gated on `avx512f` only, consistent with the sibling `_mm512_extracti32x4_epi32`,
    // `_mm512_mask_extracti32x4_epi32` and `_mm512_maskz_extractf32x4_ps` tests, so the
    // test is not skipped on CPUs without AVX-512VL.
    // NOTE(review): assumes the intrinsic itself is declared with `avx512f` only — confirm.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_extracti32x4_epi32() {
        let a = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);

        // Zero zeromask: the result must be all zeros.
        let r = _mm512_maskz_extracti32x4_epi32::<1>(0, a);
        assert_eq_m128i(r, _mm_setzero_si128());

        // Only mask bit 0 set: just the first lane (5) is expected to survive.
        let r = _mm512_maskz_extracti32x4_epi32::<1>(0b00000001, a);
        let e = _mm_setr_epi32(5, 0, 0, 0);
        assert_eq_m128i(r, e);
    }

    // Extracting with immediate 1 is expected to yield `_mm_set_epi32(1, 2, 3, 4)`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_extracti32x4_epi32() {
        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let extracted = _mm256_extracti32x4_epi32::<1>(a);
        assert_eq_m128i(extracted, _mm_set_epi32(1, 2, 3, 4));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_extracti32x4_epi32() {
        let src = _mm_set1_epi32(100);
        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);

        // Zero writemask: the result must equal `src`.
        let kept = _mm256_mask_extracti32x4_epi32::<1>(src, 0, a);
        assert_eq_m128i(kept, src);

        // Low four mask bits set.
        let extracted = _mm256_mask_extracti32x4_epi32::<1>(src, 0b00001111, a);
        assert_eq_m128i(extracted, _mm_set_epi32(1, 2, 3, 4));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_extracti32x4_epi32() {
        let a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);

        // Zero zeromask: the result must be all zeros.
        let zeroed = _mm256_maskz_extracti32x4_epi32::<1>(0, a);
        assert_eq_m128i(zeroed, _mm_setzero_si128());

        // Low four mask bits set.
        let extracted = _mm256_maskz_extracti32x4_epi32::<1>(0b00001111, a);
        assert_eq_m128i(extracted, _mm_set_epi32(1, 2, 3, 4));
    }

    // The expected vector repeats each odd-valued input (1., 3., 5., ...) twice.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_moveldup_ps() {
        let a = _mm512_setr_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let expected = _mm512_setr_ps(
            1., 1., 3., 3., 5., 5., 7., 7., 9., 9., 11., 11., 13., 13., 15., 15.,
        );
        let out = _mm512_moveldup_ps(a);
        assert_eq_m512(out, expected);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_moveldup_ps() {
        let a = _mm512_setr_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );

        // Zero writemask: the result must equal `src` (here `a`).
        let kept = _mm512_mask_moveldup_ps(a, 0, a);
        assert_eq_m512(kept, a);

        // All sixteen mask bits set.
        let out = _mm512_mask_moveldup_ps(a, 0b11111111_11111111, a);
        let expected = _mm512_setr_ps(
            1., 1., 3., 3., 5., 5., 7., 7., 9., 9., 11., 11., 13., 13., 15., 15.,
        );
        assert_eq_m512(out, expected);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_moveldup_ps() {
        let input = _mm512_setr_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );

        // Zero mask: an all-zero vector is expected.
        let zeroed = _mm512_maskz_moveldup_ps(0, input);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        // Only the low eight mask bits set: the last eight `setr` values of the
        // expected vector are zero.
        let want = _mm512_setr_ps(
            1., 1., 3., 3., 5., 5., 7., 7., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        let got = _mm512_maskz_moveldup_ps(0b00000000_11111111, input);
        assert_eq_m512(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_moveldup_ps() {
        let input = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);

        // Zero mask: the result is expected to equal the first argument.
        let kept = _mm256_mask_moveldup_ps(input, 0, input);
        assert_eq_m256(kept, input);

        // All eight mask bits set.
        let want = _mm256_set_ps(2., 2., 4., 4., 6., 6., 8., 8.);
        let got = _mm256_mask_moveldup_ps(input, 0b11111111, input);
        assert_eq_m256(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_moveldup_ps() {
        let input = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);

        // Zero mask: an all-zero vector is expected.
        let zeroed = _mm256_maskz_moveldup_ps(0, input);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        // All eight mask bits set.
        let want = _mm256_set_ps(2., 2., 4., 4., 6., 6., 8., 8.);
        let got = _mm256_maskz_moveldup_ps(0b11111111, input);
        assert_eq_m256(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_moveldup_ps() {
        let input = _mm_set_ps(1., 2., 3., 4.);

        // Zero mask: the result is expected to equal the first argument.
        let kept = _mm_mask_moveldup_ps(input, 0, input);
        assert_eq_m128(kept, input);

        // Low four mask bits set.
        let want = _mm_set_ps(2., 2., 4., 4.);
        let got = _mm_mask_moveldup_ps(input, 0b00001111, input);
        assert_eq_m128(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_moveldup_ps() {
        let input = _mm_set_ps(1., 2., 3., 4.);

        // Zero mask: an all-zero vector is expected.
        let zeroed = _mm_maskz_moveldup_ps(0, input);
        assert_eq_m128(zeroed, _mm_setzero_ps());

        // Low four mask bits set.
        let want = _mm_set_ps(2., 2., 4., 4.);
        let got = _mm_maskz_moveldup_ps(0b00001111, input);
        assert_eq_m128(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_movehdup_ps() {
        // In `setr` order the expected vector repeats the 2nd, 4th, 6th, ...
        // input values pairwise.
        let input = _mm512_setr_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let want = _mm512_setr_ps(
            2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14., 16., 16.,
        );
        let got = _mm512_movehdup_ps(input);
        assert_eq_m512(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_movehdup_ps() {
        let input = _mm512_setr_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );

        // Zero mask, with `input` also passed as the first argument: the result
        // is expected to equal `input`.
        let kept = _mm512_mask_movehdup_ps(input, 0, input);
        assert_eq_m512(kept, input);

        // All sixteen mask bits set.
        let want = _mm512_setr_ps(
            2., 2., 4., 4., 6., 6., 8., 8., 10., 10., 12., 12., 14., 14., 16., 16.,
        );
        let got = _mm512_mask_movehdup_ps(input, 0b11111111_11111111, input);
        assert_eq_m512(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_movehdup_ps() {
        let input = _mm512_setr_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );

        // Zero mask: an all-zero vector is expected.
        let zeroed = _mm512_maskz_movehdup_ps(0, input);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        // Only the low eight mask bits set: the last eight `setr` values of the
        // expected vector are zero.
        let want = _mm512_setr_ps(
            2., 2., 4., 4., 6., 6., 8., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        let got = _mm512_maskz_movehdup_ps(0b00000000_11111111, input);
        assert_eq_m512(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_movehdup_ps() {
        let input = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);

        // Zero mask: the result is expected to equal the first argument.
        let kept = _mm256_mask_movehdup_ps(input, 0, input);
        assert_eq_m256(kept, input);

        // All eight mask bits set.
        let want = _mm256_set_ps(1., 1., 3., 3., 5., 5., 7., 7.);
        let got = _mm256_mask_movehdup_ps(input, 0b11111111, input);
        assert_eq_m256(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_movehdup_ps() {
        let input = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);

        // Zero mask: an all-zero vector is expected.
        let zeroed = _mm256_maskz_movehdup_ps(0, input);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        // All eight mask bits set.
        let want = _mm256_set_ps(1., 1., 3., 3., 5., 5., 7., 7.);
        let got = _mm256_maskz_movehdup_ps(0b11111111, input);
        assert_eq_m256(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_movehdup_ps() {
        let input = _mm_set_ps(1., 2., 3., 4.);

        // Zero mask: the result is expected to equal the first argument.
        let kept = _mm_mask_movehdup_ps(input, 0, input);
        assert_eq_m128(kept, input);

        // Low four mask bits set.
        let want = _mm_set_ps(1., 1., 3., 3.);
        let got = _mm_mask_movehdup_ps(input, 0b00001111, input);
        assert_eq_m128(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_movehdup_ps() {
        let input = _mm_set_ps(1., 2., 3., 4.);

        // Zero mask: an all-zero vector is expected.
        let zeroed = _mm_maskz_movehdup_ps(0, input);
        assert_eq_m128(zeroed, _mm_setzero_ps());

        // Low four mask bits set.
        let want = _mm_set_ps(1., 1., 3., 3.);
        let got = _mm_maskz_movehdup_ps(0b00001111, input);
        assert_eq_m128(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_inserti32x4() {
        // With const argument 0 the first four `setr` values of `base` are
        // expected to be replaced by `patch`.
        let base = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let patch = _mm_setr_epi32(17, 18, 19, 20);
        let want = _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let got = _mm512_inserti32x4::<0>(base, patch);
        assert_eq_m512i(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_inserti32x4() {
        let base = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let patch = _mm_setr_epi32(17, 18, 19, 20);

        // Zero mask, with `base` also passed as the first argument: the result
        // is expected to equal `base`.
        let kept = _mm512_mask_inserti32x4::<0>(base, 0, base, patch);
        assert_eq_m512i(kept, base);

        // All sixteen mask bits set.
        let want = _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let got = _mm512_mask_inserti32x4::<0>(base, 0b11111111_11111111, base, patch);
        assert_eq_m512i(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_inserti32x4() {
        let base = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let patch = _mm_setr_epi32(17, 18, 19, 20);

        // Zero mask: an all-zero vector is expected.
        let zeroed = _mm512_maskz_inserti32x4::<0>(0, base, patch);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());

        // Only the low eight mask bits set: the last eight `setr` values of the
        // expected vector are zero.
        let want = _mm512_setr_epi32(17, 18, 19, 20, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0);
        let got = _mm512_maskz_inserti32x4::<0>(0b00000000_11111111, base, patch);
        assert_eq_m512i(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_inserti32x4() {
        // With const argument 1 the `1, 2, 3, 4` part of `base` (as written in
        // `set` order) is expected to be replaced by `patch`.
        let base = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let patch = _mm_set_epi32(17, 18, 19, 20);
        let want = _mm256_set_epi32(17, 18, 19, 20, 5, 6, 7, 8);
        let got = _mm256_inserti32x4::<1>(base, patch);
        assert_eq_m256i(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_inserti32x4() {
        let base = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let patch = _mm_set_epi32(17, 18, 19, 20);

        // Zero mask (const argument 0 here): the result is expected to equal the
        // first argument.
        let kept = _mm256_mask_inserti32x4::<0>(base, 0, base, patch);
        assert_eq_m256i(kept, base);

        // All eight mask bits set, const argument 1.
        let want = _mm256_set_epi32(17, 18, 19, 20, 5, 6, 7, 8);
        let got = _mm256_mask_inserti32x4::<1>(base, 0b11111111, base, patch);
        assert_eq_m256i(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_inserti32x4() {
        let base = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let patch = _mm_set_epi32(17, 18, 19, 20);

        // Zero mask (const argument 0 here): an all-zero vector is expected.
        let zeroed = _mm256_maskz_inserti32x4::<0>(0, base, patch);
        assert_eq_m256i(zeroed, _mm256_setzero_si256());

        // All eight mask bits set, const argument 1.
        let want = _mm256_set_epi32(17, 18, 19, 20, 5, 6, 7, 8);
        let got = _mm256_maskz_inserti32x4::<1>(0b11111111, base, patch);
        assert_eq_m256i(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_insertf32x4() {
        // With const argument 0 the first four `setr` values of `base` are
        // expected to be replaced by `patch`.
        let base = _mm512_setr_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let patch = _mm_setr_ps(17., 18., 19., 20.);
        let want = _mm512_setr_ps(
            17., 18., 19., 20., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let got = _mm512_insertf32x4::<0>(base, patch);
        assert_eq_m512(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_insertf32x4() {
        let base = _mm512_setr_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let patch = _mm_setr_ps(17., 18., 19., 20.);

        // Zero mask, with `base` also passed as the first argument: the result
        // is expected to equal `base`.
        let kept = _mm512_mask_insertf32x4::<0>(base, 0, base, patch);
        assert_eq_m512(kept, base);

        // All sixteen mask bits set.
        let want = _mm512_setr_ps(
            17., 18., 19., 20., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let got = _mm512_mask_insertf32x4::<0>(base, 0b11111111_11111111, base, patch);
        assert_eq_m512(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_insertf32x4() {
        let base = _mm512_setr_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let patch = _mm_setr_ps(17., 18., 19., 20.);

        // Zero mask: an all-zero vector is expected.
        let zeroed = _mm512_maskz_insertf32x4::<0>(0, base, patch);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        // Only the low eight mask bits set: the last eight `setr` values of the
        // expected vector are zero.
        let want = _mm512_setr_ps(
            17., 18., 19., 20., 5., 6., 7., 8., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        let got = _mm512_maskz_insertf32x4::<0>(0b00000000_11111111, base, patch);
        assert_eq_m512(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_insertf32x4() {
        // With const argument 1 the `1., 2., 3., 4.` part of `base` (as written
        // in `set` order) is expected to be replaced by `patch`.
        let base = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let patch = _mm_set_ps(17., 18., 19., 20.);
        let want = _mm256_set_ps(17., 18., 19., 20., 5., 6., 7., 8.);
        let got = _mm256_insertf32x4::<1>(base, patch);
        assert_eq_m256(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_insertf32x4() {
        let base = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let patch = _mm_set_ps(17., 18., 19., 20.);

        // Zero mask (const argument 0 here): the result is expected to equal the
        // first argument.
        let kept = _mm256_mask_insertf32x4::<0>(base, 0, base, patch);
        assert_eq_m256(kept, base);

        // All eight mask bits set, const argument 1.
        let want = _mm256_set_ps(17., 18., 19., 20., 5., 6., 7., 8.);
        let got = _mm256_mask_insertf32x4::<1>(base, 0b11111111, base, patch);
        assert_eq_m256(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_insertf32x4() {
        let base = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let patch = _mm_set_ps(17., 18., 19., 20.);

        // Zero mask (const argument 0 here): an all-zero vector is expected.
        let zeroed = _mm256_maskz_insertf32x4::<0>(0, base, patch);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        // All eight mask bits set, const argument 1.
        let want = _mm256_set_ps(17., 18., 19., 20., 5., 6., 7., 8.);
        let got = _mm256_maskz_insertf32x4::<1>(0b11111111, base, patch);
        assert_eq_m256(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_castps128_ps512() {
        // The expected vector starts with the four input values (in `setr`
        // order) and pins the remaining twelve values to -1.
        // NOTE(review): the -1 fill presumably mirrors how this crate implements
        // the cast rather than a documented guarantee -- confirm.
        let narrow = _mm_setr_ps(17., 18., 19., 20.);
        let want = _mm512_setr_ps(
            17., 18., 19., 20., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        );
        let widened = _mm512_castps128_ps512(narrow);
        assert_eq_m512(widened, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_castps256_ps512() {
        // The expected vector starts with the eight input values (in `setr`
        // order) and pins the remaining eight values to -1.
        // NOTE(review): the -1 fill presumably mirrors how this crate implements
        // the cast rather than a documented guarantee -- confirm.
        let narrow = _mm256_setr_ps(17., 18., 19., 20., 21., 22., 23., 24.);
        let want = _mm512_setr_ps(
            17., 18., 19., 20., 21., 22., 23., 24., -1., -1., -1., -1., -1., -1., -1., -1.,
        );
        let widened = _mm512_castps256_ps512(narrow);
        assert_eq_m512(widened, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_zextps128_ps512() {
        // The expected vector starts with the four input values (in `setr`
        // order) followed by twelve zeros.
        let narrow = _mm_setr_ps(17., 18., 19., 20.);
        let want = _mm512_setr_ps(
            17., 18., 19., 20., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        let widened = _mm512_zextps128_ps512(narrow);
        assert_eq_m512(widened, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_zextps256_ps512() {
        // The expected vector starts with the eight input values (in `setr`
        // order) followed by eight zeros.
        let narrow = _mm256_setr_ps(17., 18., 19., 20., 21., 22., 23., 24.);
        let want = _mm512_setr_ps(
            17., 18., 19., 20., 21., 22., 23., 24., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        let widened = _mm512_zextps256_ps512(narrow);
        assert_eq_m512(widened, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_castps512_ps128() {
        // Only the first four `setr` values of the wide input are expected to
        // survive the narrowing cast.
        let wide = _mm512_setr_ps(
            17., 18., 19., 20., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        );
        let want = _mm_setr_ps(17., 18., 19., 20.);
        let narrowed = _mm512_castps512_ps128(wide);
        assert_eq_m128(narrowed, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_castps512_ps256() {
        // Only the first eight `setr` values of the wide input are expected to
        // survive the narrowing cast.
        let wide = _mm512_setr_ps(
            17., 18., 19., 20., 21., 22., 23., 24., -1., -1., -1., -1., -1., -1., -1., -1.,
        );
        let want = _mm256_setr_ps(17., 18., 19., 20., 21., 22., 23., 24.);
        let narrowed = _mm512_castps512_ps256(wide);
        assert_eq_m256(narrowed, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_castps_pd() {
        // 0.007812501848093234 is the f64 whose bit pattern is two adjacent
        // 1.0f32 values (0x3F80_0000_3F80_0000).
        let singles = _mm512_set1_ps(1.);
        let want = _mm512_set1_pd(0.007812501848093234);
        let doubles = _mm512_castps_pd(singles);
        assert_eq_m512d(doubles, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_castps_si512() {
        // 1065353216 == 0x3F80_0000, the IEEE-754 bit pattern of 1.0f32.
        let singles = _mm512_set1_ps(1.);
        let want = _mm512_set1_epi32(1065353216);
        let ints = _mm512_castps_si512(singles);
        assert_eq_m512i(ints, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_broadcastd_epi32() {
        // The value written last in `_mm_set_epi32` (20) is the one expected in
        // every lane of the result.
        let source = _mm_set_epi32(17, 18, 19, 20);
        let want = _mm512_set1_epi32(20);
        let got = _mm512_broadcastd_epi32(source);
        assert_eq_m512i(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_broadcastd_epi32() {
        // `src` deliberately differs from the broadcast value (20). With
        // `src == 20` both assertions below would still pass if the writemask
        // were ignored, so the test could not tell the two cases apart.
        let src = _mm512_set1_epi32(100);
        let a = _mm_set_epi32(17, 18, 19, 20);
        // Zero mask: the result is expected to be `src` unchanged.
        let r = _mm512_mask_broadcastd_epi32(src, 0, a);
        assert_eq_m512i(r, src);
        // All sixteen mask bits set: every lane is expected to hold 20.
        let r = _mm512_mask_broadcastd_epi32(src, 0b11111111_11111111, a);
        let e = _mm512_set1_epi32(20);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_broadcastd_epi32() {
        let source = _mm_set_epi32(17, 18, 19, 20);

        // Zero mask: an all-zero vector is expected.
        let zeroed = _mm512_maskz_broadcastd_epi32(0, source);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());

        // Only the low eight mask bits set: the last eight `setr` values of the
        // expected vector are zero.
        let want = _mm512_setr_epi32(20, 20, 20, 20, 20, 20, 20, 20, 0, 0, 0, 0, 0, 0, 0, 0);
        let got = _mm512_maskz_broadcastd_epi32(0b00000000_11111111, source);
        assert_eq_m512i(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_broadcastd_epi32() {
        // `src` deliberately differs from the broadcast value (20). With
        // `src == 20` both assertions below would still pass if the writemask
        // were ignored, so the test could not tell the two cases apart.
        let src = _mm256_set1_epi32(100);
        let a = _mm_set_epi32(17, 18, 19, 20);
        // Zero mask: the result is expected to be `src` unchanged.
        let r = _mm256_mask_broadcastd_epi32(src, 0, a);
        assert_eq_m256i(r, src);
        // All eight mask bits set: every lane is expected to hold 20.
        let r = _mm256_mask_broadcastd_epi32(src, 0b11111111, a);
        let e = _mm256_set1_epi32(20);
        assert_eq_m256i(r, e);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_broadcastd_epi32() {
        let source = _mm_set_epi32(17, 18, 19, 20);

        // Zero mask: an all-zero vector is expected.
        let zeroed = _mm256_maskz_broadcastd_epi32(0, source);
        assert_eq_m256i(zeroed, _mm256_setzero_si256());

        // All eight mask bits set: every lane is expected to hold 20.
        let want = _mm256_set1_epi32(20);
        let got = _mm256_maskz_broadcastd_epi32(0b11111111, source);
        assert_eq_m256i(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_broadcastd_epi32() {
        // `src` deliberately differs from the broadcast value (20). With
        // `src == 20` both assertions below would still pass if the writemask
        // were ignored, so the test could not tell the two cases apart.
        let src = _mm_set1_epi32(100);
        let a = _mm_set_epi32(17, 18, 19, 20);
        // Zero mask: the result is expected to be `src` unchanged.
        let r = _mm_mask_broadcastd_epi32(src, 0, a);
        assert_eq_m128i(r, src);
        // Low four mask bits set: every lane is expected to hold 20.
        let r = _mm_mask_broadcastd_epi32(src, 0b00001111, a);
        let e = _mm_set1_epi32(20);
        assert_eq_m128i(r, e);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_broadcastd_epi32() {
        let source = _mm_set_epi32(17, 18, 19, 20);

        // Zero mask: an all-zero vector is expected.
        let zeroed = _mm_maskz_broadcastd_epi32(0, source);
        assert_eq_m128i(zeroed, _mm_setzero_si128());

        // Low four mask bits set: every lane is expected to hold 20.
        let want = _mm_set1_epi32(20);
        let got = _mm_maskz_broadcastd_epi32(0b00001111, source);
        assert_eq_m128i(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_broadcastss_ps() {
        // The value written last in `_mm_set_ps` (20.) is the one expected in
        // every lane of the result.
        let source = _mm_set_ps(17., 18., 19., 20.);
        let want = _mm512_set1_ps(20.);
        let got = _mm512_broadcastss_ps(source);
        assert_eq_m512(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_broadcastss_ps() {
        // `src` deliberately differs from the broadcast value (20.). With
        // `src == 20.` both assertions below would still pass if the writemask
        // were ignored, so the test could not tell the two cases apart.
        let src = _mm512_set1_ps(100.);
        let a = _mm_set_ps(17., 18., 19., 20.);
        // Zero mask: the result is expected to be `src` unchanged.
        let r = _mm512_mask_broadcastss_ps(src, 0, a);
        assert_eq_m512(r, src);
        // All sixteen mask bits set: every lane is expected to hold 20.
        let r = _mm512_mask_broadcastss_ps(src, 0b11111111_11111111, a);
        let e = _mm512_set1_ps(20.);
        assert_eq_m512(r, e);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_broadcastss_ps() {
        let source = _mm_set_ps(17., 18., 19., 20.);

        // Zero mask: an all-zero vector is expected.
        let zeroed = _mm512_maskz_broadcastss_ps(0, source);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        // Only the low eight mask bits set: the last eight `setr` values of the
        // expected vector are zero.
        let want = _mm512_setr_ps(
            20., 20., 20., 20., 20., 20., 20., 20., 0., 0., 0., 0., 0., 0., 0., 0.,
        );
        let got = _mm512_maskz_broadcastss_ps(0b00000000_11111111, source);
        assert_eq_m512(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_broadcastss_ps() {
        // `src` deliberately differs from the broadcast value (20.). With
        // `src == 20.` both assertions below would still pass if the writemask
        // were ignored, so the test could not tell the two cases apart.
        let src = _mm256_set1_ps(100.);
        let a = _mm_set_ps(17., 18., 19., 20.);
        // Zero mask: the result is expected to be `src` unchanged.
        let r = _mm256_mask_broadcastss_ps(src, 0, a);
        assert_eq_m256(r, src);
        // All eight mask bits set: every lane is expected to hold 20.
        let r = _mm256_mask_broadcastss_ps(src, 0b11111111, a);
        let e = _mm256_set1_ps(20.);
        assert_eq_m256(r, e);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_broadcastss_ps() {
        let source = _mm_set_ps(17., 18., 19., 20.);

        // Zero mask: an all-zero vector is expected.
        let zeroed = _mm256_maskz_broadcastss_ps(0, source);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        // All eight mask bits set: every lane is expected to hold 20.
        let want = _mm256_set1_ps(20.);
        let got = _mm256_maskz_broadcastss_ps(0b11111111, source);
        assert_eq_m256(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_broadcastss_ps() {
        // `src` deliberately differs from the broadcast value (20.). With
        // `src == 20.` both assertions below would still pass if the writemask
        // were ignored, so the test could not tell the two cases apart.
        let src = _mm_set1_ps(100.);
        let a = _mm_set_ps(17., 18., 19., 20.);
        // Zero mask: the result is expected to be `src` unchanged.
        let r = _mm_mask_broadcastss_ps(src, 0, a);
        assert_eq_m128(r, src);
        // Low four mask bits set: every lane is expected to hold 20.
        let r = _mm_mask_broadcastss_ps(src, 0b00001111, a);
        let e = _mm_set1_ps(20.);
        assert_eq_m128(r, e);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_broadcastss_ps() {
        let source = _mm_set_ps(17., 18., 19., 20.);

        // Zero mask: an all-zero vector is expected.
        let zeroed = _mm_maskz_broadcastss_ps(0, source);
        assert_eq_m128(zeroed, _mm_setzero_ps());

        // Low four mask bits set: every lane is expected to hold 20.
        let want = _mm_set1_ps(20.);
        let got = _mm_maskz_broadcastss_ps(0b00001111, source);
        assert_eq_m128(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_broadcast_i32x4() {
        // The expected vector is the four-value input repeated four times.
        let quad = _mm_set_epi32(17, 18, 19, 20);
        let want = _mm512_set_epi32(
            17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20,
        );
        let got = _mm512_broadcast_i32x4(quad);
        assert_eq_m512i(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_broadcast_i32x4() {
        let fallback = _mm512_set1_epi32(20);
        let quad = _mm_set_epi32(17, 18, 19, 20);

        // Zero mask: the result is expected to be `fallback` unchanged.
        let kept = _mm512_mask_broadcast_i32x4(fallback, 0, quad);
        assert_eq_m512i(kept, fallback);

        // All sixteen mask bits set: `quad` repeated four times is expected.
        let want = _mm512_set_epi32(
            17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20, 17, 18, 19, 20,
        );
        let got = _mm512_mask_broadcast_i32x4(fallback, 0b11111111_11111111, quad);
        assert_eq_m512i(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_broadcast_i32x4() {
        let quad = _mm_set_epi32(17, 18, 19, 20);

        // Zero mask: an all-zero vector is expected.
        let zeroed = _mm512_maskz_broadcast_i32x4(0, quad);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());

        // Only the low eight mask bits set: the first eight `set` values of the
        // expected vector are zero.
        let want = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 17, 18, 19, 20, 17, 18, 19, 20);
        let got = _mm512_maskz_broadcast_i32x4(0b00000000_11111111, quad);
        assert_eq_m512i(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_broadcast_i32x4() {
        // The expected vector is the four-value input repeated twice.
        let quad = _mm_set_epi32(17, 18, 19, 20);
        let want = _mm256_set_epi32(17, 18, 19, 20, 17, 18, 19, 20);
        let got = _mm256_broadcast_i32x4(quad);
        assert_eq_m256i(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_broadcast_i32x4() {
        let fallback = _mm256_set1_epi32(20);
        let quad = _mm_set_epi32(17, 18, 19, 20);

        // Zero mask: the result is expected to be `fallback` unchanged.
        let kept = _mm256_mask_broadcast_i32x4(fallback, 0, quad);
        assert_eq_m256i(kept, fallback);

        // All eight mask bits set: `quad` repeated twice is expected.
        let want = _mm256_set_epi32(17, 18, 19, 20, 17, 18, 19, 20);
        let got = _mm256_mask_broadcast_i32x4(fallback, 0b11111111, quad);
        assert_eq_m256i(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_broadcast_i32x4() {
        let quad = _mm_set_epi32(17, 18, 19, 20);

        // Zero mask: an all-zero vector is expected.
        let zeroed = _mm256_maskz_broadcast_i32x4(0, quad);
        assert_eq_m256i(zeroed, _mm256_setzero_si256());

        // All eight mask bits set: `quad` repeated twice is expected.
        let want = _mm256_set_epi32(17, 18, 19, 20, 17, 18, 19, 20);
        let got = _mm256_maskz_broadcast_i32x4(0b11111111, quad);
        assert_eq_m256i(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_broadcast_f32x4() {
        // The expected vector is the four-value input repeated four times.
        let quad = _mm_set_ps(17., 18., 19., 20.);
        let want = _mm512_set_ps(
            17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20.,
        );
        let got = _mm512_broadcast_f32x4(quad);
        assert_eq_m512(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_broadcast_f32x4() {
        let fallback = _mm512_set1_ps(20.);
        let quad = _mm_set_ps(17., 18., 19., 20.);

        // Zero mask: the result is expected to be `fallback` unchanged.
        let kept = _mm512_mask_broadcast_f32x4(fallback, 0, quad);
        assert_eq_m512(kept, fallback);

        // All sixteen mask bits set: `quad` repeated four times is expected.
        let want = _mm512_set_ps(
            17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20., 17., 18., 19., 20.,
        );
        let got = _mm512_mask_broadcast_f32x4(fallback, 0b11111111_11111111, quad);
        assert_eq_m512(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_broadcast_f32x4() {
        let quad = _mm_set_ps(17., 18., 19., 20.);

        // Zero mask: an all-zero vector is expected.
        let zeroed = _mm512_maskz_broadcast_f32x4(0, quad);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        // Only the low eight mask bits set: the first eight `set` values of the
        // expected vector are zero.
        let want = _mm512_set_ps(
            0., 0., 0., 0., 0., 0., 0., 0., 17., 18., 19., 20., 17., 18., 19., 20.,
        );
        let got = _mm512_maskz_broadcast_f32x4(0b00000000_11111111, quad);
        assert_eq_m512(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_broadcast_f32x4() {
        // The expected vector is the four-value input repeated twice.
        let quad = _mm_set_ps(17., 18., 19., 20.);
        let want = _mm256_set_ps(17., 18., 19., 20., 17., 18., 19., 20.);
        let got = _mm256_broadcast_f32x4(quad);
        assert_eq_m256(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_broadcast_f32x4() {
        let fallback = _mm256_set1_ps(20.);
        let quad = _mm_set_ps(17., 18., 19., 20.);

        // Zero mask: the result is expected to be `fallback` unchanged.
        let kept = _mm256_mask_broadcast_f32x4(fallback, 0, quad);
        assert_eq_m256(kept, fallback);

        // All eight mask bits set: `quad` repeated twice is expected.
        let want = _mm256_set_ps(17., 18., 19., 20., 17., 18., 19., 20.);
        let got = _mm256_mask_broadcast_f32x4(fallback, 0b11111111, quad);
        assert_eq_m256(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_broadcast_f32x4() {
        let quad = _mm_set_ps(17., 18., 19., 20.);

        // Zero mask: an all-zero vector is expected.
        let zeroed = _mm256_maskz_broadcast_f32x4(0, quad);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        // All eight mask bits set: `quad` repeated twice is expected.
        let want = _mm256_set_ps(17., 18., 19., 20., 17., 18., 19., 20.);
        let got = _mm256_maskz_broadcast_f32x4(0b11111111, quad);
        assert_eq_m256(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_blend_epi32() {
        // High eight mask bits set: in `set` order the expected vector is eight
        // 2s (from `twos`) followed by eight 1s (from `ones`).
        let ones = _mm512_set1_epi32(1);
        let twos = _mm512_set1_epi32(2);
        let want = _mm512_set_epi32(2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1);
        let got = _mm512_mask_blend_epi32(0b11111111_00000000, ones, twos);
        assert_eq_m512i(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_blend_epi32() {
        // All eight mask bits set: every lane is expected to come from `twos`.
        let ones = _mm256_set1_epi32(1);
        let twos = _mm256_set1_epi32(2);
        let want = _mm256_set1_epi32(2);
        let got = _mm256_mask_blend_epi32(0b11111111, ones, twos);
        assert_eq_m256i(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_blend_epi32() {
        // Low four mask bits set: every lane is expected to come from `twos`.
        let ones = _mm_set1_epi32(1);
        let twos = _mm_set1_epi32(2);
        let want = _mm_set1_epi32(2);
        let got = _mm_mask_blend_epi32(0b00001111, ones, twos);
        assert_eq_m128i(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_blend_ps() {
        // High eight mask bits set: in `set` order the expected vector is eight
        // 2s (from `twos`) followed by eight 1s (from `ones`).
        let ones = _mm512_set1_ps(1.);
        let twos = _mm512_set1_ps(2.);
        let want = _mm512_set_ps(
            2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1.,
        );
        let got = _mm512_mask_blend_ps(0b11111111_00000000, ones, twos);
        assert_eq_m512(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_blend_ps() {
        // All eight mask bits set: every lane is expected to come from `twos`.
        let ones = _mm256_set1_ps(1.);
        let twos = _mm256_set1_ps(2.);
        let want = _mm256_set1_ps(2.);
        let got = _mm256_mask_blend_ps(0b11111111, ones, twos);
        assert_eq_m256(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_blend_ps() {
        // Low four mask bits set: every lane is expected to come from `twos`.
        let ones = _mm_set1_ps(1.);
        let twos = _mm_set1_ps(2.);
        let want = _mm_set1_ps(2.);
        let got = _mm_mask_blend_ps(0b00001111, ones, twos);
        assert_eq_m128(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_unpackhi_epi32() {
        let lhs = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let rhs = _mm512_set_epi32(
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
        );
        // The expected vector alternates one `rhs` value with one `lhs` value.
        let want = _mm512_set_epi32(17, 1, 18, 2, 21, 5, 22, 6, 25, 9, 26, 10, 29, 13, 30, 14);
        let got = _mm512_unpackhi_epi32(lhs, rhs);
        assert_eq_m512i(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_unpackhi_epi32() {
        let lhs = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let rhs = _mm512_set_epi32(
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
        );

        // Zero mask, with `lhs` also passed as the first argument: the result is
        // expected to equal `lhs`.
        let kept = _mm512_mask_unpackhi_epi32(lhs, 0, lhs, rhs);
        assert_eq_m512i(kept, lhs);

        // All sixteen mask bits set.
        let want = _mm512_set_epi32(17, 1, 18, 2, 21, 5, 22, 6, 25, 9, 26, 10, 29, 13, 30, 14);
        let got = _mm512_mask_unpackhi_epi32(lhs, 0b11111111_11111111, lhs, rhs);
        assert_eq_m512i(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_unpackhi_epi32() {
        let lhs = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let rhs = _mm512_set_epi32(
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
        );

        // Zero mask: an all-zero vector is expected.
        let zeroed = _mm512_maskz_unpackhi_epi32(0, lhs, rhs);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());

        // Only the low eight mask bits set: the first eight `set` values of the
        // expected vector are zero.
        let want = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 25, 9, 26, 10, 29, 13, 30, 14);
        let got = _mm512_maskz_unpackhi_epi32(0b00000000_11111111, lhs, rhs);
        assert_eq_m512i(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_unpackhi_epi32() {
        let lhs = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let rhs = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24);

        // Zero mask: the result is expected to equal the first argument.
        let kept = _mm256_mask_unpackhi_epi32(lhs, 0, lhs, rhs);
        assert_eq_m256i(kept, lhs);

        // All eight mask bits set.
        let want = _mm256_set_epi32(17, 1, 18, 2, 21, 5, 22, 6);
        let got = _mm256_mask_unpackhi_epi32(lhs, 0b11111111, lhs, rhs);
        assert_eq_m256i(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_unpackhi_epi32() {
        let lhs = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let rhs = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24);

        // Zero mask: an all-zero vector is expected.
        let zeroed = _mm256_maskz_unpackhi_epi32(0, lhs, rhs);
        assert_eq_m256i(zeroed, _mm256_setzero_si256());

        // All eight mask bits set.
        let want = _mm256_set_epi32(17, 1, 18, 2, 21, 5, 22, 6);
        let got = _mm256_maskz_unpackhi_epi32(0b11111111, lhs, rhs);
        assert_eq_m256i(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_unpackhi_epi32() {
        let lhs = _mm_set_epi32(1, 2, 3, 4);
        let rhs = _mm_set_epi32(17, 18, 19, 20);

        // Zero mask: the result is expected to equal the first argument.
        let kept = _mm_mask_unpackhi_epi32(lhs, 0, lhs, rhs);
        assert_eq_m128i(kept, lhs);

        // Low four mask bits set.
        let want = _mm_set_epi32(17, 1, 18, 2);
        let got = _mm_mask_unpackhi_epi32(lhs, 0b00001111, lhs, rhs);
        assert_eq_m128i(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_unpackhi_epi32() {
        let lhs = _mm_set_epi32(1, 2, 3, 4);
        let rhs = _mm_set_epi32(17, 18, 19, 20);

        // Zero mask: an all-zero vector is expected.
        let zeroed = _mm_maskz_unpackhi_epi32(0, lhs, rhs);
        assert_eq_m128i(zeroed, _mm_setzero_si128());

        // Low four mask bits set.
        let want = _mm_set_epi32(17, 1, 18, 2);
        let got = _mm_maskz_unpackhi_epi32(0b00001111, lhs, rhs);
        assert_eq_m128i(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_unpackhi_ps() {
        let lhs = _mm512_set_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let rhs = _mm512_set_ps(
            17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
        );
        // The expected vector alternates one `rhs` value with one `lhs` value.
        let want = _mm512_set_ps(
            17., 1., 18., 2., 21., 5., 22., 6., 25., 9., 26., 10., 29., 13., 30., 14.,
        );
        let got = _mm512_unpackhi_ps(lhs, rhs);
        assert_eq_m512(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_unpackhi_ps() {
        let lhs = _mm512_set_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let rhs = _mm512_set_ps(
            17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
        );

        // Zero mask, with `lhs` also passed as the first argument: the result is
        // expected to equal `lhs`.
        let kept = _mm512_mask_unpackhi_ps(lhs, 0, lhs, rhs);
        assert_eq_m512(kept, lhs);

        // All sixteen mask bits set.
        let want = _mm512_set_ps(
            17., 1., 18., 2., 21., 5., 22., 6., 25., 9., 26., 10., 29., 13., 30., 14.,
        );
        let got = _mm512_mask_unpackhi_ps(lhs, 0b11111111_11111111, lhs, rhs);
        assert_eq_m512(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_unpackhi_ps() {
        let lhs = _mm512_set_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let rhs = _mm512_set_ps(
            17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
        );

        // Zero mask: an all-zero vector is expected.
        let zeroed = _mm512_maskz_unpackhi_ps(0, lhs, rhs);
        assert_eq_m512(zeroed, _mm512_setzero_ps());

        // Only the low eight mask bits set: the first eight `set` values of the
        // expected vector are zero.
        let want = _mm512_set_ps(
            0., 0., 0., 0., 0., 0., 0., 0., 25., 9., 26., 10., 29., 13., 30., 14.,
        );
        let got = _mm512_maskz_unpackhi_ps(0b00000000_11111111, lhs, rhs);
        assert_eq_m512(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_unpackhi_ps() {
        let lhs = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let rhs = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.);

        // Zero mask: the result is expected to equal the first argument.
        let kept = _mm256_mask_unpackhi_ps(lhs, 0, lhs, rhs);
        assert_eq_m256(kept, lhs);

        // All eight mask bits set.
        let want = _mm256_set_ps(17., 1., 18., 2., 21., 5., 22., 6.);
        let got = _mm256_mask_unpackhi_ps(lhs, 0b11111111, lhs, rhs);
        assert_eq_m256(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_unpackhi_ps() {
        let lhs = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let rhs = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.);

        // Zero mask: an all-zero vector is expected.
        let zeroed = _mm256_maskz_unpackhi_ps(0, lhs, rhs);
        assert_eq_m256(zeroed, _mm256_setzero_ps());

        // All eight mask bits set.
        let want = _mm256_set_ps(17., 1., 18., 2., 21., 5., 22., 6.);
        let got = _mm256_maskz_unpackhi_ps(0b11111111, lhs, rhs);
        assert_eq_m256(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_unpackhi_ps() {
        let lhs = _mm_set_ps(1., 2., 3., 4.);
        let rhs = _mm_set_ps(17., 18., 19., 20.);

        // Zero mask: the result is expected to equal the first argument.
        let kept = _mm_mask_unpackhi_ps(lhs, 0, lhs, rhs);
        assert_eq_m128(kept, lhs);

        // Low four mask bits set.
        let want = _mm_set_ps(17., 1., 18., 2.);
        let got = _mm_mask_unpackhi_ps(lhs, 0b00001111, lhs, rhs);
        assert_eq_m128(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_unpackhi_ps() {
        // Zero-masking variant of the 128-bit single-precision high unpack.
        let lhs = _mm_set_ps(1., 2., 3., 4.);
        let rhs = _mm_set_ps(17., 18., 19., 20.);

        // Empty mask: the result is all zeros.
        let cleared = _mm_maskz_unpackhi_ps(0, lhs, rhs);
        assert_eq_m128(cleared, _mm_setzero_ps());

        // All four low mask bits set: every element comes from the unpack.
        let unpacked = _mm_maskz_unpackhi_ps(0b00001111, lhs, rhs);
        assert_eq_m128(unpacked, _mm_set_ps(17., 1., 18., 2.));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_unpacklo_epi32() {
        // Unmasked 512-bit low unpack of 32-bit integers.
        let lhs = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let rhs = _mm512_set_epi32(
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
        );
        let unpacked = _mm512_unpacklo_epi32(lhs, rhs);
        let want = _mm512_set_epi32(
            19, 3, 20, 4, 23, 7, 24, 8, 27, 11, 28, 12, 31, 15, 32, 16,
        );
        assert_eq_m512i(unpacked, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_unpacklo_epi32() {
        // Merge-masking: `lhs` is also passed as `src`.
        let lhs = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let rhs = _mm512_set_epi32(
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
        );

        // Empty mask: the result is `src` unchanged.
        let untouched = _mm512_mask_unpacklo_epi32(lhs, 0, lhs, rhs);
        assert_eq_m512i(untouched, lhs);

        // Full mask: every element comes from the unpack.
        let unpacked = _mm512_mask_unpacklo_epi32(lhs, 0b11111111_11111111, lhs, rhs);
        let want = _mm512_set_epi32(
            19, 3, 20, 4, 23, 7, 24, 8, 27, 11, 28, 12, 31, 15, 32, 16,
        );
        assert_eq_m512i(unpacked, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_unpacklo_epi32() {
        // Zero-masking variant of the 512-bit integer low unpack.
        let lhs = _mm512_set_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let rhs = _mm512_set_epi32(
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
        );

        // Empty mask: the result is all zeros.
        let cleared = _mm512_maskz_unpacklo_epi32(0, lhs, rhs);
        assert_eq_m512i(cleared, _mm512_setzero_si512());

        // With only the low eight mask bits set, the remaining elements stay zero.
        let partial = _mm512_maskz_unpacklo_epi32(0b00000000_11111111, lhs, rhs);
        let want = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 27, 11, 28, 12, 31, 15, 32, 16);
        assert_eq_m512i(partial, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_unpacklo_epi32() {
        // Merge-masking: `lhs` is also passed as `src`.
        let lhs = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let rhs = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24);

        // Empty mask: the result is `src` unchanged.
        let untouched = _mm256_mask_unpacklo_epi32(lhs, 0, lhs, rhs);
        assert_eq_m256i(untouched, lhs);

        // Full mask: every element comes from the unpack.
        let unpacked = _mm256_mask_unpacklo_epi32(lhs, 0b11111111, lhs, rhs);
        assert_eq_m256i(unpacked, _mm256_set_epi32(19, 3, 20, 4, 23, 7, 24, 8));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_unpacklo_epi32() {
        // Zero-masking variant of the 256-bit integer low unpack.
        let lhs = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let rhs = _mm256_set_epi32(17, 18, 19, 20, 21, 22, 23, 24);

        // Empty mask: the result is all zeros.
        let cleared = _mm256_maskz_unpacklo_epi32(0, lhs, rhs);
        assert_eq_m256i(cleared, _mm256_setzero_si256());

        // Full mask: every element comes from the unpack.
        let unpacked = _mm256_maskz_unpacklo_epi32(0b11111111, lhs, rhs);
        assert_eq_m256i(unpacked, _mm256_set_epi32(19, 3, 20, 4, 23, 7, 24, 8));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_unpacklo_epi32() {
        // Merge-masking: `lhs` is also passed as `src`.
        let lhs = _mm_set_epi32(1, 2, 3, 4);
        let rhs = _mm_set_epi32(17, 18, 19, 20);

        // Empty mask: the result is `src` unchanged.
        let untouched = _mm_mask_unpacklo_epi32(lhs, 0, lhs, rhs);
        assert_eq_m128i(untouched, lhs);

        // All four low mask bits set: every element comes from the unpack.
        let unpacked = _mm_mask_unpacklo_epi32(lhs, 0b00001111, lhs, rhs);
        assert_eq_m128i(unpacked, _mm_set_epi32(19, 3, 20, 4));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_unpacklo_epi32() {
        // Zero-masking variant of the 128-bit integer low unpack.
        let lhs = _mm_set_epi32(1, 2, 3, 4);
        let rhs = _mm_set_epi32(17, 18, 19, 20);

        // Empty mask: the result is all zeros.
        let cleared = _mm_maskz_unpacklo_epi32(0, lhs, rhs);
        assert_eq_m128i(cleared, _mm_setzero_si128());

        // All four low mask bits set: every element comes from the unpack.
        let unpacked = _mm_maskz_unpacklo_epi32(0b00001111, lhs, rhs);
        assert_eq_m128i(unpacked, _mm_set_epi32(19, 3, 20, 4));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_unpacklo_ps() {
        // Unmasked 512-bit low unpack of single-precision floats.
        let lhs = _mm512_set_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let rhs = _mm512_set_ps(
            17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
        );
        let want = _mm512_set_ps(
            19., 3., 20., 4., 23., 7., 24., 8., 27., 11., 28., 12., 31., 15., 32., 16.,
        );
        assert_eq_m512(_mm512_unpacklo_ps(lhs, rhs), want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_unpacklo_ps() {
        // Merge-masking: `lhs` is also passed as `src`.
        let lhs = _mm512_set_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let rhs = _mm512_set_ps(
            17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
        );

        // Empty mask: the result is `src` unchanged.
        let untouched = _mm512_mask_unpacklo_ps(lhs, 0, lhs, rhs);
        assert_eq_m512(untouched, lhs);

        // Full mask: every element comes from the unpack.
        let unpacked = _mm512_mask_unpacklo_ps(lhs, 0b11111111_11111111, lhs, rhs);
        let want = _mm512_set_ps(
            19., 3., 20., 4., 23., 7., 24., 8., 27., 11., 28., 12., 31., 15., 32., 16.,
        );
        assert_eq_m512(unpacked, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_unpacklo_ps() {
        // Zero-masking variant of the 512-bit single-precision low unpack.
        let lhs = _mm512_set_ps(
            1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        );
        let rhs = _mm512_set_ps(
            17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27., 28., 29., 30., 31., 32.,
        );

        // Empty mask: the result is all zeros.
        let cleared = _mm512_maskz_unpacklo_ps(0, lhs, rhs);
        assert_eq_m512(cleared, _mm512_setzero_ps());

        // With only the low eight mask bits set, the remaining elements stay zero.
        let partial = _mm512_maskz_unpacklo_ps(0b00000000_11111111, lhs, rhs);
        let want = _mm512_set_ps(
            0., 0., 0., 0., 0., 0., 0., 0., 27., 11., 28., 12., 31., 15., 32., 16.,
        );
        assert_eq_m512(partial, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_unpacklo_ps() {
        // Merge-masking: `lhs` is also passed as `src`.
        let lhs = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let rhs = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.);

        // Empty mask: the result is `src` unchanged.
        let untouched = _mm256_mask_unpacklo_ps(lhs, 0, lhs, rhs);
        assert_eq_m256(untouched, lhs);

        // Full mask: every element comes from the unpack.
        let unpacked = _mm256_mask_unpacklo_ps(lhs, 0b11111111, lhs, rhs);
        assert_eq_m256(unpacked, _mm256_set_ps(19., 3., 20., 4., 23., 7., 24., 8.));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_unpacklo_ps() {
        // Zero-masking variant of the 256-bit single-precision low unpack.
        let lhs = _mm256_set_ps(1., 2., 3., 4., 5., 6., 7., 8.);
        let rhs = _mm256_set_ps(17., 18., 19., 20., 21., 22., 23., 24.);

        // Empty mask: the result is all zeros.
        let cleared = _mm256_maskz_unpacklo_ps(0, lhs, rhs);
        assert_eq_m256(cleared, _mm256_setzero_ps());

        // Full mask: every element comes from the unpack.
        let unpacked = _mm256_maskz_unpacklo_ps(0b11111111, lhs, rhs);
        assert_eq_m256(unpacked, _mm256_set_ps(19., 3., 20., 4., 23., 7., 24., 8.));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_unpacklo_ps() {
        // Merge-masking: `lhs` is also passed as `src`.
        let lhs = _mm_set_ps(1., 2., 3., 4.);
        let rhs = _mm_set_ps(17., 18., 19., 20.);

        // Empty mask: the result is `src` unchanged.
        let untouched = _mm_mask_unpacklo_ps(lhs, 0, lhs, rhs);
        assert_eq_m128(untouched, lhs);

        // All four low mask bits set: every element comes from the unpack.
        let unpacked = _mm_mask_unpacklo_ps(lhs, 0b00001111, lhs, rhs);
        assert_eq_m128(unpacked, _mm_set_ps(19., 3., 20., 4.));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_unpacklo_ps() {
        // Zero-masking variant of the 128-bit single-precision low unpack.
        let lhs = _mm_set_ps(1., 2., 3., 4.);
        let rhs = _mm_set_ps(17., 18., 19., 20.);

        // Empty mask: the result is all zeros.
        let cleared = _mm_maskz_unpacklo_ps(0, lhs, rhs);
        assert_eq_m128(cleared, _mm_setzero_ps());

        // All four low mask bits set: every element comes from the unpack.
        let unpacked = _mm_maskz_unpacklo_ps(0b00001111, lhs, rhs);
        assert_eq_m128(unpacked, _mm_set_ps(19., 3., 20., 4.));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_alignr_epi32() {
        let first = _mm512_set_epi32(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
        let second = _mm512_set_epi32(
            32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
        );

        // A count of 0 yields the second operand unchanged.
        let by_zero = _mm512_alignr_epi32::<0>(first, second);
        assert_eq_m512i(by_zero, second);

        // A count of 16 is asserted to give the same result as 0.
        let by_sixteen = _mm512_alignr_epi32::<16>(first, second);
        assert_eq_m512i(by_sixteen, second);

        // A count of 1 drops `second`'s element 17 and brings in `first`'s element 1.
        let by_one = _mm512_alignr_epi32::<1>(first, second);
        let want = _mm512_set_epi32(
            1, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18,
        );
        assert_eq_m512i(by_one, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_alignr_epi32() {
        // Merge-masking: `first` is also passed as `src`.
        let first = _mm512_set_epi32(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
        let second = _mm512_set_epi32(
            32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
        );

        // Empty mask: the result is `src` unchanged.
        let untouched = _mm512_mask_alignr_epi32::<1>(first, 0, first, second);
        assert_eq_m512i(untouched, first);

        // Full mask: every element comes from the align operation.
        let aligned = _mm512_mask_alignr_epi32::<1>(first, 0b11111111_11111111, first, second);
        let want = _mm512_set_epi32(
            1, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18,
        );
        assert_eq_m512i(aligned, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_alignr_epi32() {
        // Zero-masking variant of the 512-bit 32-bit-element align.
        let first = _mm512_set_epi32(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
        let second = _mm512_set_epi32(
            32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
        );

        // Empty mask: the result is all zeros.
        let cleared = _mm512_maskz_alignr_epi32::<1>(0, first, second);
        assert_eq_m512i(cleared, _mm512_setzero_si512());

        // With only the low eight mask bits set, the remaining elements stay zero.
        let partial = _mm512_maskz_alignr_epi32::<1>(0b00000000_11111111, first, second);
        let want = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 25, 24, 23, 22, 21, 20, 19, 18);
        assert_eq_m512i(partial, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_alignr_epi32() {
        let first = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1);
        let second = _mm256_set_epi32(16, 15, 14, 13, 12, 11, 10, 9);

        // A count of 0 yields the second operand unchanged.
        let by_zero = _mm256_alignr_epi32::<0>(first, second);
        assert_eq_m256i(by_zero, second);

        // A count of 1 drops `second`'s element 9 and brings in `first`'s element 1.
        let by_one = _mm256_alignr_epi32::<1>(first, second);
        assert_eq_m256i(by_one, _mm256_set_epi32(1, 16, 15, 14, 13, 12, 11, 10));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_alignr_epi32() {
        // Merge-masking: `first` is also passed as `src`.
        let first = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1);
        let second = _mm256_set_epi32(16, 15, 14, 13, 12, 11, 10, 9);

        // Empty mask: the result is `src` unchanged.
        let untouched = _mm256_mask_alignr_epi32::<1>(first, 0, first, second);
        assert_eq_m256i(untouched, first);

        // Full mask: every element comes from the align operation.
        let aligned = _mm256_mask_alignr_epi32::<1>(first, 0b11111111, first, second);
        assert_eq_m256i(aligned, _mm256_set_epi32(1, 16, 15, 14, 13, 12, 11, 10));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_alignr_epi32() {
        // Zero-masking variant of the 256-bit 32-bit-element align.
        let first = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1);
        let second = _mm256_set_epi32(16, 15, 14, 13, 12, 11, 10, 9);

        // Empty mask: the result is all zeros.
        let cleared = _mm256_maskz_alignr_epi32::<1>(0, first, second);
        assert_eq_m256i(cleared, _mm256_setzero_si256());

        // Full mask: every element comes from the align operation.
        let aligned = _mm256_maskz_alignr_epi32::<1>(0b11111111, first, second);
        assert_eq_m256i(aligned, _mm256_set_epi32(1, 16, 15, 14, 13, 12, 11, 10));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_alignr_epi32() {
        let first = _mm_set_epi32(4, 3, 2, 1);
        let second = _mm_set_epi32(8, 7, 6, 5);

        // A count of 0 yields the second operand unchanged.
        let by_zero = _mm_alignr_epi32::<0>(first, second);
        assert_eq_m128i(by_zero, second);

        // A count of 1 drops `second`'s element 5 and brings in `first`'s element 1.
        let by_one = _mm_alignr_epi32::<1>(first, second);
        assert_eq_m128i(by_one, _mm_set_epi32(1, 8, 7, 6));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_alignr_epi32() {
        // Merge-masking: `first` is also passed as `src`.
        let first = _mm_set_epi32(4, 3, 2, 1);
        let second = _mm_set_epi32(8, 7, 6, 5);

        // Empty mask: the result is `src` unchanged.
        let untouched = _mm_mask_alignr_epi32::<1>(first, 0, first, second);
        assert_eq_m128i(untouched, first);

        // All four low mask bits set: every element comes from the align operation.
        let aligned = _mm_mask_alignr_epi32::<1>(first, 0b00001111, first, second);
        assert_eq_m128i(aligned, _mm_set_epi32(1, 8, 7, 6));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_alignr_epi32() {
        // Zero-masking variant of the 128-bit 32-bit-element align.
        let first = _mm_set_epi32(4, 3, 2, 1);
        let second = _mm_set_epi32(8, 7, 6, 5);

        // Empty mask: the result is all zeros.
        let cleared = _mm_maskz_alignr_epi32::<1>(0, first, second);
        assert_eq_m128i(cleared, _mm_setzero_si128());

        // All four low mask bits set: every element comes from the align operation.
        let aligned = _mm_maskz_alignr_epi32::<1>(0b00001111, first, second);
        assert_eq_m128i(aligned, _mm_set_epi32(1, 8, 7, 6));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_and_epi32() {
        // Only the first and last `set` arguments are non-zero; each pair shares
        // exactly one bit, which is what the AND must keep.
        #[rustfmt::skip]
        let lhs = _mm512_set_epi32(
            1 << 1 | 1 << 2, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 1 | 1 << 3,
        );
        #[rustfmt::skip]
        let rhs = _mm512_set_epi32(
            1 << 1, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 3 | 1 << 4,
        );
        let anded = _mm512_and_epi32(lhs, rhs);
        #[rustfmt::skip]
        let want = _mm512_set_epi32(
            1 << 1, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 3,
        );
        assert_eq_m512i(anded, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_and_epi32() {
        // Merge-masking: `lhs` is also passed as `src`.
        #[rustfmt::skip]
        let lhs = _mm512_set_epi32(
            1 << 1 | 1 << 2, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 1 | 1 << 3,
        );
        #[rustfmt::skip]
        let rhs = _mm512_set_epi32(
            1 << 1, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 3 | 1 << 4,
        );

        // Empty mask: the result is `src` unchanged.
        let untouched = _mm512_mask_and_epi32(lhs, 0, lhs, rhs);
        assert_eq_m512i(untouched, lhs);

        // The top mask bit is clear, so the first-listed element keeps its `src`
        // value while the last-listed element holds the AND result.
        let merged = _mm512_mask_and_epi32(lhs, 0b01111111_11111111, lhs, rhs);
        #[rustfmt::skip]
        let want = _mm512_set_epi32(
            1 << 1 | 1 << 2, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 3,
        );
        assert_eq_m512i(merged, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_and_epi32() {
        // Zero-masking variant of the 512-bit 32-bit AND.
        #[rustfmt::skip]
        let lhs = _mm512_set_epi32(
            1 << 1 | 1 << 2, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 1 | 1 << 3,
        );
        #[rustfmt::skip]
        let rhs = _mm512_set_epi32(
            1 << 1, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 3 | 1 << 4,
        );

        // Empty mask: the result is all zeros.
        let cleared = _mm512_maskz_and_epi32(0, lhs, rhs);
        assert_eq_m512i(cleared, _mm512_setzero_si512());

        // With only the low eight mask bits set, the first-listed element is
        // zeroed and the last-listed element holds the AND result.
        let partial = _mm512_maskz_and_epi32(0b00000000_11111111, lhs, rhs);
        #[rustfmt::skip]
        let want = _mm512_set_epi32(
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 3,
        );
        assert_eq_m512i(partial, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_and_epi32() {
        // Merge-masking: `lhs` is also passed as `src`.
        let lhs = _mm256_set1_epi32(1 << 1 | 1 << 2);
        let rhs = _mm256_set1_epi32(1 << 1);

        // Empty mask: the result is `src` unchanged.
        let untouched = _mm256_mask_and_epi32(lhs, 0, lhs, rhs);
        assert_eq_m256i(untouched, lhs);

        // Full mask: only the shared bit 1 survives in every element.
        let anded = _mm256_mask_and_epi32(lhs, 0b11111111, lhs, rhs);
        assert_eq_m256i(anded, _mm256_set1_epi32(1 << 1));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_and_epi32() {
        // Zero-masking variant of the 256-bit 32-bit AND.
        let lhs = _mm256_set1_epi32(1 << 1 | 1 << 2);
        let rhs = _mm256_set1_epi32(1 << 1);

        // Empty mask: the result is all zeros.
        let cleared = _mm256_maskz_and_epi32(0, lhs, rhs);
        assert_eq_m256i(cleared, _mm256_setzero_si256());

        // Full mask: only the shared bit 1 survives in every element.
        let anded = _mm256_maskz_and_epi32(0b11111111, lhs, rhs);
        assert_eq_m256i(anded, _mm256_set1_epi32(1 << 1));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_and_epi32() {
        // Merge-masking: `lhs` is also passed as `src`.
        let lhs = _mm_set1_epi32(1 << 1 | 1 << 2);
        let rhs = _mm_set1_epi32(1 << 1);

        // Empty mask: the result is `src` unchanged.
        let untouched = _mm_mask_and_epi32(lhs, 0, lhs, rhs);
        assert_eq_m128i(untouched, lhs);

        // All four low mask bits set: only the shared bit 1 survives.
        let anded = _mm_mask_and_epi32(lhs, 0b00001111, lhs, rhs);
        assert_eq_m128i(anded, _mm_set1_epi32(1 << 1));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_and_epi32() {
        // Zero-masking variant of the 128-bit 32-bit AND.
        let lhs = _mm_set1_epi32(1 << 1 | 1 << 2);
        let rhs = _mm_set1_epi32(1 << 1);

        // Empty mask: the result is all zeros.
        let cleared = _mm_maskz_and_epi32(0, lhs, rhs);
        assert_eq_m128i(cleared, _mm_setzero_si128());

        // All four low mask bits set: only the shared bit 1 survives.
        let anded = _mm_maskz_and_epi32(0b00001111, lhs, rhs);
        assert_eq_m128i(anded, _mm_set1_epi32(1 << 1));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_and_si512() {
        #[rustfmt::skip]
        let a = _mm512_set_epi32(
            1 << 1 | 1 << 2, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 1 | 1 << 3,
        );
        #[rustfmt::skip]
        let b = _mm512_set_epi32(
            1 << 1, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 3 | 1 << 4,
        );
        let r = _mm512_and_epi32(a, b);
        let e = _mm512_set_epi32(1 << 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 << 3);
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_or_epi32() {
        // Only the first and last `set` arguments are non-zero; the expected
        // value is the union of the bits of both operands.
        #[rustfmt::skip]
        let lhs = _mm512_set_epi32(
            1 << 1 | 1 << 2, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 1 | 1 << 3,
        );
        #[rustfmt::skip]
        let rhs = _mm512_set_epi32(
            1 << 1, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 3 | 1 << 4,
        );
        #[rustfmt::skip]
        let want = _mm512_set_epi32(
            1 << 1 | 1 << 2, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 1 | 1 << 3 | 1 << 4,
        );
        assert_eq_m512i(_mm512_or_epi32(lhs, rhs), want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_or_epi32() {
        // Merge-masking: `lhs` is also passed as `src`.
        #[rustfmt::skip]
        let lhs = _mm512_set_epi32(
            1 << 1 | 1 << 2, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 1 | 1 << 3,
        );
        #[rustfmt::skip]
        let rhs = _mm512_set_epi32(
            1 << 1, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 3 | 1 << 4,
        );

        // Empty mask: the result is `src` unchanged.
        let untouched = _mm512_mask_or_epi32(lhs, 0, lhs, rhs);
        assert_eq_m512i(untouched, lhs);

        // Full mask: every element holds the OR result.
        let ored = _mm512_mask_or_epi32(lhs, 0b11111111_11111111, lhs, rhs);
        #[rustfmt::skip]
        let want = _mm512_set_epi32(
            1 << 1 | 1 << 2, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 1 | 1 << 3 | 1 << 4,
        );
        assert_eq_m512i(ored, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_or_epi32() {
        // Zero-masking variant of the 512-bit 32-bit OR.
        #[rustfmt::skip]
        let lhs = _mm512_set_epi32(
            1 << 1 | 1 << 2, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 1 | 1 << 3,
        );
        #[rustfmt::skip]
        let rhs = _mm512_set_epi32(
            1 << 1, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 3 | 1 << 4,
        );

        // Empty mask: the result is all zeros.
        let cleared = _mm512_maskz_or_epi32(0, lhs, rhs);
        assert_eq_m512i(cleared, _mm512_setzero_si512());

        // With only the low eight mask bits set, the first-listed element is
        // zeroed and the last-listed element holds the OR result.
        let partial = _mm512_maskz_or_epi32(0b00000000_11111111, lhs, rhs);
        #[rustfmt::skip]
        let want = _mm512_set_epi32(
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 1 | 1 << 3 | 1 << 4,
        );
        assert_eq_m512i(partial, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_or_epi32() {
        // `rhs`'s bit is already set in `lhs`, so the OR equals `lhs`'s pattern.
        let lhs = _mm256_set1_epi32(1 << 1 | 1 << 2);
        let rhs = _mm256_set1_epi32(1 << 1);
        let ored = _mm256_or_epi32(lhs, rhs);
        assert_eq_m256i(ored, _mm256_set1_epi32(1 << 1 | 1 << 2));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_or_epi32() {
        // Merge-masking: `lhs` is also passed as `src`.
        let lhs = _mm256_set1_epi32(1 << 1 | 1 << 2);
        let rhs = _mm256_set1_epi32(1 << 1);

        // Empty mask: the result is `src` unchanged.
        let untouched = _mm256_mask_or_epi32(lhs, 0, lhs, rhs);
        assert_eq_m256i(untouched, lhs);

        // Full mask: every element holds the OR result.
        let ored = _mm256_mask_or_epi32(lhs, 0b11111111, lhs, rhs);
        assert_eq_m256i(ored, _mm256_set1_epi32(1 << 1 | 1 << 2));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_or_epi32() {
        // Zero-masking variant of the 256-bit 32-bit OR.
        let lhs = _mm256_set1_epi32(1 << 1 | 1 << 2);
        let rhs = _mm256_set1_epi32(1 << 1);

        // Empty mask: the result is all zeros.
        let cleared = _mm256_maskz_or_epi32(0, lhs, rhs);
        assert_eq_m256i(cleared, _mm256_setzero_si256());

        // Full mask: every element holds the OR result.
        let ored = _mm256_maskz_or_epi32(0b11111111, lhs, rhs);
        assert_eq_m256i(ored, _mm256_set1_epi32(1 << 1 | 1 << 2));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_or_epi32() {
        // `rhs`'s bit is already set in `lhs`, so the OR equals `lhs`'s pattern.
        let lhs = _mm_set1_epi32(1 << 1 | 1 << 2);
        let rhs = _mm_set1_epi32(1 << 1);
        let ored = _mm_or_epi32(lhs, rhs);
        assert_eq_m128i(ored, _mm_set1_epi32(1 << 1 | 1 << 2));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_or_epi32() {
        // Merge-masking: `lhs` is also passed as `src`.
        let lhs = _mm_set1_epi32(1 << 1 | 1 << 2);
        let rhs = _mm_set1_epi32(1 << 1);

        // Empty mask: the result is `src` unchanged.
        let untouched = _mm_mask_or_epi32(lhs, 0, lhs, rhs);
        assert_eq_m128i(untouched, lhs);

        // All four low mask bits set: every element holds the OR result.
        let ored = _mm_mask_or_epi32(lhs, 0b00001111, lhs, rhs);
        assert_eq_m128i(ored, _mm_set1_epi32(1 << 1 | 1 << 2));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_or_epi32() {
        // Zero-masking variant of the 128-bit 32-bit OR.
        let lhs = _mm_set1_epi32(1 << 1 | 1 << 2);
        let rhs = _mm_set1_epi32(1 << 1);

        // Empty mask: the result is all zeros.
        let cleared = _mm_maskz_or_epi32(0, lhs, rhs);
        assert_eq_m128i(cleared, _mm_setzero_si128());

        // All four low mask bits set: every element holds the OR result.
        let ored = _mm_maskz_or_epi32(0b00001111, lhs, rhs);
        assert_eq_m128i(ored, _mm_set1_epi32(1 << 1 | 1 << 2));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_or_si512() {
        #[rustfmt::skip]
        let a = _mm512_set_epi32(
            1 << 1 | 1 << 2, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 1 | 1 << 3,
        );
        #[rustfmt::skip]
        let b = _mm512_set_epi32(
            1 << 1, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 3 | 1 << 4,
        );
        let r = _mm512_or_epi32(a, b);
        #[rustfmt::skip]
        let e = _mm512_set_epi32(
            1 << 1 | 1 << 2, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 1 | 1 << 3 | 1 << 4,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_xor_epi32() {
        // Only the first and last `set` arguments are non-zero; the bit shared
        // by each pair must cancel in the XOR.
        #[rustfmt::skip]
        let lhs = _mm512_set_epi32(
            1 << 1 | 1 << 2, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 1 | 1 << 3,
        );
        #[rustfmt::skip]
        let rhs = _mm512_set_epi32(
            1 << 1, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 3 | 1 << 4,
        );
        #[rustfmt::skip]
        let want = _mm512_set_epi32(
            1 << 2, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 1 | 1 << 4,
        );
        assert_eq_m512i(_mm512_xor_epi32(lhs, rhs), want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_xor_epi32() {
        // Merge-masking: `lhs` is also passed as `src`.
        #[rustfmt::skip]
        let lhs = _mm512_set_epi32(
            1 << 1 | 1 << 2, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 1 | 1 << 3,
        );
        #[rustfmt::skip]
        let rhs = _mm512_set_epi32(
            1 << 1, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 3 | 1 << 4,
        );

        // Empty mask: the result is `src` unchanged.
        let untouched = _mm512_mask_xor_epi32(lhs, 0, lhs, rhs);
        assert_eq_m512i(untouched, lhs);

        // The top mask bit is clear, so the first-listed element keeps its `src`
        // value while the last-listed element holds the XOR result.
        let merged = _mm512_mask_xor_epi32(lhs, 0b01111111_11111111, lhs, rhs);
        #[rustfmt::skip]
        let want = _mm512_set_epi32(
            1 << 1 | 1 << 2, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 1 | 1 << 4,
        );
        assert_eq_m512i(merged, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_xor_epi32() {
        // Zero-masking variant of the 512-bit 32-bit XOR.
        #[rustfmt::skip]
        let lhs = _mm512_set_epi32(
            1 << 1 | 1 << 2, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 1 | 1 << 3,
        );
        #[rustfmt::skip]
        let rhs = _mm512_set_epi32(
            1 << 1, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 3 | 1 << 4,
        );

        // Empty mask: the result is all zeros.
        let cleared = _mm512_maskz_xor_epi32(0, lhs, rhs);
        assert_eq_m512i(cleared, _mm512_setzero_si512());

        // With only the low eight mask bits set, the first-listed element is
        // zeroed and the last-listed element holds the XOR result.
        let partial = _mm512_maskz_xor_epi32(0b00000000_11111111, lhs, rhs);
        #[rustfmt::skip]
        let want = _mm512_set_epi32(
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 1 | 1 << 4,
        );
        assert_eq_m512i(partial, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_xor_epi32() {
        // The shared bit 1 cancels, leaving only bit 2 in every element.
        let lhs = _mm256_set1_epi32(1 << 1 | 1 << 2);
        let rhs = _mm256_set1_epi32(1 << 1);
        let xored = _mm256_xor_epi32(lhs, rhs);
        assert_eq_m256i(xored, _mm256_set1_epi32(1 << 2));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_xor_epi32() {
        // Merge-masking: `lhs` is also passed as `src`.
        let lhs = _mm256_set1_epi32(1 << 1 | 1 << 2);
        let rhs = _mm256_set1_epi32(1 << 1);

        // Empty mask: the result is `src` unchanged.
        let untouched = _mm256_mask_xor_epi32(lhs, 0, lhs, rhs);
        assert_eq_m256i(untouched, lhs);

        // Full mask: every element holds the XOR result.
        let xored = _mm256_mask_xor_epi32(lhs, 0b11111111, lhs, rhs);
        assert_eq_m256i(xored, _mm256_set1_epi32(1 << 2));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_xor_epi32() {
        // Zero-masking variant of the 256-bit 32-bit XOR.
        let lhs = _mm256_set1_epi32(1 << 1 | 1 << 2);
        let rhs = _mm256_set1_epi32(1 << 1);

        // Empty mask: the result is all zeros.
        let cleared = _mm256_maskz_xor_epi32(0, lhs, rhs);
        assert_eq_m256i(cleared, _mm256_setzero_si256());

        // Full mask: every element holds the XOR result.
        let xored = _mm256_maskz_xor_epi32(0b11111111, lhs, rhs);
        assert_eq_m256i(xored, _mm256_set1_epi32(1 << 2));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_xor_epi32() {
        // The shared bit 1 cancels, leaving only bit 2 in every element.
        let lhs = _mm_set1_epi32(1 << 1 | 1 << 2);
        let rhs = _mm_set1_epi32(1 << 1);
        let xored = _mm_xor_epi32(lhs, rhs);
        assert_eq_m128i(xored, _mm_set1_epi32(1 << 2));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_xor_epi32() {
        // Merge-masking: `lhs` is also passed as `src`.
        let lhs = _mm_set1_epi32(1 << 1 | 1 << 2);
        let rhs = _mm_set1_epi32(1 << 1);

        // Empty mask: the result is `src` unchanged.
        let untouched = _mm_mask_xor_epi32(lhs, 0, lhs, rhs);
        assert_eq_m128i(untouched, lhs);

        // All four low mask bits set: every element holds the XOR result.
        let xored = _mm_mask_xor_epi32(lhs, 0b00001111, lhs, rhs);
        assert_eq_m128i(xored, _mm_set1_epi32(1 << 2));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_xor_epi32() {
        // Zero-masking variant of the 128-bit 32-bit XOR.
        let lhs = _mm_set1_epi32(1 << 1 | 1 << 2);
        let rhs = _mm_set1_epi32(1 << 1);

        // Empty mask: the result is all zeros.
        let cleared = _mm_maskz_xor_epi32(0, lhs, rhs);
        assert_eq_m128i(cleared, _mm_setzero_si128());

        // All four low mask bits set: every element holds the XOR result.
        let xored = _mm_maskz_xor_epi32(0b00001111, lhs, rhs);
        assert_eq_m128i(xored, _mm_set1_epi32(1 << 2));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_xor_si512() {
        #[rustfmt::skip]
        let a = _mm512_set_epi32(
            1 << 1 | 1 << 2, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 1 | 1 << 3,
        );
        #[rustfmt::skip]
        let b = _mm512_set_epi32(
            1 << 1, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 3 | 1 << 4,
        );
        let r = _mm512_xor_epi32(a, b);
        #[rustfmt::skip]
        let e = _mm512_set_epi32(
            1 << 2, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 0,
            0, 0, 0, 1 << 1 | 1 << 4,
        );
        assert_eq_m512i(r, e);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_andnot_epi32() {
        // With an all-zero first operand the result is expected to equal the
        // second operand.
        let zeros = _mm512_set1_epi32(0);
        let rhs = _mm512_set1_epi32(1 << 3 | 1 << 4);
        let got = _mm512_andnot_epi32(zeros, rhs);
        assert_eq_m512i(got, _mm512_set1_epi32(1 << 3 | 1 << 4));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_andnot_epi32() {
        // Merge-masking: `lhs` is also passed as `src`. The operands share no
        // bits, so the and-not result equals `rhs`.
        let lhs = _mm512_set1_epi32(1 << 1 | 1 << 2);
        let rhs = _mm512_set1_epi32(1 << 3 | 1 << 4);

        // Empty mask: the result is `src` unchanged.
        let untouched = _mm512_mask_andnot_epi32(lhs, 0, lhs, rhs);
        assert_eq_m512i(untouched, lhs);

        // Full mask: every element holds the and-not result.
        let got = _mm512_mask_andnot_epi32(lhs, 0b11111111_11111111, lhs, rhs);
        assert_eq_m512i(got, _mm512_set1_epi32(1 << 3 | 1 << 4));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_andnot_epi32() {
        // Zero-masking variant; the operands share no bits, so unmasked
        // elements are expected to equal `rhs`.
        let lhs = _mm512_set1_epi32(1 << 1 | 1 << 2);
        let rhs = _mm512_set1_epi32(1 << 3 | 1 << 4);

        // Empty mask: the result is all zeros.
        let cleared = _mm512_maskz_andnot_epi32(0, lhs, rhs);
        assert_eq_m512i(cleared, _mm512_setzero_si512());

        // With only the low eight mask bits set, the first eight listed
        // elements are zeroed and the last eight hold the and-not result.
        let partial = _mm512_maskz_andnot_epi32(0b00000000_11111111, lhs, rhs);
        let kept = 1 << 3 | 1 << 4;
        #[rustfmt::skip]
        let want = _mm512_set_epi32(
            0, 0, 0, 0,
            0, 0, 0, 0,
            kept, kept, kept, kept,
            kept, kept, kept, kept,
        );
        assert_eq_m512i(partial, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_andnot_epi32() {
        // Merge-masking: `lhs` is also passed as `src`. The operands share no
        // bits, so the and-not result equals `rhs`.
        let lhs = _mm256_set1_epi32(1 << 1 | 1 << 2);
        let rhs = _mm256_set1_epi32(1 << 3 | 1 << 4);

        // Empty mask: the result is `src` unchanged.
        let untouched = _mm256_mask_andnot_epi32(lhs, 0, lhs, rhs);
        assert_eq_m256i(untouched, lhs);

        // Full mask: every element holds the and-not result.
        let got = _mm256_mask_andnot_epi32(lhs, 0b11111111, lhs, rhs);
        assert_eq_m256i(got, _mm256_set1_epi32(1 << 3 | 1 << 4));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_andnot_epi32() {
        // Zero-masking variant; the operands share no bits, so the and-not
        // result equals `rhs`.
        let lhs = _mm256_set1_epi32(1 << 1 | 1 << 2);
        let rhs = _mm256_set1_epi32(1 << 3 | 1 << 4);

        // Empty mask: the result is all zeros.
        let cleared = _mm256_maskz_andnot_epi32(0, lhs, rhs);
        assert_eq_m256i(cleared, _mm256_setzero_si256());

        // Full mask: every element holds the and-not result.
        let got = _mm256_maskz_andnot_epi32(0b11111111, lhs, rhs);
        assert_eq_m256i(got, _mm256_set1_epi32(1 << 3 | 1 << 4));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_andnot_epi32() {
        // Merge-masking: `lhs` is also passed as `src`. The operands share no
        // bits, so the and-not result equals `rhs`.
        let lhs = _mm_set1_epi32(1 << 1 | 1 << 2);
        let rhs = _mm_set1_epi32(1 << 3 | 1 << 4);

        // Empty mask: the result is `src` unchanged.
        let untouched = _mm_mask_andnot_epi32(lhs, 0, lhs, rhs);
        assert_eq_m128i(untouched, lhs);

        // All four low mask bits set: every element holds the and-not result.
        let got = _mm_mask_andnot_epi32(lhs, 0b00001111, lhs, rhs);
        assert_eq_m128i(got, _mm_set1_epi32(1 << 3 | 1 << 4));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_andnot_epi32() {
        // Zero-masking variant; the operands share no bits, so the and-not
        // result equals `rhs`.
        let lhs = _mm_set1_epi32(1 << 1 | 1 << 2);
        let rhs = _mm_set1_epi32(1 << 3 | 1 << 4);

        // Empty mask: the result is all zeros.
        let cleared = _mm_maskz_andnot_epi32(0, lhs, rhs);
        assert_eq_m128i(cleared, _mm_setzero_si128());

        // All four low mask bits set: every element holds the and-not result.
        let got = _mm_maskz_andnot_epi32(0b00001111, lhs, rhs);
        assert_eq_m128i(got, _mm_set1_epi32(1 << 3 | 1 << 4));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_kand() {
        // The operands deliberately differ: with identical inputs an AND is
        // indistinguishable from an OR or from returning either argument.
        let a: u16 = 0b11001100_00110011;
        let b: u16 = 0b00101110_00001011;
        let r = _mm512_kand(a, b);
        // Bitwise AND of `a` and `b`.
        let e: u16 = 0b00001100_00000011;
        assert_eq!(r, e);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_kand_mask16() {
        // The operands deliberately differ: with identical inputs an AND is
        // indistinguishable from an OR or from returning either argument.
        let a: u16 = 0b11001100_00110011;
        let b: u16 = 0b00101110_00001011;
        let r = _kand_mask16(a, b);
        // Bitwise AND of `a` and `b`.
        let e: u16 = 0b00001100_00000011;
        assert_eq!(r, e);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_kor() {
        // Expected value is the bitwise OR of the two 16-bit masks.
        let lhs: u16 = 0b11001100_00110011;
        let rhs: u16 = 0b00101110_00001011;
        let want: u16 = 0b11101110_00111011;
        let got = _mm512_kor(lhs, rhs);
        assert_eq!(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_kor_mask16() {
        // Expected value is the bitwise OR of the two 16-bit masks.
        let lhs: u16 = 0b11001100_00110011;
        let rhs: u16 = 0b00101110_00001011;
        let want: u16 = 0b11101110_00111011;
        let got = _kor_mask16(lhs, rhs);
        assert_eq!(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_kxor() {
        // Expected value is the bitwise XOR of the two 16-bit masks.
        let lhs: u16 = 0b11001100_00110011;
        let rhs: u16 = 0b00101110_00001011;
        let want: u16 = 0b11100010_00111000;
        let got = _mm512_kxor(lhs, rhs);
        assert_eq!(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_kxor_mask16() {
        // Expected value is the bitwise XOR of the two 16-bit masks.
        let lhs: u16 = 0b11001100_00110011;
        let rhs: u16 = 0b00101110_00001011;
        let want: u16 = 0b11100010_00111000;
        let got = _kxor_mask16(lhs, rhs);
        assert_eq!(got, want);
    }

    // `_mm512_knot` is expected to flip every bit of the 16-bit mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_knot() {
        let mask: u16 = 0b11001100_00110011;
        let want: u16 = 0b00110011_11001100;
        let inverted = _mm512_knot(mask);
        assert_eq!(inverted, want);
    }

    // `_knot_mask16` is expected to flip every bit of the 16-bit mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_knot_mask16() {
        let mask: u16 = 0b11001100_00110011;
        let want: u16 = 0b00110011_11001100;
        let inverted = _knot_mask16(mask);
        assert_eq!(inverted, want);
    }

    // `_mm512_kandn(a, b)` is expected to return `!a & b` for these 16-bit masks.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_kandn() {
        let lhs: u16 = 0b11001100_00110011;
        let rhs: u16 = 0b00101110_00001011;
        let want: u16 = 0b00100010_00001000;
        let got = _mm512_kandn(lhs, rhs);
        assert_eq!(got, want);
    }

    // `_kandn_mask16(a, b)` is expected to return `!a & b` for these 16-bit masks.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_kandn_mask16() {
        let lhs: u16 = 0b11001100_00110011;
        let rhs: u16 = 0b00101110_00001011;
        let want: u16 = 0b00100010_00001000;
        let got = _kandn_mask16(lhs, rhs);
        assert_eq!(got, want);
    }

    // `_mm512_kxnor` is expected to return the complement of the XOR of the two masks.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_kxnor() {
        let lhs: u16 = 0b11001100_00110011;
        let rhs: u16 = 0b00101110_00001011;
        let want: u16 = 0b00011101_11000111;
        let got = _mm512_kxnor(lhs, rhs);
        assert_eq!(got, want);
    }

    // `_kxnor_mask16` is expected to return the complement of the XOR of the two masks.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_kxnor_mask16() {
        let lhs: u16 = 0b11001100_00110011;
        let rhs: u16 = 0b00101110_00001011;
        let want: u16 = 0b00011101_11000111;
        let got = _kxnor_mask16(lhs, rhs);
        assert_eq!(got, want);
    }

    // `_mm512_kmov` is expected to hand back the mask it was given.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_kmov() {
        let mask: u16 = 0b11001100_00110011;
        let want: u16 = 0b11001100_00110011;
        let copied = _mm512_kmov(mask);
        assert_eq!(copied, want);
    }

    // `_mm512_int2mask` is expected to turn an `i32` bit pattern into the same 16-bit mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_int2mask() {
        let bits: i32 = 0b11001100_00110011;
        let want: u16 = 0b11001100_00110011;
        let mask = _mm512_int2mask(bits);
        assert_eq!(mask, want);
    }

    // `_mm512_mask2int` is expected to turn a 16-bit mask into an `i32` with the same bits.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask2int() {
        let mask: __mmask16 = 0b11001100_00110011;
        let want: i32 = 0b11001100_00110011;
        let bits = _mm512_mask2int(mask);
        assert_eq!(bits, want);
    }

    // Pins the value `_mm512_kunpackb` is expected to produce for these masks: the high
    // byte of the expectation matches `b`'s high byte and the low byte matches `a`'s low
    // byte.
    // NOTE(review): Intel describes this operation as combining the *low* bytes of both
    // operands (`k[7:0] = b[7:0]`, `k[15:8] = a[7:0]`) -- confirm this expectation against
    // the implementation and the Intrinsics Guide.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_kunpackb() {
        let lhs: u16 = 0b11001100_00110011;
        let rhs: u16 = 0b00101110_00001011;
        let want: u16 = 0b00101110_00110011;
        let got = _mm512_kunpackb(lhs, rhs);
        assert_eq!(got, want);
    }

    // `_mm512_kortestc` is expected to return 0 for the first pair of masks and 1 once
    // the second mask has all sixteen bits set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_kortestc() {
        let lhs: u16 = 0b11001100_00110011;
        let partial: u16 = 0b00101110_00001011;
        let not_all_ones = _mm512_kortestc(lhs, partial);
        assert_eq!(not_all_ones, 0);
        let full: u16 = 0b11111111_11111111;
        let all_ones = _mm512_kortestc(lhs, full);
        assert_eq!(all_ones, 1);
    }

    // Every lane of `a` and `b` shares bit 0, so `_mm512_test_epi32_mask` is expected to
    // set all sixteen mask bits.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_test_epi32_mask() {
        let lhs = _mm512_set1_epi32(1 << 0);
        let rhs = _mm512_set1_epi32(1 << 0 | 1 << 1);
        let want: __mmask16 = 0b11111111_11111111;
        let got = _mm512_test_epi32_mask(lhs, rhs);
        assert_eq!(got, want);
    }

    // `_mm512_mask_test_epi32_mask`: a zero writemask must give 0; a full writemask is
    // expected to report every lane, since all lanes of `a` and `b` share bit 0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_test_epi32_mask() {
        let lhs = _mm512_set1_epi32(1 << 0);
        let rhs = _mm512_set1_epi32(1 << 0 | 1 << 1);
        let masked_off = _mm512_mask_test_epi32_mask(0, lhs, rhs);
        assert_eq!(masked_off, 0);
        let got = _mm512_mask_test_epi32_mask(0b11111111_11111111, lhs, rhs);
        let want: __mmask16 = 0b11111111_11111111;
        assert_eq!(got, want);
    }

    // Every lane of `a` and `b` shares bit 0, so `_mm256_test_epi32_mask` is expected to
    // set all eight mask bits.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_test_epi32_mask() {
        let lhs = _mm256_set1_epi32(1 << 0);
        let rhs = _mm256_set1_epi32(1 << 0 | 1 << 1);
        let want: __mmask8 = 0b11111111;
        let got = _mm256_test_epi32_mask(lhs, rhs);
        assert_eq!(got, want);
    }

    // `_mm256_mask_test_epi32_mask`: a zero writemask must give 0; a full writemask is
    // expected to report all eight lanes.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_test_epi32_mask() {
        let lhs = _mm256_set1_epi32(1 << 0);
        let rhs = _mm256_set1_epi32(1 << 0 | 1 << 1);
        let masked_off = _mm256_mask_test_epi32_mask(0, lhs, rhs);
        assert_eq!(masked_off, 0);
        let got = _mm256_mask_test_epi32_mask(0b11111111, lhs, rhs);
        let want: __mmask8 = 0b11111111;
        assert_eq!(got, want);
    }

    // Every lane of `a` and `b` shares bit 0; `_mm_test_epi32_mask` is expected to set
    // only the low four mask bits.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_test_epi32_mask() {
        let lhs = _mm_set1_epi32(1 << 0);
        let rhs = _mm_set1_epi32(1 << 0 | 1 << 1);
        let want: __mmask8 = 0b00001111;
        let got = _mm_test_epi32_mask(lhs, rhs);
        assert_eq!(got, want);
    }

    // `_mm_mask_test_epi32_mask`: a zero writemask must give 0; with all eight writemask
    // bits set the result is still expected to contain only the low four bits.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_test_epi32_mask() {
        let lhs = _mm_set1_epi32(1 << 0);
        let rhs = _mm_set1_epi32(1 << 0 | 1 << 1);
        let masked_off = _mm_mask_test_epi32_mask(0, lhs, rhs);
        assert_eq!(masked_off, 0);
        let got = _mm_mask_test_epi32_mask(0b11111111, lhs, rhs);
        let want: __mmask8 = 0b00001111;
        assert_eq!(got, want);
    }

    // Every lane of `a` and `b` shares bit 0, so `_mm512_testn_epi32_mask` is expected to
    // return an empty mask.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_testn_epi32_mask() {
        let lhs = _mm512_set1_epi32(1 << 0);
        let rhs = _mm512_set1_epi32(1 << 0 | 1 << 1);
        let want: __mmask16 = 0b00000000_00000000;
        let got = _mm512_testn_epi32_mask(lhs, rhs);
        assert_eq!(got, want);
    }

    // `_mm512_mask_testn_epi32_mask`: a zero writemask must give 0; with a full writemask
    // every lane is expected to be reported, because `a` and `b` share no set bits.
    // Both assertions exercise the `testn` intrinsic itself.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_testn_epi32_mask() {
        let a = _mm512_set1_epi32(1 << 0);
        let b = _mm512_set1_epi32(1 << 1);
        let r = _mm512_mask_testn_epi32_mask(0, a, b);
        assert_eq!(r, 0);
        let r = _mm512_mask_testn_epi32_mask(0b11111111_11111111, a, b);
        let e: __mmask16 = 0b11111111_11111111;
        assert_eq!(r, e);
    }

    // `a` and `b` share no set bits, so `_mm256_testn_epi32_mask` is expected to set all
    // eight mask bits.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_testn_epi32_mask() {
        let lhs = _mm256_set1_epi32(1 << 0);
        let rhs = _mm256_set1_epi32(1 << 1);
        let want: __mmask8 = 0b11111111;
        let got = _mm256_testn_epi32_mask(lhs, rhs);
        assert_eq!(got, want);
    }

    // `_mm256_mask_testn_epi32_mask`: a zero writemask must give 0; with a full writemask
    // all eight lanes are expected to be reported, because `a` and `b` share no set bits.
    // Both assertions exercise the `testn` intrinsic itself.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_testn_epi32_mask() {
        let a = _mm256_set1_epi32(1 << 0);
        let b = _mm256_set1_epi32(1 << 1);
        let r = _mm256_mask_testn_epi32_mask(0, a, b);
        assert_eq!(r, 0);
        let r = _mm256_mask_testn_epi32_mask(0b11111111, a, b);
        let e: __mmask8 = 0b11111111;
        assert_eq!(r, e);
    }

    // `a` and `b` share no set bits; `_mm_testn_epi32_mask` is expected to set only the
    // low four mask bits.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_testn_epi32_mask() {
        let lhs = _mm_set1_epi32(1 << 0);
        let rhs = _mm_set1_epi32(1 << 1);
        let want: __mmask8 = 0b00001111;
        let got = _mm_testn_epi32_mask(lhs, rhs);
        assert_eq!(got, want);
    }

    // `_mm_mask_testn_epi32_mask`: a zero writemask must give 0; with all eight writemask
    // bits set the result is expected to contain only the low four bits.
    // Both assertions exercise the `testn` intrinsic itself.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_testn_epi32_mask() {
        let a = _mm_set1_epi32(1 << 0);
        let b = _mm_set1_epi32(1 << 1);
        let r = _mm_mask_testn_epi32_mask(0, a, b);
        assert_eq!(r, 0);
        let r = _mm_mask_testn_epi32_mask(0b11111111, a, b);
        let e: __mmask8 = 0b00001111;
        assert_eq!(r, e);
    }

    // Stores sixteen `7.0` lanes with `_mm512_stream_ps` and checks each element of the
    // destination buffer against the corresponding lane of the source vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_stream_ps() {
        // The 512-bit non-temporal store requires a 64-byte aligned destination; a
        // weaker alignment may raise a general-protection fault.
        #[repr(align(64))]
        struct Memory {
            pub data: [f32; 16], // 16 * 4 bytes = 64 bytes
        }
        let a = _mm512_set1_ps(7.0);
        // Start from a sentinel so an unwritten element would be detected.
        let mut mem = Memory { data: [-1.0; 16] };

        _mm512_stream_ps(&mut mem.data[0] as *mut f32, a);
        for i in 0..16 {
            assert_eq!(mem.data[i], get_m512(a, i));
        }
    }

    // Sixteen lanes of 1 are expected to sum to 16.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_reduce_add_epi32() {
        let ones = _mm512_set1_epi32(1);
        let sum: i32 = _mm512_reduce_add_epi32(ones);
        assert_eq!(16, sum);
    }

    // With only the upper eight mask bits set, the sum of lanes holding 1 is expected
    // to be 8.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_reduce_add_epi32() {
        let ones = _mm512_set1_epi32(1);
        let sum: i32 = _mm512_mask_reduce_add_epi32(0b11111111_00000000, ones);
        assert_eq!(8, sum);
    }

    // Sixteen lanes of 1.0 are expected to sum to 16.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_reduce_add_ps() {
        let ones = _mm512_set1_ps(1.);
        let sum: f32 = _mm512_reduce_add_ps(ones);
        assert_eq!(16., sum);
    }

    // With only the upper eight mask bits set, the sum of lanes holding 1.0 is expected
    // to be 8.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_reduce_add_ps() {
        let ones = _mm512_set1_ps(1.);
        let sum: f32 = _mm512_mask_reduce_add_ps(0b11111111_00000000, ones);
        assert_eq!(8., sum);
    }

    // Sixteen lanes of 2 are expected to multiply to 2^16 = 65536.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_reduce_mul_epi32() {
        let twos = _mm512_set1_epi32(2);
        let product: i32 = _mm512_reduce_mul_epi32(twos);
        assert_eq!(65536, product);
    }

    // With only the upper eight mask bits set, the product of lanes holding 2 is
    // expected to be 2^8 = 256.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_reduce_mul_epi32() {
        let twos = _mm512_set1_epi32(2);
        let product: i32 = _mm512_mask_reduce_mul_epi32(0b11111111_00000000, twos);
        assert_eq!(256, product);
    }

    // Sixteen lanes of 2.0 are expected to multiply to 65536.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_reduce_mul_ps() {
        let twos = _mm512_set1_ps(2.);
        let product: f32 = _mm512_reduce_mul_ps(twos);
        assert_eq!(65536., product);
    }

    // With only the upper eight mask bits set, the product of lanes holding 2.0 is
    // expected to be 256.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_reduce_mul_ps() {
        let twos = _mm512_set1_ps(2.);
        let product: f32 = _mm512_mask_reduce_mul_ps(0b11111111_00000000, twos);
        assert_eq!(256., product);
    }

    // The signed maximum over lanes holding 0..=15 is expected to be 15.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_reduce_max_epi32() {
        let lanes = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let max: i32 = _mm512_reduce_max_epi32(lanes);
        assert_eq!(15, max);
    }

    // With only the upper eight mask bits set, the signed maximum is expected to be 7.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_reduce_max_epi32() {
        let lanes = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let max: i32 = _mm512_mask_reduce_max_epi32(0b11111111_00000000, lanes);
        assert_eq!(7, max);
    }

    // The unsigned maximum over lanes holding 0..=15 is expected to be 15.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_reduce_max_epu32() {
        let lanes = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let max: u32 = _mm512_reduce_max_epu32(lanes);
        assert_eq!(15, max);
    }

    // With only the upper eight mask bits set, the unsigned maximum is expected to be 7.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_reduce_max_epu32() {
        let lanes = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let max: u32 = _mm512_mask_reduce_max_epu32(0b11111111_00000000, lanes);
        assert_eq!(7, max);
    }

    // The maximum over lanes holding 0.0..=15.0 is expected to be 15.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_reduce_max_ps() {
        let lanes = _mm512_set_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let max: f32 = _mm512_reduce_max_ps(lanes);
        assert_eq!(15., max);
    }

    // With only the upper eight mask bits set, the maximum is expected to be 7.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_reduce_max_ps() {
        let lanes = _mm512_set_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let max: f32 = _mm512_mask_reduce_max_ps(0b11111111_00000000, lanes);
        assert_eq!(7., max);
    }

    // The signed minimum over lanes holding 0..=15 is expected to be 0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_reduce_min_epi32() {
        let lanes = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let min: i32 = _mm512_reduce_min_epi32(lanes);
        assert_eq!(0, min);
    }

    // With only the upper eight mask bits set, the signed minimum is expected to be 0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_reduce_min_epi32() {
        let lanes = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let min: i32 = _mm512_mask_reduce_min_epi32(0b11111111_00000000, lanes);
        assert_eq!(0, min);
    }

    // The unsigned minimum over lanes holding 0..=15 is expected to be 0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_reduce_min_epu32() {
        let lanes = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let min: u32 = _mm512_reduce_min_epu32(lanes);
        assert_eq!(0, min);
    }

    // With only the upper eight mask bits set, the unsigned minimum is expected to be 0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_reduce_min_epu32() {
        let lanes = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let min: u32 = _mm512_mask_reduce_min_epu32(0b11111111_00000000, lanes);
        assert_eq!(0, min);
    }

    // The minimum over lanes holding 0.0..=15.0 is expected to be 0.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_reduce_min_ps() {
        let lanes = _mm512_set_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let min: f32 = _mm512_reduce_min_ps(lanes);
        assert_eq!(0., min);
    }

    // With only the upper eight mask bits set, the minimum is expected to be 0.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_reduce_min_ps() {
        let lanes = _mm512_set_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let min: f32 = _mm512_mask_reduce_min_ps(0b11111111_00000000, lanes);
        assert_eq!(0., min);
    }

    // Half the lanes hold 1 and half hold 2, so the AND of all lanes is expected to be 0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_reduce_and_epi32() {
        let lanes = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2);
        let folded: i32 = _mm512_reduce_and_epi32(lanes);
        assert_eq!(0, folded);
    }

    // With only the upper eight mask bits set, the masked AND is expected to be 1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_reduce_and_epi32() {
        let lanes = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2);
        let folded: i32 = _mm512_mask_reduce_and_epi32(0b11111111_00000000, lanes);
        assert_eq!(1, folded);
    }

    // Half the lanes hold 1 and half hold 2, so the OR of all lanes is expected to be 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_reduce_or_epi32() {
        let lanes = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2);
        let folded: i32 = _mm512_reduce_or_epi32(lanes);
        assert_eq!(3, folded);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_reduce_or_epi32() {
        let a = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2);
        let e: i32 = _mm512_mask_reduce_and_epi32(0b11111111_00000000, a);
        assert_eq!(1, e);
    }

    // `_mm512_mask_compress_epi32`: a zero mask must return `src`; with every other mask
    // bit set the expected vector is eight 200s followed by 1, 3, ..., 15 (in
    // `_mm512_set_epi32` argument order).
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_compress_epi32() {
        let fallback = _mm512_set1_epi32(200);
        let input = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let untouched = _mm512_mask_compress_epi32(fallback, 0, input);
        assert_eq_m512i(untouched, fallback);
        let packed = _mm512_mask_compress_epi32(fallback, 0b01010101_01010101, input);
        let want = _mm512_set_epi32(
            200, 200, 200, 200, 200, 200, 200, 200, 1, 3, 5, 7, 9, 11, 13, 15,
        );
        assert_eq_m512i(packed, want);
    }

    // `_mm512_maskz_compress_epi32`: a zero mask must return all zeros; with every other
    // mask bit set the expected vector is eight 0s followed by 1, 3, ..., 15.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_compress_epi32() {
        let input = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let zeroed = _mm512_maskz_compress_epi32(0, input);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());
        let packed = _mm512_maskz_compress_epi32(0b01010101_01010101, input);
        let want = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 5, 7, 9, 11, 13, 15);
        assert_eq_m512i(packed, want);
    }

    // `_mm256_mask_compress_epi32`: a zero mask must return `src`; with every other mask
    // bit set the expected vector is four 200s followed by 1, 3, 5, 7.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_compress_epi32() {
        let fallback = _mm256_set1_epi32(200);
        let input = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let untouched = _mm256_mask_compress_epi32(fallback, 0, input);
        assert_eq_m256i(untouched, fallback);
        let packed = _mm256_mask_compress_epi32(fallback, 0b01010101, input);
        let want = _mm256_set_epi32(200, 200, 200, 200, 1, 3, 5, 7);
        assert_eq_m256i(packed, want);
    }

    // `_mm256_maskz_compress_epi32`: a zero mask must return all zeros; with every other
    // mask bit set the expected vector is four 0s followed by 1, 3, 5, 7.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_compress_epi32() {
        let input = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let zeroed = _mm256_maskz_compress_epi32(0, input);
        assert_eq_m256i(zeroed, _mm256_setzero_si256());
        let packed = _mm256_maskz_compress_epi32(0b01010101, input);
        let want = _mm256_set_epi32(0, 0, 0, 0, 1, 3, 5, 7);
        assert_eq_m256i(packed, want);
    }

    // `_mm_mask_compress_epi32`: a zero mask must return `src`; with mask `0b0101` the
    // expected vector is two 200s followed by 1, 3.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_compress_epi32() {
        let fallback = _mm_set1_epi32(200);
        let input = _mm_set_epi32(0, 1, 2, 3);
        let untouched = _mm_mask_compress_epi32(fallback, 0, input);
        assert_eq_m128i(untouched, fallback);
        let packed = _mm_mask_compress_epi32(fallback, 0b00000101, input);
        let want = _mm_set_epi32(200, 200, 1, 3);
        assert_eq_m128i(packed, want);
    }

    // `_mm_maskz_compress_epi32`: a zero mask must return all zeros; with mask `0b0101`
    // the expected vector is two 0s followed by 1, 3.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_compress_epi32() {
        let input = _mm_set_epi32(0, 1, 2, 3);
        let zeroed = _mm_maskz_compress_epi32(0, input);
        assert_eq_m128i(zeroed, _mm_setzero_si128());
        let packed = _mm_maskz_compress_epi32(0b00000101, input);
        let want = _mm_set_epi32(0, 0, 1, 3);
        assert_eq_m128i(packed, want);
    }

    // `_mm512_mask_compress_ps`: a zero mask must return `src`; with every other mask bit
    // set the expected vector is eight 200.0s followed by 1.0, 3.0, ..., 15.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_compress_ps() {
        let fallback = _mm512_set1_ps(200.);
        let input = _mm512_set_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let untouched = _mm512_mask_compress_ps(fallback, 0, input);
        assert_eq_m512(untouched, fallback);
        let packed = _mm512_mask_compress_ps(fallback, 0b01010101_01010101, input);
        let want = _mm512_set_ps(
            200., 200., 200., 200., 200., 200., 200., 200., 1., 3., 5., 7., 9., 11., 13., 15.,
        );
        assert_eq_m512(packed, want);
    }

    // `_mm512_maskz_compress_ps`: a zero mask must return all zeros; with every other mask
    // bit set the expected vector is eight 0.0s followed by 1.0, 3.0, ..., 15.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_compress_ps() {
        let input = _mm512_set_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let zeroed = _mm512_maskz_compress_ps(0, input);
        assert_eq_m512(zeroed, _mm512_setzero_ps());
        let packed = _mm512_maskz_compress_ps(0b01010101_01010101, input);
        let want = _mm512_set_ps(
            0., 0., 0., 0., 0., 0., 0., 0., 1., 3., 5., 7., 9., 11., 13., 15.,
        );
        assert_eq_m512(packed, want);
    }

    // `_mm256_mask_compress_ps`: a zero mask must return `src`; with every other mask bit
    // set the expected vector is four 200.0s followed by 1.0, 3.0, 5.0, 7.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_compress_ps() {
        let fallback = _mm256_set1_ps(200.);
        let input = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let untouched = _mm256_mask_compress_ps(fallback, 0, input);
        assert_eq_m256(untouched, fallback);
        let packed = _mm256_mask_compress_ps(fallback, 0b01010101, input);
        let want = _mm256_set_ps(200., 200., 200., 200., 1., 3., 5., 7.);
        assert_eq_m256(packed, want);
    }

    // `_mm256_maskz_compress_ps`: a zero mask must return all zeros; with every other mask
    // bit set the expected vector is four 0.0s followed by 1.0, 3.0, 5.0, 7.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_compress_ps() {
        let input = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let zeroed = _mm256_maskz_compress_ps(0, input);
        assert_eq_m256(zeroed, _mm256_setzero_ps());
        let packed = _mm256_maskz_compress_ps(0b01010101, input);
        let want = _mm256_set_ps(0., 0., 0., 0., 1., 3., 5., 7.);
        assert_eq_m256(packed, want);
    }

    // `_mm_mask_compress_ps`: a zero mask must return `src`; with mask `0b0101` the
    // expected vector is two 200.0s followed by 1.0, 3.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_compress_ps() {
        let fallback = _mm_set1_ps(200.);
        let input = _mm_set_ps(0., 1., 2., 3.);
        let untouched = _mm_mask_compress_ps(fallback, 0, input);
        assert_eq_m128(untouched, fallback);
        let packed = _mm_mask_compress_ps(fallback, 0b00000101, input);
        let want = _mm_set_ps(200., 200., 1., 3.);
        assert_eq_m128(packed, want);
    }

    // `_mm_maskz_compress_ps`: a zero mask must return all zeros; with mask `0b0101` the
    // expected vector is two 0.0s followed by 1.0, 3.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_compress_ps() {
        let input = _mm_set_ps(0., 1., 2., 3.);
        let zeroed = _mm_maskz_compress_ps(0, input);
        assert_eq_m128(zeroed, _mm_setzero_ps());
        let packed = _mm_maskz_compress_ps(0b00000101, input);
        let want = _mm_set_ps(0., 0., 1., 3.);
        assert_eq_m128(packed, want);
    }

    // `_mm512_mask_compressstoreu_epi32`: a zero mask must leave the buffer untouched; a
    // non-zero mask is expected to write the selected values to the front of the buffer.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_compressstoreu_epi32() {
        let src = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let mut out = [0_i32; 16];
        _mm512_mask_compressstoreu_epi32(out.as_mut_ptr() as *mut _, 0, src);
        assert_eq!(&out, &[0_i32; 16]);
        _mm512_mask_compressstoreu_epi32(out.as_mut_ptr() as *mut _, 0b1111000011001010, src);
        assert_eq!(&out, &[2, 4, 7, 8, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 0, 0]);
    }

    // `_mm256_mask_compressstoreu_epi32`: a zero mask must leave the buffer untouched; a
    // non-zero mask is expected to write the selected values to the front of the buffer.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_compressstoreu_epi32() {
        let src = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
        let mut out = [0_i32; 8];
        _mm256_mask_compressstoreu_epi32(out.as_mut_ptr() as *mut _, 0, src);
        assert_eq!(&out, &[0_i32; 8]);
        _mm256_mask_compressstoreu_epi32(out.as_mut_ptr() as *mut _, 0b11001010, src);
        assert_eq!(&out, &[2, 4, 7, 8, 0, 0, 0, 0]);
    }

    // `_mm_mask_compressstoreu_epi32`: a zero mask must leave the buffer untouched; mask
    // `0b1011` is expected to store 1, 2, 4 at the front of the buffer.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_compressstoreu_epi32() {
        let src = _mm_setr_epi32(1, 2, 3, 4);
        let mut out = [0_i32; 4];
        _mm_mask_compressstoreu_epi32(out.as_mut_ptr() as *mut _, 0, src);
        assert_eq!(&out, &[0_i32; 4]);
        _mm_mask_compressstoreu_epi32(out.as_mut_ptr() as *mut _, 0b1011, src);
        assert_eq!(&out, &[1, 2, 4, 0]);
    }

    // `_mm512_mask_compressstoreu_epi64`: a zero mask must leave the buffer untouched; a
    // non-zero mask is expected to write the selected values to the front of the buffer.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_compressstoreu_epi64() {
        let src = _mm512_setr_epi64(1, 2, 3, 4, 5, 6, 7, 8);
        let mut out = [0_i64; 8];
        _mm512_mask_compressstoreu_epi64(out.as_mut_ptr() as *mut _, 0, src);
        assert_eq!(&out, &[0_i64; 8]);
        _mm512_mask_compressstoreu_epi64(out.as_mut_ptr() as *mut _, 0b11001010, src);
        assert_eq!(&out, &[2, 4, 7, 8, 0, 0, 0, 0]);
    }

    // `_mm256_mask_compressstoreu_epi64`: a zero mask must leave the buffer untouched;
    // mask `0b1011` is expected to store 1, 2, 4 at the front of the buffer.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_compressstoreu_epi64() {
        let src = _mm256_setr_epi64x(1, 2, 3, 4);
        let mut out = [0_i64; 4];
        _mm256_mask_compressstoreu_epi64(out.as_mut_ptr() as *mut _, 0, src);
        assert_eq!(&out, &[0_i64; 4]);
        _mm256_mask_compressstoreu_epi64(out.as_mut_ptr() as *mut _, 0b1011, src);
        assert_eq!(&out, &[1, 2, 4, 0]);
    }

    // `_mm_mask_compressstoreu_epi64`: a zero mask must leave the buffer untouched; mask
    // `0b10` is expected to store 2 at the front of the buffer.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_compressstoreu_epi64() {
        let src = _mm_setr_epi64x(1, 2);
        let mut out = [0_i64; 2];
        _mm_mask_compressstoreu_epi64(out.as_mut_ptr() as *mut _, 0, src);
        assert_eq!(&out, &[0_i64; 2]);
        _mm_mask_compressstoreu_epi64(out.as_mut_ptr() as *mut _, 0b10, src);
        assert_eq!(&out, &[2, 0]);
    }

    // `_mm512_mask_compressstoreu_ps`: a zero mask must leave the buffer untouched; a
    // non-zero mask is expected to write the selected values to the front of the buffer.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_compressstoreu_ps() {
        let src = _mm512_setr_ps(
            1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32, 9_f32, 10_f32, 11_f32, 12_f32,
            13_f32, 14_f32, 15_f32, 16_f32,
        );
        let mut out = [0_f32; 16];
        _mm512_mask_compressstoreu_ps(out.as_mut_ptr() as *mut _, 0, src);
        assert_eq!(&out, &[0_f32; 16]);
        _mm512_mask_compressstoreu_ps(out.as_mut_ptr() as *mut _, 0b1111000011001010, src);
        assert_eq!(
            &out,
            &[
                2_f32, 4_f32, 7_f32, 8_f32, 13_f32, 14_f32, 15_f32, 16_f32, 0_f32, 0_f32, 0_f32,
                0_f32, 0_f32, 0_f32, 0_f32, 0_f32
            ]
        );
    }

    // `_mm256_mask_compressstoreu_ps`: a zero mask must leave the buffer untouched; a
    // non-zero mask is expected to write the selected values to the front of the buffer.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_compressstoreu_ps() {
        let src = _mm256_setr_ps(1_f32, 2_f32, 3_f32, 4_f32, 5_f32, 6_f32, 7_f32, 8_f32);
        let mut out = [0_f32; 8];
        _mm256_mask_compressstoreu_ps(out.as_mut_ptr() as *mut _, 0, src);
        assert_eq!(&out, &[0_f32; 8]);
        _mm256_mask_compressstoreu_ps(out.as_mut_ptr() as *mut _, 0b11001010, src);
        assert_eq!(
            &out,
            &[2_f32, 4_f32, 7_f32, 8_f32, 0_f32, 0_f32, 0_f32, 0_f32]
        );
    }

    // `_mm_mask_compressstoreu_ps`: a zero mask must leave the buffer untouched; mask
    // `0b1011` is expected to store 1.0, 2.0, 4.0 at the front of the buffer.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_compressstoreu_ps() {
        let src = _mm_setr_ps(1_f32, 2_f32, 3_f32, 4_f32);
        let mut out = [0.; 4];
        _mm_mask_compressstoreu_ps(out.as_mut_ptr() as *mut _, 0, src);
        assert_eq!(&out, &[0.; 4]);
        _mm_mask_compressstoreu_ps(out.as_mut_ptr() as *mut _, 0b1011, src);
        assert_eq!(&out, &[1_f32, 2_f32, 4_f32, 0_f32]);
    }

    // `_mm512_mask_compressstoreu_pd`: a zero mask must leave the buffer untouched; a
    // non-zero mask is expected to write the selected values to the front of the buffer.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_compressstoreu_pd() {
        let src = _mm512_setr_pd(1., 2., 3., 4., 5., 6., 7., 8.);
        let mut out = [0.; 8];
        _mm512_mask_compressstoreu_pd(out.as_mut_ptr() as *mut _, 0, src);
        assert_eq!(&out, &[0.; 8]);
        _mm512_mask_compressstoreu_pd(out.as_mut_ptr() as *mut _, 0b11001010, src);
        assert_eq!(&out, &[2., 4., 7., 8., 0., 0., 0., 0.]);
    }

    // `_mm256_mask_compressstoreu_pd`: a zero mask must leave the buffer untouched; mask
    // `0b1011` is expected to store 1.0, 2.0, 4.0 at the front of the buffer.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_compressstoreu_pd() {
        let src = _mm256_setr_pd(1., 2., 3., 4.);
        let mut out = [0.; 4];
        _mm256_mask_compressstoreu_pd(out.as_mut_ptr() as *mut _, 0, src);
        assert_eq!(&out, &[0.; 4]);
        _mm256_mask_compressstoreu_pd(out.as_mut_ptr() as *mut _, 0b1011, src);
        assert_eq!(&out, &[1., 2., 4., 0.]);
    }

    // `_mm_mask_compressstoreu_pd`: a zero mask must leave the buffer untouched; mask
    // `0b10` is expected to store 2.0 at the front of the buffer.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_compressstoreu_pd() {
        let src = _mm_setr_pd(1., 2.);
        let mut out = [0.; 2];
        _mm_mask_compressstoreu_pd(out.as_mut_ptr() as *mut _, 0, src);
        assert_eq!(&out, &[0.; 2]);
        _mm_mask_compressstoreu_pd(out.as_mut_ptr() as *mut _, 0b10, src);
        assert_eq!(&out, &[2., 0.]);
    }

    // `_mm512_mask_expand_epi32`: a zero mask must return `src`; with every other mask
    // bit set the expected vector alternates 200 with 8, 9, ..., 15.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_expand_epi32() {
        let fallback = _mm512_set1_epi32(200);
        let input = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let untouched = _mm512_mask_expand_epi32(fallback, 0, input);
        assert_eq_m512i(untouched, fallback);
        let spread = _mm512_mask_expand_epi32(fallback, 0b01010101_01010101, input);
        let want = _mm512_set_epi32(
            200, 8, 200, 9, 200, 10, 200, 11, 200, 12, 200, 13, 200, 14, 200, 15,
        );
        assert_eq_m512i(spread, want);
    }

    // `_mm512_maskz_expand_epi32`: a zero mask must return all zeros; with every other
    // mask bit set the expected vector alternates 0 with 8, 9, ..., 15.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_expand_epi32() {
        let input = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
        let zeroed = _mm512_maskz_expand_epi32(0, input);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());
        let spread = _mm512_maskz_expand_epi32(0b01010101_01010101, input);
        let want = _mm512_set_epi32(0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15);
        assert_eq_m512i(spread, want);
    }

    // `_mm256_mask_expand_epi32`: a zero mask must return `src`; with every other mask
    // bit set the expected vector alternates 200 with 4, 5, 6, 7.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_expand_epi32() {
        let fallback = _mm256_set1_epi32(200);
        let input = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let untouched = _mm256_mask_expand_epi32(fallback, 0, input);
        assert_eq_m256i(untouched, fallback);
        let spread = _mm256_mask_expand_epi32(fallback, 0b01010101, input);
        let want = _mm256_set_epi32(200, 4, 200, 5, 200, 6, 200, 7);
        assert_eq_m256i(spread, want);
    }

    // `_mm256_maskz_expand_epi32`: a zero mask must return all zeros; with every other
    // mask bit set the expected vector alternates 0 with 4, 5, 6, 7.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_expand_epi32() {
        let input = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
        let zeroed = _mm256_maskz_expand_epi32(0, input);
        assert_eq_m256i(zeroed, _mm256_setzero_si256());
        let spread = _mm256_maskz_expand_epi32(0b01010101, input);
        let want = _mm256_set_epi32(0, 4, 0, 5, 0, 6, 0, 7);
        assert_eq_m256i(spread, want);
    }

    // `_mm_mask_expand_epi32`: a zero mask must return `src`; with mask `0b0101` the
    // expected vector is 200, 2, 200, 3.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_expand_epi32() {
        let fallback = _mm_set1_epi32(200);
        let input = _mm_set_epi32(0, 1, 2, 3);
        let untouched = _mm_mask_expand_epi32(fallback, 0, input);
        assert_eq_m128i(untouched, fallback);
        let spread = _mm_mask_expand_epi32(fallback, 0b00000101, input);
        let want = _mm_set_epi32(200, 2, 200, 3);
        assert_eq_m128i(spread, want);
    }

    // `_mm_maskz_expand_epi32`: a zero mask must return all zeros; with mask `0b0101` the
    // expected vector is 0, 2, 0, 3.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_expand_epi32() {
        let input = _mm_set_epi32(0, 1, 2, 3);
        let zeroed = _mm_maskz_expand_epi32(0, input);
        assert_eq_m128i(zeroed, _mm_setzero_si128());
        let spread = _mm_maskz_expand_epi32(0b00000101, input);
        let want = _mm_set_epi32(0, 2, 0, 3);
        assert_eq_m128i(spread, want);
    }

    // `_mm512_mask_expand_ps`: a zero mask must return `src`; with every other mask bit
    // set the expected vector alternates 200.0 with 8.0, 9.0, ..., 15.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_expand_ps() {
        let fallback = _mm512_set1_ps(200.);
        let input = _mm512_set_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let untouched = _mm512_mask_expand_ps(fallback, 0, input);
        assert_eq_m512(untouched, fallback);
        let spread = _mm512_mask_expand_ps(fallback, 0b01010101_01010101, input);
        let want = _mm512_set_ps(
            200., 8., 200., 9., 200., 10., 200., 11., 200., 12., 200., 13., 200., 14., 200., 15.,
        );
        assert_eq_m512(spread, want);
    }

    // `_mm512_maskz_expand_ps`: a zero mask must return all zeros; with every other mask
    // bit set the expected vector alternates 0.0 with 8.0, 9.0, ..., 15.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_expand_ps() {
        let input = _mm512_set_ps(
            0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.,
        );
        let zeroed = _mm512_maskz_expand_ps(0, input);
        assert_eq_m512(zeroed, _mm512_setzero_ps());
        let spread = _mm512_maskz_expand_ps(0b01010101_01010101, input);
        let want = _mm512_set_ps(
            0., 8., 0., 9., 0., 10., 0., 11., 0., 12., 0., 13., 0., 14., 0., 15.,
        );
        assert_eq_m512(spread, want);
    }

    // `_mm256_mask_expand_ps`: a zero mask must return `src`; with every other mask bit
    // set the expected vector alternates 200.0 with 4.0, 5.0, 6.0, 7.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_expand_ps() {
        let fallback = _mm256_set1_ps(200.);
        let input = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let untouched = _mm256_mask_expand_ps(fallback, 0, input);
        assert_eq_m256(untouched, fallback);
        let spread = _mm256_mask_expand_ps(fallback, 0b01010101, input);
        let want = _mm256_set_ps(200., 4., 200., 5., 200., 6., 200., 7.);
        assert_eq_m256(spread, want);
    }

    // `_mm256_maskz_expand_ps`: a zero mask must return all zeros; with every other mask
    // bit set the expected vector alternates 0.0 with 4.0, 5.0, 6.0, 7.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_expand_ps() {
        let input = _mm256_set_ps(0., 1., 2., 3., 4., 5., 6., 7.);
        let zeroed = _mm256_maskz_expand_ps(0, input);
        assert_eq_m256(zeroed, _mm256_setzero_ps());
        let spread = _mm256_maskz_expand_ps(0b01010101, input);
        let want = _mm256_set_ps(0., 4., 0., 5., 0., 6., 0., 7.);
        assert_eq_m256(spread, want);
    }

    // `_mm_mask_expand_ps`: a zero mask must return `src`; with mask `0b0101` the
    // expected vector is 200.0, 2.0, 200.0, 3.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_expand_ps() {
        let fallback = _mm_set1_ps(200.);
        let input = _mm_set_ps(0., 1., 2., 3.);
        let untouched = _mm_mask_expand_ps(fallback, 0, input);
        assert_eq_m128(untouched, fallback);
        let spread = _mm_mask_expand_ps(fallback, 0b00000101, input);
        let want = _mm_set_ps(200., 2., 200., 3.);
        assert_eq_m128(spread, want);
    }

    // `_mm_maskz_expand_ps`: a zero mask must return all zeros; with mask `0b0101` the
    // expected vector is 0.0, 2.0, 0.0, 3.0.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_expand_ps() {
        let input = _mm_set_ps(0., 1., 2., 3.);
        let zeroed = _mm_maskz_expand_ps(0, input);
        assert_eq_m128(zeroed, _mm_setzero_ps());
        let spread = _mm_maskz_expand_ps(0b00000101, input);
        let want = _mm_set_ps(0., 2., 0., 3.);
        assert_eq_m128(spread, want);
    }

    // Loads sixteen `i32` values through `_mm512_loadu_epi32` and compares them with the
    // same values built via `_mm512_setr_epi32`. The pointer goes through `black_box`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_loadu_epi32() {
        let data = &[4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50];
        let loaded = _mm512_loadu_epi32(black_box(data.as_ptr()));
        let want = _mm512_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50);
        assert_eq_m512i(loaded, want);
    }

    // Loads eight `i32` values through `_mm256_loadu_epi32` and compares them with the
    // same values built via `_mm256_setr_epi32`. The pointer goes through `black_box`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_loadu_epi32() {
        let data = &[4, 3, 2, 5, 8, 9, 64, 50];
        let loaded = _mm256_loadu_epi32(black_box(data.as_ptr()));
        let want = _mm256_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50);
        assert_eq_m256i(loaded, want);
    }

    // Loads four `i32` values through `_mm_loadu_epi32` and compares them with the same
    // values built via `_mm_setr_epi32`. The pointer goes through `black_box`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_loadu_epi32() {
        let data = &[4, 3, 2, 5];
        let loaded = _mm_loadu_epi32(black_box(data.as_ptr()));
        let want = _mm_setr_epi32(4, 3, 2, 5);
        assert_eq_m128i(loaded, want);
    }

    // Stores sixteen 32-bit lanes holding 9 as 16-bit values under a full mask; the
    // 256-bit destination is expected to equal sixteen 16-bit 9s.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvtepi32_storeu_epi16() {
        let a = _mm512_set1_epi32(9);
        let mut out = _mm256_undefined_si256();
        let dst = &mut out as *mut _ as *mut i8;
        _mm512_mask_cvtepi32_storeu_epi16(dst, 0b11111111_11111111, a);
        let want = _mm256_set1_epi16(9);
        assert_eq_m256i(out, want);
    }

    // Stores eight 32-bit lanes holding 9 as 16-bit values under a full mask; the
    // 128-bit destination is expected to equal eight 16-bit 9s.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cvtepi32_storeu_epi16() {
        let a = _mm256_set1_epi32(9);
        let mut out = _mm_undefined_si128();
        let dst = &mut out as *mut _ as *mut i8;
        _mm256_mask_cvtepi32_storeu_epi16(dst, 0b11111111, a);
        let want = _mm_set1_epi16(9);
        assert_eq_m128i(out, want);
    }

    // Stores four 32-bit lanes holding 9 as 16-bit values into a zero-initialised 128-bit
    // destination; the expected value is four 0s followed by four 9s (in `_mm_set_epi16`
    // argument order).
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cvtepi32_storeu_epi16() {
        let a = _mm_set1_epi32(9);
        let mut out = _mm_set1_epi8(0);
        let dst = &mut out as *mut _ as *mut i8;
        _mm_mask_cvtepi32_storeu_epi16(dst, 0b11111111, a);
        let want = _mm_set_epi16(0, 0, 0, 0, 9, 9, 9, 9);
        assert_eq_m128i(out, want);
    }

    // Stores sixteen lanes of `i32::MAX` under a full mask; every 16-bit element of the
    // destination is expected to be `i16::MAX`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvtsepi32_storeu_epi16() {
        let a = _mm512_set1_epi32(i32::MAX);
        let mut out = _mm256_undefined_si256();
        let dst = &mut out as *mut _ as *mut i8;
        _mm512_mask_cvtsepi32_storeu_epi16(dst, 0b11111111_11111111, a);
        let want = _mm256_set1_epi16(i16::MAX);
        assert_eq_m256i(out, want);
    }

    // Full mask: `i32::MAX` inputs are expected to be stored as `i16::MAX`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cvtsepi32_storeu_epi16() {
        let input = _mm256_set1_epi32(i32::MAX);
        let mut out = _mm_undefined_si128();
        let out_ptr = &mut out as *mut _ as *mut i8;
        _mm256_mask_cvtsepi32_storeu_epi16(out_ptr, 0b11111111, input);
        assert_eq_m128i(out, _mm_set1_epi16(i16::MAX));
    }

    // Stores into a zeroed buffer; expects four `i16::MAX` values and four untouched zeros.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cvtsepi32_storeu_epi16() {
        let input = _mm_set1_epi32(i32::MAX);
        let mut out = _mm_set1_epi8(0);
        let out_ptr = &mut out as *mut _ as *mut i8;
        _mm_mask_cvtsepi32_storeu_epi16(out_ptr, 0b11111111, input);
        let sat = i16::MAX;
        assert_eq_m128i(out, _mm_set_epi16(0, 0, 0, 0, sat, sat, sat, sat));
    }

    // Full mask: `i32::MAX` inputs are expected to be stored as the `u16::MAX` bit pattern.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvtusepi32_storeu_epi16() {
        let input = _mm512_set1_epi32(i32::MAX);
        let mut out = _mm256_undefined_si256();
        let out_ptr = &mut out as *mut _ as *mut i8;
        _mm512_mask_cvtusepi32_storeu_epi16(out_ptr, 0b11111111_11111111, input);
        assert_eq_m256i(out, _mm256_set1_epi16(u16::MAX as i16));
    }

    // Full mask: `i32::MAX` inputs are expected to be stored as the `u16::MAX` bit pattern.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cvtusepi32_storeu_epi16() {
        let input = _mm256_set1_epi32(i32::MAX);
        let mut out = _mm_undefined_si128();
        let out_ptr = &mut out as *mut _ as *mut i8;
        _mm256_mask_cvtusepi32_storeu_epi16(out_ptr, 0b11111111, input);
        assert_eq_m128i(out, _mm_set1_epi16(u16::MAX as i16));
    }

    // Stores into a zeroed buffer; expects four `u16::MAX` bit patterns and four untouched zeros.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cvtusepi32_storeu_epi16() {
        let input = _mm_set1_epi32(i32::MAX);
        let mut out = _mm_set1_epi8(0);
        let out_ptr = &mut out as *mut _ as *mut i8;
        _mm_mask_cvtusepi32_storeu_epi16(out_ptr, 0b11111111, input);
        // Saturated unsigned result, reinterpreted as i16 for the constructor.
        let sat = u16::MAX as i16;
        assert_eq_m128i(out, _mm_set_epi16(0, 0, 0, 0, sat, sat, sat, sat));
    }

    // Full mask: a vector of 32-bit 9s must be stored as `_mm_set1_epi8(9)`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvtepi32_storeu_epi8() {
        let input = _mm512_set1_epi32(9);
        let mut out = _mm_undefined_si128();
        let out_ptr = &mut out as *mut _ as *mut i8;
        _mm512_mask_cvtepi32_storeu_epi8(out_ptr, 0b11111111_11111111, input);
        assert_eq_m128i(out, _mm_set1_epi8(9));
    }

    // Stores into a zeroed buffer; expects eight 8-bit 9s and eight untouched zeros.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cvtepi32_storeu_epi8() {
        let input = _mm256_set1_epi32(9);
        let mut out = _mm_set1_epi8(0);
        let out_ptr = &mut out as *mut _ as *mut i8;
        _mm256_mask_cvtepi32_storeu_epi8(out_ptr, 0b11111111, input);
        let want = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9);
        assert_eq_m128i(out, want);
    }

    // Stores into a zeroed buffer; expects four 8-bit 9s and twelve untouched zeros.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cvtepi32_storeu_epi8() {
        let input = _mm_set1_epi32(9);
        let mut out = _mm_set1_epi8(0);
        let out_ptr = &mut out as *mut _ as *mut i8;
        _mm_mask_cvtepi32_storeu_epi8(out_ptr, 0b11111111, input);
        let want = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9);
        assert_eq_m128i(out, want);
    }

    // Full mask: `i32::MAX` inputs are expected to be stored as `i8::MAX`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvtsepi32_storeu_epi8() {
        let input = _mm512_set1_epi32(i32::MAX);
        let mut out = _mm_undefined_si128();
        let out_ptr = &mut out as *mut _ as *mut i8;
        _mm512_mask_cvtsepi32_storeu_epi8(out_ptr, 0b11111111_11111111, input);
        assert_eq_m128i(out, _mm_set1_epi8(i8::MAX));
    }

    // Stores into a zeroed buffer; expects eight `i8::MAX` values and eight untouched zeros.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cvtsepi32_storeu_epi8() {
        let input = _mm256_set1_epi32(i32::MAX);
        let mut out = _mm_set1_epi8(0);
        let out_ptr = &mut out as *mut _ as *mut i8;
        _mm256_mask_cvtsepi32_storeu_epi8(out_ptr, 0b11111111, input);
        let sat = i8::MAX;
        #[rustfmt::skip]
        let want = _mm_set_epi8(
            0, 0, 0, 0, 0, 0, 0, 0,
            sat, sat, sat, sat, sat, sat, sat, sat,
        );
        assert_eq_m128i(out, want);
    }

    // Stores into a zeroed buffer; expects four `i8::MAX` values and twelve untouched zeros.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cvtsepi32_storeu_epi8() {
        let input = _mm_set1_epi32(i32::MAX);
        let mut out = _mm_set1_epi8(0);
        let out_ptr = &mut out as *mut _ as *mut i8;
        _mm_mask_cvtsepi32_storeu_epi8(out_ptr, 0b11111111, input);
        let sat = i8::MAX;
        #[rustfmt::skip]
        let want = _mm_set_epi8(
            0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, sat, sat, sat, sat,
        );
        assert_eq_m128i(out, want);
    }

    // Full mask: `i32::MAX` inputs are expected to be stored as the `u8::MAX` bit pattern.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_cvtusepi32_storeu_epi8() {
        let input = _mm512_set1_epi32(i32::MAX);
        let mut out = _mm_undefined_si128();
        let out_ptr = &mut out as *mut _ as *mut i8;
        _mm512_mask_cvtusepi32_storeu_epi8(out_ptr, 0b11111111_11111111, input);
        assert_eq_m128i(out, _mm_set1_epi8(u8::MAX as i8));
    }

    // Stores into a zeroed buffer; expects eight `u8::MAX` bit patterns and eight untouched zeros.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_cvtusepi32_storeu_epi8() {
        let input = _mm256_set1_epi32(i32::MAX);
        let mut out = _mm_set1_epi8(0);
        let out_ptr = &mut out as *mut _ as *mut i8;
        _mm256_mask_cvtusepi32_storeu_epi8(out_ptr, 0b11111111, input);
        // Saturated unsigned result, reinterpreted as i8 for the constructor.
        let sat = u8::MAX as i8;
        #[rustfmt::skip]
        let want = _mm_set_epi8(
            0, 0, 0, 0, 0, 0, 0, 0,
            sat, sat, sat, sat, sat, sat, sat, sat,
        );
        assert_eq_m128i(out, want);
    }

    // Stores into a zeroed buffer; expects four `u8::MAX` bit patterns and twelve untouched zeros.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_cvtusepi32_storeu_epi8() {
        let input = _mm_set1_epi32(i32::MAX);
        let mut out = _mm_set1_epi8(0);
        let out_ptr = &mut out as *mut _ as *mut i8;
        _mm_mask_cvtusepi32_storeu_epi8(out_ptr, 0b11111111, input);
        // Saturated unsigned result, reinterpreted as i8 for the constructor.
        let sat = u8::MAX as i8;
        #[rustfmt::skip]
        let want = _mm_set_epi8(
            0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, sat, sat, sat, sat,
        );
        assert_eq_m128i(out, want);
    }

    // Writes a vector of 9s through `_mm512_storeu_epi32` and expects the destination to equal it.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_storeu_epi32() {
        let nines = _mm512_set1_epi32(9);
        let mut dest = _mm512_undefined_epi32();
        let dest_ptr = &mut dest as *mut _ as *mut i32;
        _mm512_storeu_epi32(dest_ptr, nines);
        assert_eq_m512i(dest, nines);
    }

    // Writes a vector of 9s through `_mm256_storeu_epi32` and expects the destination to equal it.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_storeu_epi32() {
        let nines = _mm256_set1_epi32(9);
        let mut dest = _mm256_undefined_si256();
        let dest_ptr = &mut dest as *mut _ as *mut i32;
        _mm256_storeu_epi32(dest_ptr, nines);
        assert_eq_m256i(dest, nines);
    }

    // Writes a vector of 9s through `_mm_storeu_epi32` and expects the destination to equal it.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_storeu_epi32() {
        let nines = _mm_set1_epi32(9);
        let mut dest = _mm_undefined_si128();
        let dest_ptr = &mut dest as *mut _ as *mut i32;
        _mm_storeu_epi32(dest_ptr, nines);
        assert_eq_m128i(dest, nines);
    }

    // Loads sixteen i32 values through `_mm512_loadu_si512` and expects them in `setr` order.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_loadu_si512() {
        let values = &[4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50];
        // `black_box` hides the pointer from the optimizer.
        let loaded = _mm512_loadu_si512(black_box(values.as_ptr()));
        let want = _mm512_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50);
        assert_eq_m512i(loaded, want);
    }

    // Writes a vector of 9s through `_mm512_storeu_si512` and expects the destination to equal it.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_storeu_si512() {
        let nines = _mm512_set1_epi32(9);
        let mut dest = _mm512_undefined_epi32();
        let dest_ptr = &mut dest as *mut _ as *mut i32;
        _mm512_storeu_si512(dest_ptr, nines);
        assert_eq_m512i(dest, nines);
    }

    // Loads sixteen i32 values from a 64-byte-aligned wrapper through `_mm512_load_si512`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_load_si512() {
        // Wrapper that forces 64-byte alignment on the 64-byte array.
        #[repr(align(64))]
        struct Aligned64 {
            data: [i32; 16],
        }
        let storage = Aligned64 {
            data: [4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50],
        };
        let loaded = _mm512_load_si512(black_box(storage.data.as_ptr()));
        let want = _mm512_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50);
        assert_eq_m512i(loaded, want);
    }

    // Writes a vector of 9s through `_mm512_store_si512` and expects the destination to equal it.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_store_si512() {
        let nines = _mm512_set1_epi32(9);
        let mut dest = _mm512_undefined_epi32();
        let dest_ptr = &mut dest as *mut _ as *mut i32;
        _mm512_store_si512(dest_ptr, nines);
        assert_eq_m512i(dest, nines);
    }

    // Loads sixteen i32 values from a 64-byte-aligned wrapper through `_mm512_load_epi32`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_load_epi32() {
        // Wrapper that forces 64-byte alignment on the 64-byte array.
        #[repr(align(64))]
        struct Aligned64 {
            data: [i32; 16],
        }
        let storage = Aligned64 {
            data: [4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50],
        };
        let loaded = _mm512_load_epi32(black_box(storage.data.as_ptr()));
        let want = _mm512_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50, -4, -3, -2, -5, -8, -9, -64, -50);
        assert_eq_m512i(loaded, want);
    }

    // Loads eight i32 values from a 64-byte-aligned wrapper through `_mm256_load_epi32`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_load_epi32() {
        // Wrapper that forces 64-byte alignment on the array.
        #[repr(align(64))]
        struct Aligned64 {
            data: [i32; 8],
        }
        let storage = Aligned64 {
            data: [4, 3, 2, 5, 8, 9, 64, 50],
        };
        let loaded = _mm256_load_epi32(black_box(storage.data.as_ptr()));
        assert_eq_m256i(loaded, _mm256_setr_epi32(4, 3, 2, 5, 8, 9, 64, 50));
    }

    // Loads four i32 values from a 64-byte-aligned wrapper through `_mm_load_epi32`.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_load_epi32() {
        // Wrapper that forces 64-byte alignment on the array.
        #[repr(align(64))]
        struct Aligned64 {
            data: [i32; 4],
        }
        let storage = Aligned64 { data: [4, 3, 2, 5] };
        let loaded = _mm_load_epi32(black_box(storage.data.as_ptr()));
        assert_eq_m128i(loaded, _mm_setr_epi32(4, 3, 2, 5));
    }

    // Writes a vector of 9s through `_mm512_store_epi32` and expects the destination to equal it.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_store_epi32() {
        let nines = _mm512_set1_epi32(9);
        let mut dest = _mm512_undefined_epi32();
        let dest_ptr = &mut dest as *mut _ as *mut i32;
        _mm512_store_epi32(dest_ptr, nines);
        assert_eq_m512i(dest, nines);
    }

    // Writes a vector of 9s through `_mm256_store_epi32` and expects the destination to equal it.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_store_epi32() {
        let nines = _mm256_set1_epi32(9);
        let mut dest = _mm256_undefined_si256();
        let dest_ptr = &mut dest as *mut _ as *mut i32;
        _mm256_store_epi32(dest_ptr, nines);
        assert_eq_m256i(dest, nines);
    }

    // Writes a vector of 9s through `_mm_store_epi32` and expects the destination to equal it.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_store_epi32() {
        let nines = _mm_set1_epi32(9);
        let mut dest = _mm_undefined_si128();
        let dest_ptr = &mut dest as *mut _ as *mut i32;
        _mm_store_epi32(dest_ptr, nines);
        assert_eq_m128i(dest, nines);
    }

    // Loads sixteen f32 values from a 64-byte-aligned wrapper through `_mm512_load_ps`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_load_ps() {
        // Wrapper that forces 64-byte alignment on the 64-byte array.
        #[repr(align(64))]
        struct Aligned64 {
            data: [f32; 16],
        }
        let storage = Aligned64 {
            data: [
                4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50.,
            ],
        };
        let loaded = _mm512_load_ps(black_box(storage.data.as_ptr()));
        let want = _mm512_setr_ps(
            4., 3., 2., 5., 8., 9., 64., 50., -4., -3., -2., -5., -8., -9., -64., -50.,
        );
        assert_eq_m512(loaded, want);
    }

    // Writes a vector of 9.0s through `_mm512_store_ps` and expects the destination to equal it.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_store_ps() {
        let nines = _mm512_set1_ps(9.);
        let mut dest = _mm512_undefined_ps();
        let dest_ptr = &mut dest as *mut _ as *mut f32;
        _mm512_store_ps(dest_ptr, nines);
        assert_eq_m512(dest, nines);
    }

    // Checks `_mm512_mask_set1_epi32` with an empty mask and with all sixteen mask bits set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_set1_epi32() {
        let fallback = _mm512_set1_epi32(2);
        let value: i32 = 11;

        // Empty mask: the result must equal `fallback`.
        let untouched = _mm512_mask_set1_epi32(fallback, 0, value);
        assert_eq_m512i(untouched, fallback);

        // Full mask: the result must be a broadcast of 11.
        let broadcast = _mm512_mask_set1_epi32(fallback, 0b11111111_11111111, value);
        assert_eq_m512i(broadcast, _mm512_set1_epi32(11));
    }

    // Checks `_mm512_maskz_set1_epi32` with an empty mask and with all sixteen mask bits set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_set1_epi32() {
        let value: i32 = 11;

        // Empty mask: the result must be all zeros.
        let zeroed = _mm512_maskz_set1_epi32(0, value);
        assert_eq_m512i(zeroed, _mm512_setzero_si512());

        // Full mask: the result must be a broadcast of 11.
        let broadcast = _mm512_maskz_set1_epi32(0b11111111_11111111, value);
        assert_eq_m512i(broadcast, _mm512_set1_epi32(11));
    }

    // Checks `_mm256_mask_set1_epi32` with an empty mask and with all eight mask bits set.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_set1_epi32() {
        let fallback = _mm256_set1_epi32(2);
        let value: i32 = 11;

        // Empty mask: the result must equal `fallback`.
        let untouched = _mm256_mask_set1_epi32(fallback, 0, value);
        assert_eq_m256i(untouched, fallback);

        // Full mask: the result must be a broadcast of 11.
        let broadcast = _mm256_mask_set1_epi32(fallback, 0b11111111, value);
        assert_eq_m256i(broadcast, _mm256_set1_epi32(11));
    }

    // Checks `_mm256_maskz_set1_epi32` with an empty mask and with all eight mask bits set.
    //
    // Intel's intrinsics guide lists AVX512VL in addition to AVX512F for this 256-bit
    // intrinsic, so both features are enabled here (as `test_mm256_mask_set1_epi32` already
    // does). With only `avx512f` enabled the test could run on a CPU without AVX512VL.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_set1_epi32() {
        let a: i32 = 11;
        // Empty mask: the result must be all zeros.
        let r = _mm256_maskz_set1_epi32(0, a);
        assert_eq_m256i(r, _mm256_setzero_si256());
        // Full mask: the result must be a broadcast of 11.
        let r = _mm256_maskz_set1_epi32(0b11111111, a);
        let e = _mm256_set1_epi32(11);
        assert_eq_m256i(r, e);
    }

    // Checks `_mm_mask_set1_epi32` with an empty mask and with the low four mask bits set.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_set1_epi32() {
        let fallback = _mm_set1_epi32(2);
        let value: i32 = 11;

        // Empty mask: the result must equal `fallback`.
        let untouched = _mm_mask_set1_epi32(fallback, 0, value);
        assert_eq_m128i(untouched, fallback);

        // Mask 0b1111: the result must be a broadcast of 11.
        let broadcast = _mm_mask_set1_epi32(fallback, 0b00001111, value);
        assert_eq_m128i(broadcast, _mm_set1_epi32(11));
    }

    // Checks `_mm_maskz_set1_epi32` with an empty mask and with the low four mask bits set.
    //
    // Intel's intrinsics guide lists AVX512VL in addition to AVX512F for this 128-bit
    // intrinsic, so both features are enabled here (as `test_mm_mask_set1_epi32` already
    // does). With only `avx512f` enabled the test could run on a CPU without AVX512VL.
    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_set1_epi32() {
        let a: i32 = 11;
        // Empty mask: the result must be all zeros.
        let r = _mm_maskz_set1_epi32(0, a);
        assert_eq_m128i(r, _mm_setzero_si128());
        // Mask 0b1111: the result must be a broadcast of 11.
        let r = _mm_maskz_set1_epi32(0b00001111, a);
        let e = _mm_set1_epi32(11);
        assert_eq_m128i(r, e);
    }

    // Checks `_mm_mask_move_ss` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_move_ss() {
        let fallback = _mm_set_ps(10., 11., 100., 110.);
        let lhs = _mm_set_ps(1., 2., 10., 20.);
        let rhs = _mm_set_ps(3., 4., 30., 40.);

        // Mask 0: the last-listed element is expected to come from `fallback` (110).
        let masked_off = _mm_mask_move_ss(fallback, 0, lhs, rhs);
        assert_eq_m128(masked_off, _mm_set_ps(1., 2., 10., 110.));

        // Mask set: the last-listed element is expected to come from `rhs` (40).
        let masked_on = _mm_mask_move_ss(fallback, 0b11111111, lhs, rhs);
        assert_eq_m128(masked_on, _mm_set_ps(1., 2., 10., 40.));
    }

    // Checks `_mm_maskz_move_ss` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_move_ss() {
        let lhs = _mm_set_ps(1., 2., 10., 20.);
        let rhs = _mm_set_ps(3., 4., 30., 40.);

        // Mask 0: the last-listed element is expected to be zero.
        let masked_off = _mm_maskz_move_ss(0, lhs, rhs);
        assert_eq_m128(masked_off, _mm_set_ps(1., 2., 10., 0.));

        // Mask set: the last-listed element is expected to come from `rhs` (40).
        let masked_on = _mm_maskz_move_ss(0b11111111, lhs, rhs);
        assert_eq_m128(masked_on, _mm_set_ps(1., 2., 10., 40.));
    }

    // Checks `_mm_mask_move_sd` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_move_sd() {
        let fallback = _mm_set_pd(10., 11.);
        let lhs = _mm_set_pd(1., 2.);
        let rhs = _mm_set_pd(3., 4.);

        // Mask 0: the last-listed element is expected to come from `fallback` (11).
        let masked_off = _mm_mask_move_sd(fallback, 0, lhs, rhs);
        assert_eq_m128d(masked_off, _mm_set_pd(1., 11.));

        // Mask set: the last-listed element is expected to come from `rhs` (4).
        let masked_on = _mm_mask_move_sd(fallback, 0b11111111, lhs, rhs);
        assert_eq_m128d(masked_on, _mm_set_pd(1., 4.));
    }

    // Checks `_mm_maskz_move_sd` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_move_sd() {
        let lhs = _mm_set_pd(1., 2.);
        let rhs = _mm_set_pd(3., 4.);

        // Mask 0: the last-listed element is expected to be zero.
        let masked_off = _mm_maskz_move_sd(0, lhs, rhs);
        assert_eq_m128d(masked_off, _mm_set_pd(1., 0.));

        // Mask set: the last-listed element is expected to come from `rhs` (4).
        let masked_on = _mm_maskz_move_sd(0b11111111, lhs, rhs);
        assert_eq_m128d(masked_on, _mm_set_pd(1., 4.));
    }

    // Checks `_mm_mask_add_ss` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_add_ss() {
        let fallback = _mm_set_ps(10., 11., 100., 110.);
        let lhs = _mm_set_ps(1., 2., 10., 20.);
        let rhs = _mm_set_ps(3., 4., 30., 40.);

        // Mask 0: the last-listed element is expected to come from `fallback` (110).
        let masked_off = _mm_mask_add_ss(fallback, 0, lhs, rhs);
        assert_eq_m128(masked_off, _mm_set_ps(1., 2., 10., 110.));

        // Mask set: the last-listed element is expected to be 20 + 40.
        let masked_on = _mm_mask_add_ss(fallback, 0b11111111, lhs, rhs);
        assert_eq_m128(masked_on, _mm_set_ps(1., 2., 10., 60.));
    }

    // Checks `_mm_maskz_add_ss` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_add_ss() {
        let lhs = _mm_set_ps(1., 2., 10., 20.);
        let rhs = _mm_set_ps(3., 4., 30., 40.);

        // Mask 0: the last-listed element is expected to be zero.
        let masked_off = _mm_maskz_add_ss(0, lhs, rhs);
        assert_eq_m128(masked_off, _mm_set_ps(1., 2., 10., 0.));

        // Mask set: the last-listed element is expected to be 20 + 40.
        let masked_on = _mm_maskz_add_ss(0b11111111, lhs, rhs);
        assert_eq_m128(masked_on, _mm_set_ps(1., 2., 10., 60.));
    }

    // Checks `_mm_mask_add_sd` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_add_sd() {
        let fallback = _mm_set_pd(10., 11.);
        let lhs = _mm_set_pd(1., 2.);
        let rhs = _mm_set_pd(3., 4.);

        // Mask 0: the last-listed element is expected to come from `fallback` (11).
        let masked_off = _mm_mask_add_sd(fallback, 0, lhs, rhs);
        assert_eq_m128d(masked_off, _mm_set_pd(1., 11.));

        // Mask set: the last-listed element is expected to be 2 + 4.
        let masked_on = _mm_mask_add_sd(fallback, 0b11111111, lhs, rhs);
        assert_eq_m128d(masked_on, _mm_set_pd(1., 6.));
    }

    // Checks `_mm_maskz_add_sd` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_add_sd() {
        let lhs = _mm_set_pd(1., 2.);
        let rhs = _mm_set_pd(3., 4.);

        // Mask 0: the last-listed element is expected to be zero.
        let masked_off = _mm_maskz_add_sd(0, lhs, rhs);
        assert_eq_m128d(masked_off, _mm_set_pd(1., 0.));

        // Mask set: the last-listed element is expected to be 2 + 4.
        let masked_on = _mm_maskz_add_sd(0b11111111, lhs, rhs);
        assert_eq_m128d(masked_on, _mm_set_pd(1., 6.));
    }

    // Checks `_mm_mask_sub_ss` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_sub_ss() {
        let fallback = _mm_set_ps(10., 11., 100., 110.);
        let lhs = _mm_set_ps(1., 2., 10., 20.);
        let rhs = _mm_set_ps(3., 4., 30., 40.);

        // Mask 0: the last-listed element is expected to come from `fallback` (110).
        let masked_off = _mm_mask_sub_ss(fallback, 0, lhs, rhs);
        assert_eq_m128(masked_off, _mm_set_ps(1., 2., 10., 110.));

        // Mask set: the last-listed element is expected to be 20 - 40.
        let masked_on = _mm_mask_sub_ss(fallback, 0b11111111, lhs, rhs);
        assert_eq_m128(masked_on, _mm_set_ps(1., 2., 10., -20.));
    }

    // Checks `_mm_maskz_sub_ss` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_sub_ss() {
        let lhs = _mm_set_ps(1., 2., 10., 20.);
        let rhs = _mm_set_ps(3., 4., 30., 40.);

        // Mask 0: the last-listed element is expected to be zero.
        let masked_off = _mm_maskz_sub_ss(0, lhs, rhs);
        assert_eq_m128(masked_off, _mm_set_ps(1., 2., 10., 0.));

        // Mask set: the last-listed element is expected to be 20 - 40.
        let masked_on = _mm_maskz_sub_ss(0b11111111, lhs, rhs);
        assert_eq_m128(masked_on, _mm_set_ps(1., 2., 10., -20.));
    }

    // Checks `_mm_mask_sub_sd` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_sub_sd() {
        let fallback = _mm_set_pd(10., 11.);
        let lhs = _mm_set_pd(1., 2.);
        let rhs = _mm_set_pd(3., 4.);

        // Mask 0: the last-listed element is expected to come from `fallback` (11).
        let masked_off = _mm_mask_sub_sd(fallback, 0, lhs, rhs);
        assert_eq_m128d(masked_off, _mm_set_pd(1., 11.));

        // Mask set: the last-listed element is expected to be 2 - 4.
        let masked_on = _mm_mask_sub_sd(fallback, 0b11111111, lhs, rhs);
        assert_eq_m128d(masked_on, _mm_set_pd(1., -2.));
    }

    // Checks `_mm_maskz_sub_sd` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_sub_sd() {
        let lhs = _mm_set_pd(1., 2.);
        let rhs = _mm_set_pd(3., 4.);

        // Mask 0: the last-listed element is expected to be zero.
        let masked_off = _mm_maskz_sub_sd(0, lhs, rhs);
        assert_eq_m128d(masked_off, _mm_set_pd(1., 0.));

        // Mask set: the last-listed element is expected to be 2 - 4.
        let masked_on = _mm_maskz_sub_sd(0b11111111, lhs, rhs);
        assert_eq_m128d(masked_on, _mm_set_pd(1., -2.));
    }

    // Checks `_mm_mask_mul_ss` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_mul_ss() {
        let fallback = _mm_set_ps(10., 11., 100., 110.);
        let lhs = _mm_set_ps(1., 2., 10., 20.);
        let rhs = _mm_set_ps(3., 4., 30., 40.);

        // Mask 0: the last-listed element is expected to come from `fallback` (110).
        let masked_off = _mm_mask_mul_ss(fallback, 0, lhs, rhs);
        assert_eq_m128(masked_off, _mm_set_ps(1., 2., 10., 110.));

        // Mask set: the last-listed element is expected to be 20 * 40.
        let masked_on = _mm_mask_mul_ss(fallback, 0b11111111, lhs, rhs);
        assert_eq_m128(masked_on, _mm_set_ps(1., 2., 10., 800.));
    }

    // Checks `_mm_maskz_mul_ss` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_mul_ss() {
        let lhs = _mm_set_ps(1., 2., 10., 20.);
        let rhs = _mm_set_ps(3., 4., 30., 40.);

        // Mask 0: the last-listed element is expected to be zero.
        let masked_off = _mm_maskz_mul_ss(0, lhs, rhs);
        assert_eq_m128(masked_off, _mm_set_ps(1., 2., 10., 0.));

        // Mask set: the last-listed element is expected to be 20 * 40.
        let masked_on = _mm_maskz_mul_ss(0b11111111, lhs, rhs);
        assert_eq_m128(masked_on, _mm_set_ps(1., 2., 10., 800.));
    }

    // Checks `_mm_mask_mul_sd` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_mul_sd() {
        let fallback = _mm_set_pd(10., 11.);
        let lhs = _mm_set_pd(1., 2.);
        let rhs = _mm_set_pd(3., 4.);

        // Mask 0: the last-listed element is expected to come from `fallback` (11).
        let masked_off = _mm_mask_mul_sd(fallback, 0, lhs, rhs);
        assert_eq_m128d(masked_off, _mm_set_pd(1., 11.));

        // Mask set: the last-listed element is expected to be 2 * 4.
        let masked_on = _mm_mask_mul_sd(fallback, 0b11111111, lhs, rhs);
        assert_eq_m128d(masked_on, _mm_set_pd(1., 8.));
    }

    // Checks `_mm_maskz_mul_sd` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_mul_sd() {
        let lhs = _mm_set_pd(1., 2.);
        let rhs = _mm_set_pd(3., 4.);

        // Mask 0: the last-listed element is expected to be zero.
        let masked_off = _mm_maskz_mul_sd(0, lhs, rhs);
        assert_eq_m128d(masked_off, _mm_set_pd(1., 0.));

        // Mask set: the last-listed element is expected to be 2 * 4.
        let masked_on = _mm_maskz_mul_sd(0b11111111, lhs, rhs);
        assert_eq_m128d(masked_on, _mm_set_pd(1., 8.));
    }

    // Checks `_mm_mask_div_ss` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_div_ss() {
        let fallback = _mm_set_ps(10., 11., 100., 110.);
        let lhs = _mm_set_ps(1., 2., 10., 20.);
        let rhs = _mm_set_ps(3., 4., 30., 40.);

        // Mask 0: the last-listed element is expected to come from `fallback` (110).
        let masked_off = _mm_mask_div_ss(fallback, 0, lhs, rhs);
        assert_eq_m128(masked_off, _mm_set_ps(1., 2., 10., 110.));

        // Mask set: the last-listed element is expected to be 20 / 40.
        let masked_on = _mm_mask_div_ss(fallback, 0b11111111, lhs, rhs);
        assert_eq_m128(masked_on, _mm_set_ps(1., 2., 10., 0.5));
    }

    // Checks `_mm_maskz_div_ss` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_div_ss() {
        let lhs = _mm_set_ps(1., 2., 10., 20.);
        let rhs = _mm_set_ps(3., 4., 30., 40.);

        // Mask 0: the last-listed element is expected to be zero.
        let masked_off = _mm_maskz_div_ss(0, lhs, rhs);
        assert_eq_m128(masked_off, _mm_set_ps(1., 2., 10., 0.));

        // Mask set: the last-listed element is expected to be 20 / 40.
        let masked_on = _mm_maskz_div_ss(0b11111111, lhs, rhs);
        assert_eq_m128(masked_on, _mm_set_ps(1., 2., 10., 0.5));
    }

    // Checks `_mm_mask_div_sd` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_div_sd() {
        let fallback = _mm_set_pd(10., 11.);
        let lhs = _mm_set_pd(1., 2.);
        let rhs = _mm_set_pd(3., 4.);

        // Mask 0: the last-listed element is expected to come from `fallback` (11).
        let masked_off = _mm_mask_div_sd(fallback, 0, lhs, rhs);
        assert_eq_m128d(masked_off, _mm_set_pd(1., 11.));

        // Mask set: the last-listed element is expected to be 2 / 4.
        let masked_on = _mm_mask_div_sd(fallback, 0b11111111, lhs, rhs);
        assert_eq_m128d(masked_on, _mm_set_pd(1., 0.5));
    }

    // Checks `_mm_maskz_div_sd` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_div_sd() {
        let lhs = _mm_set_pd(1., 2.);
        let rhs = _mm_set_pd(3., 4.);

        // Mask 0: the last-listed element is expected to be zero.
        let masked_off = _mm_maskz_div_sd(0, lhs, rhs);
        assert_eq_m128d(masked_off, _mm_set_pd(1., 0.));

        // Mask set: the last-listed element is expected to be 2 / 4.
        let masked_on = _mm_maskz_div_sd(0b11111111, lhs, rhs);
        assert_eq_m128d(masked_on, _mm_set_pd(1., 0.5));
    }

    // Checks `_mm_mask_max_ss` with the mask cleared and with it set; `lhs` doubles as `src`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_max_ss() {
        let lhs = _mm_set_ps(0., 1., 2., 3.);
        let rhs = _mm_set_ps(4., 5., 6., 7.);

        // Mask 0: the result is expected to equal the source operand.
        let masked_off = _mm_mask_max_ss(lhs, 0, lhs, rhs);
        assert_eq_m128(masked_off, _mm_set_ps(0., 1., 2., 3.));

        // Mask set: the last-listed element is expected to be max(3, 7).
        let masked_on = _mm_mask_max_ss(lhs, 0b11111111, lhs, rhs);
        assert_eq_m128(masked_on, _mm_set_ps(0., 1., 2., 7.));
    }

    // Checks `_mm_maskz_max_ss` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_max_ss() {
        let lhs = _mm_set_ps(0., 1., 2., 3.);
        let rhs = _mm_set_ps(4., 5., 6., 7.);

        // Mask 0: the last-listed element is expected to be zero.
        let masked_off = _mm_maskz_max_ss(0, lhs, rhs);
        assert_eq_m128(masked_off, _mm_set_ps(0., 1., 2., 0.));

        // Mask set: the last-listed element is expected to be max(3, 7).
        let masked_on = _mm_maskz_max_ss(0b11111111, lhs, rhs);
        assert_eq_m128(masked_on, _mm_set_ps(0., 1., 2., 7.));
    }

    // Checks `_mm_mask_max_sd` with the mask cleared and with it set; `lhs` doubles as `src`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_max_sd() {
        let lhs = _mm_set_pd(0., 1.);
        let rhs = _mm_set_pd(2., 3.);

        // Mask 0: the result is expected to equal the source operand.
        let masked_off = _mm_mask_max_sd(lhs, 0, lhs, rhs);
        assert_eq_m128d(masked_off, _mm_set_pd(0., 1.));

        // Mask set: the last-listed element is expected to be max(1, 3).
        let masked_on = _mm_mask_max_sd(lhs, 0b11111111, lhs, rhs);
        assert_eq_m128d(masked_on, _mm_set_pd(0., 3.));
    }

    // Checks `_mm_maskz_max_sd` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_max_sd() {
        let lhs = _mm_set_pd(0., 1.);
        let rhs = _mm_set_pd(2., 3.);

        // Mask 0: the last-listed element is expected to be zero.
        let masked_off = _mm_maskz_max_sd(0, lhs, rhs);
        assert_eq_m128d(masked_off, _mm_set_pd(0., 0.));

        // Mask set: the last-listed element is expected to be max(1, 3).
        let masked_on = _mm_maskz_max_sd(0b11111111, lhs, rhs);
        assert_eq_m128d(masked_on, _mm_set_pd(0., 3.));
    }

    // Checks `_mm_mask_min_ss` with the mask cleared and with it set; `lhs` doubles as `src`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_min_ss() {
        let lhs = _mm_set_ps(0., 1., 2., 3.);
        let rhs = _mm_set_ps(4., 5., 6., 7.);

        // Mask 0: the result is expected to equal the source operand.
        let masked_off = _mm_mask_min_ss(lhs, 0, lhs, rhs);
        assert_eq_m128(masked_off, _mm_set_ps(0., 1., 2., 3.));

        // Mask set: the last-listed element is expected to be min(3, 7).
        let masked_on = _mm_mask_min_ss(lhs, 0b11111111, lhs, rhs);
        assert_eq_m128(masked_on, _mm_set_ps(0., 1., 2., 3.));
    }

    // Checks `_mm_maskz_min_ss` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_min_ss() {
        let lhs = _mm_set_ps(0., 1., 2., 3.);
        let rhs = _mm_set_ps(4., 5., 6., 7.);

        // Mask 0: the last-listed element is expected to be zero.
        let masked_off = _mm_maskz_min_ss(0, lhs, rhs);
        assert_eq_m128(masked_off, _mm_set_ps(0., 1., 2., 0.));

        // Mask set: the last-listed element is expected to be min(3, 7).
        let masked_on = _mm_maskz_min_ss(0b11111111, lhs, rhs);
        assert_eq_m128(masked_on, _mm_set_ps(0., 1., 2., 3.));
    }

    // Checks `_mm_mask_min_sd` with the mask cleared and with it set; `lhs` doubles as `src`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_min_sd() {
        let lhs = _mm_set_pd(0., 1.);
        let rhs = _mm_set_pd(2., 3.);

        // Mask 0: the result is expected to equal the source operand.
        let masked_off = _mm_mask_min_sd(lhs, 0, lhs, rhs);
        assert_eq_m128d(masked_off, _mm_set_pd(0., 1.));

        // Mask set: the last-listed element is expected to be min(1, 3).
        let masked_on = _mm_mask_min_sd(lhs, 0b11111111, lhs, rhs);
        assert_eq_m128d(masked_on, _mm_set_pd(0., 1.));
    }

    // Checks `_mm_maskz_min_sd` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_min_sd() {
        let lhs = _mm_set_pd(0., 1.);
        let rhs = _mm_set_pd(2., 3.);

        // Mask 0: the last-listed element is expected to be zero.
        let masked_off = _mm_maskz_min_sd(0, lhs, rhs);
        assert_eq_m128d(masked_off, _mm_set_pd(0., 0.));

        // Mask set: the last-listed element is expected to be min(1, 3).
        let masked_on = _mm_maskz_min_sd(0b11111111, lhs, rhs);
        assert_eq_m128d(masked_on, _mm_set_pd(0., 1.));
    }

    // Checks `_mm_mask_sqrt_ss` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_sqrt_ss() {
        let fallback = _mm_set_ps(10., 11., 100., 110.);
        let lhs = _mm_set_ps(1., 2., 10., 20.);
        let rhs = _mm_set_ps(3., 4., 30., 4.);

        // Mask 0: the last-listed element is expected to come from `fallback` (110).
        let masked_off = _mm_mask_sqrt_ss(fallback, 0, lhs, rhs);
        assert_eq_m128(masked_off, _mm_set_ps(1., 2., 10., 110.));

        // Mask set: the last-listed element is expected to be sqrt(4).
        let masked_on = _mm_mask_sqrt_ss(fallback, 0b11111111, lhs, rhs);
        assert_eq_m128(masked_on, _mm_set_ps(1., 2., 10., 2.));
    }

    // Checks `_mm_maskz_sqrt_ss` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_sqrt_ss() {
        let lhs = _mm_set_ps(1., 2., 10., 20.);
        let rhs = _mm_set_ps(3., 4., 30., 4.);

        // Mask 0: the last-listed element is expected to be zero.
        let masked_off = _mm_maskz_sqrt_ss(0, lhs, rhs);
        assert_eq_m128(masked_off, _mm_set_ps(1., 2., 10., 0.));

        // Mask set: the last-listed element is expected to be sqrt(4).
        let masked_on = _mm_maskz_sqrt_ss(0b11111111, lhs, rhs);
        assert_eq_m128(masked_on, _mm_set_ps(1., 2., 10., 2.));
    }

    // Checks `_mm_mask_sqrt_sd` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_sqrt_sd() {
        let fallback = _mm_set_pd(10., 11.);
        let lhs = _mm_set_pd(1., 2.);
        let rhs = _mm_set_pd(3., 4.);

        // Mask 0: the last-listed element is expected to come from `fallback` (11).
        let masked_off = _mm_mask_sqrt_sd(fallback, 0, lhs, rhs);
        assert_eq_m128d(masked_off, _mm_set_pd(1., 11.));

        // Mask set: the last-listed element is expected to be sqrt(4).
        let masked_on = _mm_mask_sqrt_sd(fallback, 0b11111111, lhs, rhs);
        assert_eq_m128d(masked_on, _mm_set_pd(1., 2.));
    }

    // Checks `_mm_maskz_sqrt_sd` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_sqrt_sd() {
        let lhs = _mm_set_pd(1., 2.);
        let rhs = _mm_set_pd(3., 4.);

        // Mask 0: the last-listed element is expected to be zero.
        let masked_off = _mm_maskz_sqrt_sd(0, lhs, rhs);
        assert_eq_m128d(masked_off, _mm_set_pd(1., 0.));

        // Mask set: the last-listed element is expected to be sqrt(4).
        let masked_on = _mm_maskz_sqrt_sd(0b11111111, lhs, rhs);
        assert_eq_m128d(masked_on, _mm_set_pd(1., 2.));
    }

    // `_mm_rsqrt14_ss`: expects 0.5 for the input 4 in the last-listed element, rest from `lhs`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_rsqrt14_ss() {
        let lhs = _mm_set_ps(1., 2., 10., 20.);
        let rhs = _mm_set_ps(3., 4., 30., 4.);
        let got = _mm_rsqrt14_ss(lhs, rhs);
        assert_eq_m128(got, _mm_set_ps(1., 2., 10., 0.5));
    }

    // Checks `_mm_mask_rsqrt14_ss` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_rsqrt14_ss() {
        let fallback = _mm_set_ps(10., 11., 100., 110.);
        let lhs = _mm_set_ps(1., 2., 10., 20.);
        let rhs = _mm_set_ps(3., 4., 30., 4.);

        // Mask 0: the last-listed element is expected to come from `fallback` (110).
        let masked_off = _mm_mask_rsqrt14_ss(fallback, 0, lhs, rhs);
        assert_eq_m128(masked_off, _mm_set_ps(1., 2., 10., 110.));

        // Mask set: the last-listed element is expected to be 0.5 for the input 4.
        let masked_on = _mm_mask_rsqrt14_ss(fallback, 0b11111111, lhs, rhs);
        assert_eq_m128(masked_on, _mm_set_ps(1., 2., 10., 0.5));
    }

    // Checks `_mm_maskz_rsqrt14_ss` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_rsqrt14_ss() {
        let lhs = _mm_set_ps(1., 2., 10., 20.);
        let rhs = _mm_set_ps(3., 4., 30., 4.);

        // Mask 0: the last-listed element is expected to be zero.
        let masked_off = _mm_maskz_rsqrt14_ss(0, lhs, rhs);
        assert_eq_m128(masked_off, _mm_set_ps(1., 2., 10., 0.));

        // Mask set: the last-listed element is expected to be 0.5 for the input 4.
        let masked_on = _mm_maskz_rsqrt14_ss(0b11111111, lhs, rhs);
        assert_eq_m128(masked_on, _mm_set_ps(1., 2., 10., 0.5));
    }

    // `_mm_rsqrt14_sd`: expects 0.5 for the input 4 in the last-listed element, rest from `lhs`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_rsqrt14_sd() {
        let lhs = _mm_set_pd(1., 2.);
        let rhs = _mm_set_pd(3., 4.);
        let got = _mm_rsqrt14_sd(lhs, rhs);
        assert_eq_m128d(got, _mm_set_pd(1., 0.5));
    }

    // Checks `_mm_mask_rsqrt14_sd` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_rsqrt14_sd() {
        let fallback = _mm_set_pd(10., 11.);
        let lhs = _mm_set_pd(1., 2.);
        let rhs = _mm_set_pd(3., 4.);

        // Mask 0: the last-listed element is expected to come from `fallback` (11).
        let masked_off = _mm_mask_rsqrt14_sd(fallback, 0, lhs, rhs);
        assert_eq_m128d(masked_off, _mm_set_pd(1., 11.));

        // Mask set: the last-listed element is expected to be 0.5 for the input 4.
        let masked_on = _mm_mask_rsqrt14_sd(fallback, 0b11111111, lhs, rhs);
        assert_eq_m128d(masked_on, _mm_set_pd(1., 0.5));
    }

    // Checks `_mm_maskz_rsqrt14_sd` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_rsqrt14_sd() {
        let lhs = _mm_set_pd(1., 2.);
        let rhs = _mm_set_pd(3., 4.);

        // Mask 0: the last-listed element is expected to be zero.
        let masked_off = _mm_maskz_rsqrt14_sd(0, lhs, rhs);
        assert_eq_m128d(masked_off, _mm_set_pd(1., 0.));

        // Mask set: the last-listed element is expected to be 0.5 for the input 4.
        let masked_on = _mm_maskz_rsqrt14_sd(0b11111111, lhs, rhs);
        assert_eq_m128d(masked_on, _mm_set_pd(1., 0.5));
    }

    // `_mm_rcp14_ss`: expects 0.25 for the input 4 in the last-listed element, rest from `lhs`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_rcp14_ss() {
        let lhs = _mm_set_ps(1., 2., 10., 20.);
        let rhs = _mm_set_ps(3., 4., 30., 4.);
        let got = _mm_rcp14_ss(lhs, rhs);
        assert_eq_m128(got, _mm_set_ps(1., 2., 10., 0.25));
    }

    // Checks `_mm_mask_rcp14_ss` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_rcp14_ss() {
        let fallback = _mm_set_ps(10., 11., 100., 110.);
        let lhs = _mm_set_ps(1., 2., 10., 20.);
        let rhs = _mm_set_ps(3., 4., 30., 4.);

        // Mask 0: the last-listed element is expected to come from `fallback` (110).
        let masked_off = _mm_mask_rcp14_ss(fallback, 0, lhs, rhs);
        assert_eq_m128(masked_off, _mm_set_ps(1., 2., 10., 110.));

        // Mask set: the last-listed element is expected to be 0.25 for the input 4.
        let masked_on = _mm_mask_rcp14_ss(fallback, 0b11111111, lhs, rhs);
        assert_eq_m128(masked_on, _mm_set_ps(1., 2., 10., 0.25));
    }

    // Checks `_mm_maskz_rcp14_ss` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_rcp14_ss() {
        let lhs = _mm_set_ps(1., 2., 10., 20.);
        let rhs = _mm_set_ps(3., 4., 30., 4.);

        // Mask 0: the last-listed element is expected to be zero.
        let masked_off = _mm_maskz_rcp14_ss(0, lhs, rhs);
        assert_eq_m128(masked_off, _mm_set_ps(1., 2., 10., 0.));

        // Mask set: the last-listed element is expected to be 0.25 for the input 4.
        let masked_on = _mm_maskz_rcp14_ss(0b11111111, lhs, rhs);
        assert_eq_m128(masked_on, _mm_set_ps(1., 2., 10., 0.25));
    }

    // `_mm_rcp14_sd`: expects 0.25 for the input 4 in the last-listed element, rest from `lhs`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_rcp14_sd() {
        let lhs = _mm_set_pd(1., 2.);
        let rhs = _mm_set_pd(3., 4.);
        let got = _mm_rcp14_sd(lhs, rhs);
        assert_eq_m128d(got, _mm_set_pd(1., 0.25));
    }

    // Checks `_mm_mask_rcp14_sd` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_rcp14_sd() {
        let fallback = _mm_set_pd(10., 11.);
        let lhs = _mm_set_pd(1., 2.);
        let rhs = _mm_set_pd(3., 4.);

        // Mask 0: the last-listed element is expected to come from `fallback` (11).
        let masked_off = _mm_mask_rcp14_sd(fallback, 0, lhs, rhs);
        assert_eq_m128d(masked_off, _mm_set_pd(1., 11.));

        // Mask set: the last-listed element is expected to be 0.25 for the input 4.
        let masked_on = _mm_mask_rcp14_sd(fallback, 0b11111111, lhs, rhs);
        assert_eq_m128d(masked_on, _mm_set_pd(1., 0.25));
    }

    // Checks `_mm_maskz_rcp14_sd` with the mask cleared and with it set.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_rcp14_sd() {
        let lhs = _mm_set_pd(1., 2.);
        let rhs = _mm_set_pd(3., 4.);

        // Mask 0: the last-listed element is expected to be zero.
        let masked_off = _mm_maskz_rcp14_sd(0, lhs, rhs);
        assert_eq_m128d(masked_off, _mm_set_pd(1., 0.));

        // Mask set: the last-listed element is expected to be 0.25 for the input 4.
        let masked_on = _mm_maskz_rcp14_sd(0b11111111, lhs, rhs);
        assert_eq_m128d(masked_on, _mm_set_pd(1., 0.25));
    }

    // `_mm_getexp_ss`: expects 1 for the input 3 in the last-listed element, rest from `lhs`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_getexp_ss() {
        let lhs = _mm_set1_ps(2.);
        let rhs = _mm_set1_ps(3.);
        let got = _mm_getexp_ss(lhs, rhs);
        assert_eq_m128(got, _mm_set_ps(2., 2., 2., 1.));
    }

    // Checks `_mm_mask_getexp_ss` with the mask cleared and with it set; `lhs` doubles as `src`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_getexp_ss() {
        let lhs = _mm_set1_ps(2.);
        let rhs = _mm_set1_ps(3.);

        // Mask 0: the result is expected to equal the source operand.
        let masked_off = _mm_mask_getexp_ss(lhs, 0, lhs, rhs);
        assert_eq_m128(masked_off, _mm_set_ps(2., 2., 2., 2.));

        // Mask set: the last-listed element is expected to be 1 for the input 3.
        let masked_on = _mm_mask_getexp_ss(lhs, 0b11111111, lhs, rhs);
        assert_eq_m128(masked_on, _mm_set_ps(2., 2., 2., 1.));
    }

    // Zero-masked getexp on the low f32 lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_getexp_ss() {
        let (a, b) = (_mm_set1_ps(2.), _mm_set1_ps(3.));

        // Mask 0: the low lane is expected to be zero.
        let off = _mm_maskz_getexp_ss(0, a, b);
        assert_eq_m128(off, _mm_set_ps(2., 2., 2., 0.));

        // Mask 0b11111111: the low lane is expected to be 1.
        let on = _mm_maskz_getexp_ss(0b11111111, a, b);
        assert_eq_m128(on, _mm_set_ps(2., 2., 2., 1.));
    }

    // Unmasked getexp on the low f64 lane: b's low lane is 3., expected result 1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_getexp_sd() {
        let (a, b) = (_mm_set1_pd(2.), _mm_set1_pd(3.));
        let expected = _mm_set_pd(2., 1.);
        assert_eq_m128d(_mm_getexp_sd(a, b), expected);
    }

    // Write-masked getexp on the low f64 lane; `a` doubles as the `src` operand.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_getexp_sd() {
        let (a, b) = (_mm_set1_pd(2.), _mm_set1_pd(3.));

        // Mask 0: the result is expected to be all 2.
        let off = _mm_mask_getexp_sd(a, 0, a, b);
        assert_eq_m128d(off, _mm_set_pd(2., 2.));

        // Mask 0b11111111: the low lane is expected to be 1.
        let on = _mm_mask_getexp_sd(a, 0b11111111, a, b);
        assert_eq_m128d(on, _mm_set_pd(2., 1.));
    }

    // Zero-masked getexp on the low f64 lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_getexp_sd() {
        let (a, b) = (_mm_set1_pd(2.), _mm_set1_pd(3.));

        // Mask 0: the low lane is expected to be zero.
        let off = _mm_maskz_getexp_sd(0, a, b);
        assert_eq_m128d(off, _mm_set_pd(2., 0.));

        // Mask 0b11111111: the low lane is expected to be 1.
        let on = _mm_maskz_getexp_sd(0b11111111, a, b);
        assert_eq_m128d(on, _mm_set_pd(2., 1.));
    }

    // Unmasked getmant on the low f32 lane: b's low lane is 10., expected result 1.25.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_getmant_ss() {
        let (a, b) = (_mm_set1_ps(20.), _mm_set1_ps(10.));
        let mantissa = _mm_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, b);
        assert_eq_m128(mantissa, _mm_set_ps(20., 20., 20., 1.25));
    }

    // Write-masked getmant on the low f32 lane; `a` doubles as the `src` operand.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_getmant_ss() {
        let (a, b) = (_mm_set1_ps(20.), _mm_set1_ps(10.));

        // Mask 0: the result is expected to be all 20.
        let off = _mm_mask_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a, b);
        assert_eq_m128(off, _mm_set_ps(20., 20., 20., 20.));

        // Mask 0b11111111: the low lane is expected to be 1.25.
        let on = _mm_mask_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0b11111111, a, b);
        assert_eq_m128(on, _mm_set_ps(20., 20., 20., 1.25));
    }

    // Zero-masked getmant on the low f32 lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_getmant_ss() {
        let (a, b) = (_mm_set1_ps(20.), _mm_set1_ps(10.));

        // Mask 0: the low lane is expected to be zero.
        let off = _mm_maskz_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, a, b);
        assert_eq_m128(off, _mm_set_ps(20., 20., 20., 0.));

        // Mask 0b11111111: the low lane is expected to be 1.25.
        let on = _mm_maskz_getmant_ss::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b11111111, a, b);
        assert_eq_m128(on, _mm_set_ps(20., 20., 20., 1.25));
    }

    // Unmasked getmant on the low f64 lane: b's low lane is 10., expected result 1.25.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_getmant_sd() {
        let (a, b) = (_mm_set1_pd(20.), _mm_set1_pd(10.));
        let mantissa = _mm_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, b);
        assert_eq_m128d(mantissa, _mm_set_pd(20., 1.25));
    }

    // Write-masked getmant on the low f64 lane; `a` doubles as the `src` operand.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_getmant_sd() {
        let (a, b) = (_mm_set1_pd(20.), _mm_set1_pd(10.));

        // Mask 0: the result is expected to be all 20.
        let off = _mm_mask_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0, a, b);
        assert_eq_m128d(off, _mm_set_pd(20., 20.));

        // Mask 0b11111111: the low lane is expected to be 1.25.
        let on = _mm_mask_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(a, 0b11111111, a, b);
        assert_eq_m128d(on, _mm_set_pd(20., 1.25));
    }

    // Zero-masked getmant on the low f64 lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_getmant_sd() {
        let (a, b) = (_mm_set1_pd(20.), _mm_set1_pd(10.));

        // Mask 0: the low lane is expected to be zero.
        let off = _mm_maskz_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0, a, b);
        assert_eq_m128d(off, _mm_set_pd(20., 0.));

        // Mask 0b11111111: the low lane is expected to be 1.25.
        let on = _mm_maskz_getmant_sd::<_MM_MANT_NORM_1_2, _MM_MANT_SIGN_SRC>(0b11111111, a, b);
        assert_eq_m128d(on, _mm_set_pd(20., 1.25));
    }

    // Unmasked roundscale::<0> on the low f32 lane: b's low lane is 1.1, expected result 1.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_roundscale_ss() {
        let (a, b) = (_mm_set1_ps(2.2), _mm_set1_ps(1.1));
        let expected = _mm_set_ps(2.2, 2.2, 2.2, 1.0);
        assert_eq_m128(_mm_roundscale_ss::<0>(a, b), expected);
    }

    // Write-masked roundscale::<0> on the low f32 lane; `a` doubles as the `src` operand.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_roundscale_ss() {
        let (a, b) = (_mm_set1_ps(2.2), _mm_set1_ps(1.1));

        // Mask 0: the result is expected to be all 2.2.
        let off = _mm_mask_roundscale_ss::<0>(a, 0, a, b);
        assert_eq_m128(off, _mm_set_ps(2.2, 2.2, 2.2, 2.2));

        // Mask 0b11111111: the low lane is expected to be 1.0.
        let on = _mm_mask_roundscale_ss::<0>(a, 0b11111111, a, b);
        assert_eq_m128(on, _mm_set_ps(2.2, 2.2, 2.2, 1.0));
    }

    // Zero-masked roundscale::<0> on the low f32 lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_roundscale_ss() {
        let (a, b) = (_mm_set1_ps(2.2), _mm_set1_ps(1.1));

        // Mask 0: the low lane is expected to be zero.
        let off = _mm_maskz_roundscale_ss::<0>(0, a, b);
        assert_eq_m128(off, _mm_set_ps(2.2, 2.2, 2.2, 0.0));

        // Mask 0b11111111: the low lane is expected to be 1.0.
        let on = _mm_maskz_roundscale_ss::<0>(0b11111111, a, b);
        assert_eq_m128(on, _mm_set_ps(2.2, 2.2, 2.2, 1.0));
    }

    // Unmasked roundscale::<0> on the low f64 lane: b's low lane is 1.1, expected result 1.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_roundscale_sd() {
        let (a, b) = (_mm_set1_pd(2.2), _mm_set1_pd(1.1));
        let expected = _mm_set_pd(2.2, 1.0);
        assert_eq_m128d(_mm_roundscale_sd::<0>(a, b), expected);
    }

    // Write-masked roundscale::<0> on the low f64 lane; `a` doubles as the `src` operand.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_roundscale_sd() {
        let (a, b) = (_mm_set1_pd(2.2), _mm_set1_pd(1.1));

        // Mask 0: the result is expected to be all 2.2.
        let off = _mm_mask_roundscale_sd::<0>(a, 0, a, b);
        assert_eq_m128d(off, _mm_set_pd(2.2, 2.2));

        // Mask 0b11111111: the low lane is expected to be 1.0.
        let on = _mm_mask_roundscale_sd::<0>(a, 0b11111111, a, b);
        assert_eq_m128d(on, _mm_set_pd(2.2, 1.0));
    }

    // Zero-masked roundscale::<0> on the low f64 lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_roundscale_sd() {
        let (a, b) = (_mm_set1_pd(2.2), _mm_set1_pd(1.1));

        // Mask 0: the low lane is expected to be zero.
        let off = _mm_maskz_roundscale_sd::<0>(0, a, b);
        assert_eq_m128d(off, _mm_set_pd(2.2, 0.0));

        // Mask 0b11111111: the low lane is expected to be 1.0.
        let on = _mm_maskz_roundscale_sd::<0>(0b11111111, a, b);
        assert_eq_m128d(on, _mm_set_pd(2.2, 1.0));
    }

    // Unmasked scalef on the low f32 lane: a = 1., b = 3., expected result 8.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_scalef_ss() {
        let (a, b) = (_mm_set1_ps(1.), _mm_set1_ps(3.));
        let expected = _mm_set_ps(1., 1., 1., 8.);
        assert_eq_m128(_mm_scalef_ss(a, b), expected);
    }

    // Write-masked scalef on the low f32 lane; `a` doubles as the `src` operand.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_scalef_ss() {
        let (a, b) = (_mm_set1_ps(1.), _mm_set1_ps(3.));

        // Mask 0: the result is expected to be all 1.
        let off = _mm_mask_scalef_ss(a, 0, a, b);
        assert_eq_m128(off, _mm_set_ps(1., 1., 1., 1.));

        // Mask 0b11111111: the low lane is expected to be 8.
        let on = _mm_mask_scalef_ss(a, 0b11111111, a, b);
        assert_eq_m128(on, _mm_set_ps(1., 1., 1., 8.));
    }

    // Zero-masked scalef on the low f32 lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_scalef_ss() {
        let (a, b) = (_mm_set1_ps(1.), _mm_set1_ps(3.));

        // Mask 0: the low lane is expected to be zero.
        let off = _mm_maskz_scalef_ss(0, a, b);
        assert_eq_m128(off, _mm_set_ps(1., 1., 1., 0.));

        // Mask 0b11111111: the low lane is expected to be 8.
        let on = _mm_maskz_scalef_ss(0b11111111, a, b);
        assert_eq_m128(on, _mm_set_ps(1., 1., 1., 8.));
    }

    // Unmasked scalef on the low f64 lane: a = 1., b = 3., expected result 8.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_scalef_sd() {
        let (a, b) = (_mm_set1_pd(1.), _mm_set1_pd(3.));
        let expected = _mm_set_pd(1., 8.);
        assert_eq_m128d(_mm_scalef_sd(a, b), expected);
    }

    // Write-masked scalef on the low f64 lane; `a` doubles as the `src` operand.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_scalef_sd() {
        let (a, b) = (_mm_set1_pd(1.), _mm_set1_pd(3.));

        // Mask 0: the result is expected to be all 1.
        let off = _mm_mask_scalef_sd(a, 0, a, b);
        assert_eq_m128d(off, _mm_set_pd(1., 1.));

        // Mask 0b11111111: the low lane is expected to be 8.
        let on = _mm_mask_scalef_sd(a, 0b11111111, a, b);
        assert_eq_m128d(on, _mm_set_pd(1., 8.));
    }

    // Zero-masked scalef on the low f64 lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_scalef_sd() {
        let (a, b) = (_mm_set1_pd(1.), _mm_set1_pd(3.));

        // Mask 0: the low lane is expected to be zero.
        let off = _mm_maskz_scalef_sd(0, a, b);
        assert_eq_m128d(off, _mm_set_pd(1., 0.));

        // Mask 0b11111111: the low lane is expected to be 8.
        let on = _mm_maskz_scalef_sd(0b11111111, a, b);
        assert_eq_m128d(on, _mm_set_pd(1., 8.));
    }

    // Write-masked fmadd on the low f32 lane with a = 1., b = 2., c = 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_fmadd_ss() {
        let (a, b, c) = (_mm_set1_ps(1.), _mm_set1_ps(2.), _mm_set1_ps(3.));

        // Mask 0: the result is expected to equal `a`.
        let off = _mm_mask_fmadd_ss(a, 0, b, c);
        assert_eq_m128(off, a);

        // Mask 0b11111111: the low lane is expected to be 5.
        let on = _mm_mask_fmadd_ss(a, 0b11111111, b, c);
        assert_eq_m128(on, _mm_set_ps(1., 1., 1., 5.));
    }

    // Zero-masked fmadd on the low f32 lane with a = 1., b = 2., c = 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_fmadd_ss() {
        let (a, b, c) = (_mm_set1_ps(1.), _mm_set1_ps(2.), _mm_set1_ps(3.));

        // Mask 0: the low lane is expected to be zero.
        let off = _mm_maskz_fmadd_ss(0, a, b, c);
        assert_eq_m128(off, _mm_set_ps(1., 1., 1., 0.));

        // Mask 0b11111111: the low lane is expected to be 5.
        let on = _mm_maskz_fmadd_ss(0b11111111, a, b, c);
        assert_eq_m128(on, _mm_set_ps(1., 1., 1., 5.));
    }

    // mask3 fmadd on the low f32 lane with a = 1., b = 2., c = 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask3_fmadd_ss() {
        let (a, b, c) = (_mm_set1_ps(1.), _mm_set1_ps(2.), _mm_set1_ps(3.));

        // Mask 0: the result is expected to equal `c`.
        let off = _mm_mask3_fmadd_ss(a, b, c, 0);
        assert_eq_m128(off, c);

        // Mask 0b11111111: the low lane is expected to be 5., the upper lanes 3.
        let on = _mm_mask3_fmadd_ss(a, b, c, 0b11111111);
        assert_eq_m128(on, _mm_set_ps(3., 3., 3., 5.));
    }

    // Write-masked fmadd on the low f64 lane with a = 1., b = 2., c = 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_fmadd_sd() {
        let (a, b, c) = (_mm_set1_pd(1.), _mm_set1_pd(2.), _mm_set1_pd(3.));

        // Mask 0: the result is expected to equal `a`.
        let off = _mm_mask_fmadd_sd(a, 0, b, c);
        assert_eq_m128d(off, a);

        // Mask 0b11111111: the low lane is expected to be 5.
        let on = _mm_mask_fmadd_sd(a, 0b11111111, b, c);
        assert_eq_m128d(on, _mm_set_pd(1., 5.));
    }

    // Zero-masked fmadd on the low f64 lane with a = 1., b = 2., c = 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_fmadd_sd() {
        let (a, b, c) = (_mm_set1_pd(1.), _mm_set1_pd(2.), _mm_set1_pd(3.));

        // Mask 0: the low lane is expected to be zero.
        let off = _mm_maskz_fmadd_sd(0, a, b, c);
        assert_eq_m128d(off, _mm_set_pd(1., 0.));

        // Mask 0b11111111: the low lane is expected to be 5.
        let on = _mm_maskz_fmadd_sd(0b11111111, a, b, c);
        assert_eq_m128d(on, _mm_set_pd(1., 5.));
    }

    // mask3 fmadd on the low f64 lane with a = 1., b = 2., c = 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask3_fmadd_sd() {
        let (a, b, c) = (_mm_set1_pd(1.), _mm_set1_pd(2.), _mm_set1_pd(3.));

        // Mask 0: the result is expected to equal `c`.
        let off = _mm_mask3_fmadd_sd(a, b, c, 0);
        assert_eq_m128d(off, c);

        // Mask 0b11111111: the low lane is expected to be 5., the high lane 3.
        let on = _mm_mask3_fmadd_sd(a, b, c, 0b11111111);
        assert_eq_m128d(on, _mm_set_pd(3., 5.));
    }

    // Write-masked fmsub on the low f32 lane with a = 1., b = 2., c = 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_fmsub_ss() {
        let (a, b, c) = (_mm_set1_ps(1.), _mm_set1_ps(2.), _mm_set1_ps(3.));

        // Mask 0: the result is expected to equal `a`.
        let off = _mm_mask_fmsub_ss(a, 0, b, c);
        assert_eq_m128(off, a);

        // Mask 0b11111111: the low lane is expected to be -1.
        let on = _mm_mask_fmsub_ss(a, 0b11111111, b, c);
        assert_eq_m128(on, _mm_set_ps(1., 1., 1., -1.));
    }

    // Zero-masked fmsub on the low f32 lane with a = 1., b = 2., c = 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_fmsub_ss() {
        let (a, b, c) = (_mm_set1_ps(1.), _mm_set1_ps(2.), _mm_set1_ps(3.));

        // Mask 0: the low lane is expected to be zero.
        let off = _mm_maskz_fmsub_ss(0, a, b, c);
        assert_eq_m128(off, _mm_set_ps(1., 1., 1., 0.));

        // Mask 0b11111111: the low lane is expected to be -1.
        let on = _mm_maskz_fmsub_ss(0b11111111, a, b, c);
        assert_eq_m128(on, _mm_set_ps(1., 1., 1., -1.));
    }

    // mask3 fmsub on the low f32 lane with a = 1., b = 2., c = 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask3_fmsub_ss() {
        let (a, b, c) = (_mm_set1_ps(1.), _mm_set1_ps(2.), _mm_set1_ps(3.));

        // Mask 0: the result is expected to equal `c`.
        let off = _mm_mask3_fmsub_ss(a, b, c, 0);
        assert_eq_m128(off, c);

        // Mask 0b11111111: the low lane is expected to be -1., the upper lanes 3.
        let on = _mm_mask3_fmsub_ss(a, b, c, 0b11111111);
        assert_eq_m128(on, _mm_set_ps(3., 3., 3., -1.));
    }

    // Write-masked fmsub on the low f64 lane with a = 1., b = 2., c = 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_fmsub_sd() {
        let (a, b, c) = (_mm_set1_pd(1.), _mm_set1_pd(2.), _mm_set1_pd(3.));

        // Mask 0: the result is expected to equal `a`.
        let off = _mm_mask_fmsub_sd(a, 0, b, c);
        assert_eq_m128d(off, a);

        // Mask 0b11111111: the low lane is expected to be -1.
        let on = _mm_mask_fmsub_sd(a, 0b11111111, b, c);
        assert_eq_m128d(on, _mm_set_pd(1., -1.));
    }

    // Zero-masked fmsub on the low f64 lane with a = 1., b = 2., c = 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_fmsub_sd() {
        let (a, b, c) = (_mm_set1_pd(1.), _mm_set1_pd(2.), _mm_set1_pd(3.));

        // Mask 0: the low lane is expected to be zero.
        let off = _mm_maskz_fmsub_sd(0, a, b, c);
        assert_eq_m128d(off, _mm_set_pd(1., 0.));

        // Mask 0b11111111: the low lane is expected to be -1.
        let on = _mm_maskz_fmsub_sd(0b11111111, a, b, c);
        assert_eq_m128d(on, _mm_set_pd(1., -1.));
    }

    // mask3 fmsub on the low f64 lane with a = 1., b = 2., c = 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask3_fmsub_sd() {
        let (a, b, c) = (_mm_set1_pd(1.), _mm_set1_pd(2.), _mm_set1_pd(3.));

        // Mask 0: the result is expected to equal `c`.
        let off = _mm_mask3_fmsub_sd(a, b, c, 0);
        assert_eq_m128d(off, c);

        // Mask 0b11111111: the low lane is expected to be -1., the high lane 3.
        let on = _mm_mask3_fmsub_sd(a, b, c, 0b11111111);
        assert_eq_m128d(on, _mm_set_pd(3., -1.));
    }

    // Write-masked fnmadd on the low f32 lane: -(a[0] * b[0]) + c[0] when the mask bit is
    // set, otherwise a[0] is passed through. The upper lanes come from `a` either way.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_fnmadd_ss() {
        let a = _mm_set1_ps(1.);
        let b = _mm_set1_ps(2.);
        // c = 4 makes the computed value -(1 * 2) + 4 = 2, which differs from a's low
        // lane (1.). With c = 3 the computed value equals the pass-through value, so the
        // mask-set and mask-clear cases would expect the same vector.
        let c = _mm_set1_ps(4.);
        // Mask 0: the result must be `a` untouched.
        let r = _mm_mask_fnmadd_ss(a, 0, b, c);
        assert_eq_m128(r, a);
        // Mask set: the low lane must hold the computed value.
        let r = _mm_mask_fnmadd_ss(a, 0b11111111, b, c);
        let e = _mm_set_ps(1., 1., 1., 2.);
        assert_eq_m128(r, e);
    }

    // Zero-masked fnmadd on the low f32 lane with a = 1., b = 2., c = 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_fnmadd_ss() {
        let (a, b, c) = (_mm_set1_ps(1.), _mm_set1_ps(2.), _mm_set1_ps(3.));

        // Mask 0: the low lane is expected to be zero.
        let off = _mm_maskz_fnmadd_ss(0, a, b, c);
        assert_eq_m128(off, _mm_set_ps(1., 1., 1., 0.));

        // Mask 0b11111111: the low lane is expected to be 1.
        let on = _mm_maskz_fnmadd_ss(0b11111111, a, b, c);
        assert_eq_m128(on, _mm_set_ps(1., 1., 1., 1.));
    }

    // mask3 fnmadd on the low f32 lane with a = 1., b = 2., c = 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask3_fnmadd_ss() {
        let (a, b, c) = (_mm_set1_ps(1.), _mm_set1_ps(2.), _mm_set1_ps(3.));

        // Mask 0: the result is expected to equal `c`.
        let off = _mm_mask3_fnmadd_ss(a, b, c, 0);
        assert_eq_m128(off, c);

        // Mask 0b11111111: the low lane is expected to be 1., the upper lanes 3.
        let on = _mm_mask3_fnmadd_ss(a, b, c, 0b11111111);
        assert_eq_m128(on, _mm_set_ps(3., 3., 3., 1.));
    }

    // Write-masked fnmadd on the low f64 lane: -(a[0] * b[0]) + c[0] when the mask bit is
    // set, otherwise a[0] is passed through. The high lane comes from `a` either way.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_fnmadd_sd() {
        let a = _mm_set1_pd(1.);
        let b = _mm_set1_pd(2.);
        // c = 4 makes the computed value -(1 * 2) + 4 = 2, which differs from a's low
        // lane (1.). With c = 3 the computed value equals the pass-through value, so the
        // mask-set and mask-clear cases would expect the same vector.
        let c = _mm_set1_pd(4.);
        // Mask 0: the result must be `a` untouched.
        let r = _mm_mask_fnmadd_sd(a, 0, b, c);
        assert_eq_m128d(r, a);
        // Mask set: the low lane must hold the computed value.
        let r = _mm_mask_fnmadd_sd(a, 0b11111111, b, c);
        let e = _mm_set_pd(1., 2.);
        assert_eq_m128d(r, e);
    }

    // Zero-masked fnmadd on the low f64 lane with a = 1., b = 2., c = 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_fnmadd_sd() {
        let (a, b, c) = (_mm_set1_pd(1.), _mm_set1_pd(2.), _mm_set1_pd(3.));

        // Mask 0: the low lane is expected to be zero.
        let off = _mm_maskz_fnmadd_sd(0, a, b, c);
        assert_eq_m128d(off, _mm_set_pd(1., 0.));

        // Mask 0b11111111: the low lane is expected to be 1.
        let on = _mm_maskz_fnmadd_sd(0b11111111, a, b, c);
        assert_eq_m128d(on, _mm_set_pd(1., 1.));
    }

    // mask3 fnmadd on the low f64 lane with a = 1., b = 2., c = 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask3_fnmadd_sd() {
        let (a, b, c) = (_mm_set1_pd(1.), _mm_set1_pd(2.), _mm_set1_pd(3.));

        // Mask 0: the result is expected to equal `c`.
        let off = _mm_mask3_fnmadd_sd(a, b, c, 0);
        assert_eq_m128d(off, c);

        // Mask 0b11111111: the low lane is expected to be 1., the high lane 3.
        let on = _mm_mask3_fnmadd_sd(a, b, c, 0b11111111);
        assert_eq_m128d(on, _mm_set_pd(3., 1.));
    }

    // Write-masked fnmsub on the low f32 lane with a = 1., b = 2., c = 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_fnmsub_ss() {
        let (a, b, c) = (_mm_set1_ps(1.), _mm_set1_ps(2.), _mm_set1_ps(3.));

        // Mask 0: the result is expected to equal `a`.
        let off = _mm_mask_fnmsub_ss(a, 0, b, c);
        assert_eq_m128(off, a);

        // Mask 0b11111111: the low lane is expected to be -5.
        let on = _mm_mask_fnmsub_ss(a, 0b11111111, b, c);
        assert_eq_m128(on, _mm_set_ps(1., 1., 1., -5.));
    }

    // Zero-masked fnmsub on the low f32 lane with a = 1., b = 2., c = 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_fnmsub_ss() {
        let (a, b, c) = (_mm_set1_ps(1.), _mm_set1_ps(2.), _mm_set1_ps(3.));

        // Mask 0: the low lane is expected to be zero.
        let off = _mm_maskz_fnmsub_ss(0, a, b, c);
        assert_eq_m128(off, _mm_set_ps(1., 1., 1., 0.));

        // Mask 0b11111111: the low lane is expected to be -5.
        let on = _mm_maskz_fnmsub_ss(0b11111111, a, b, c);
        assert_eq_m128(on, _mm_set_ps(1., 1., 1., -5.));
    }

    // mask3 fnmsub on the low f32 lane with a = 1., b = 2., c = 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask3_fnmsub_ss() {
        let (a, b, c) = (_mm_set1_ps(1.), _mm_set1_ps(2.), _mm_set1_ps(3.));

        // Mask 0: the result is expected to equal `c`.
        let off = _mm_mask3_fnmsub_ss(a, b, c, 0);
        assert_eq_m128(off, c);

        // Mask 0b11111111: the low lane is expected to be -5., the upper lanes 3.
        let on = _mm_mask3_fnmsub_ss(a, b, c, 0b11111111);
        assert_eq_m128(on, _mm_set_ps(3., 3., 3., -5.));
    }

    // Write-masked fnmsub on the low f64 lane with a = 1., b = 2., c = 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_fnmsub_sd() {
        let (a, b, c) = (_mm_set1_pd(1.), _mm_set1_pd(2.), _mm_set1_pd(3.));

        // Mask 0: the result is expected to equal `a`.
        let off = _mm_mask_fnmsub_sd(a, 0, b, c);
        assert_eq_m128d(off, a);

        // Mask 0b11111111: the low lane is expected to be -5.
        let on = _mm_mask_fnmsub_sd(a, 0b11111111, b, c);
        assert_eq_m128d(on, _mm_set_pd(1., -5.));
    }

    // Zero-masked fnmsub on the low f64 lane with a = 1., b = 2., c = 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_fnmsub_sd() {
        let (a, b, c) = (_mm_set1_pd(1.), _mm_set1_pd(2.), _mm_set1_pd(3.));

        // Mask 0: the low lane is expected to be zero.
        let off = _mm_maskz_fnmsub_sd(0, a, b, c);
        assert_eq_m128d(off, _mm_set_pd(1., 0.));

        // Mask 0b11111111: the low lane is expected to be -5.
        let on = _mm_maskz_fnmsub_sd(0b11111111, a, b, c);
        assert_eq_m128d(on, _mm_set_pd(1., -5.));
    }

    // mask3 fnmsub on the low f64 lane with a = 1., b = 2., c = 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask3_fnmsub_sd() {
        let (a, b, c) = (_mm_set1_pd(1.), _mm_set1_pd(2.), _mm_set1_pd(3.));

        // Mask 0: the result is expected to equal `c`.
        let off = _mm_mask3_fnmsub_sd(a, b, c, 0);
        assert_eq_m128d(off, c);

        // Mask 0b11111111: the low lane is expected to be -5., the high lane 3.
        let on = _mm_mask3_fnmsub_sd(a, b, c, 0b11111111);
        assert_eq_m128d(on, _mm_set_pd(3., -5.));
    }

    // Unmasked add_round on the low f32 lane: 20 + 40 is expected to give 60.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_add_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let a = _mm_set_ps(1., 2., 10., 20.);
        let b = _mm_set_ps(3., 4., 30., 40.);
        let sum = _mm_add_round_ss::<ROUNDING>(a, b);
        assert_eq_m128(sum, _mm_set_ps(1., 2., 10., 60.));
    }

    // Write-masked add_round on the low f32 lane, with a separate `src` vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_add_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let src = _mm_set_ps(10., 11., 100., 110.);
        let a = _mm_set_ps(1., 2., 10., 20.);
        let b = _mm_set_ps(3., 4., 30., 40.);

        // Mask 0: the low lane is expected to be src's low lane (110.).
        let off = _mm_mask_add_round_ss::<ROUNDING>(src, 0, a, b);
        assert_eq_m128(off, _mm_set_ps(1., 2., 10., 110.));

        // Mask 0b11111111: the low lane is expected to be 60.
        let on = _mm_mask_add_round_ss::<ROUNDING>(src, 0b11111111, a, b);
        assert_eq_m128(on, _mm_set_ps(1., 2., 10., 60.));
    }

    // Zero-masked add_round on the low f32 lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_add_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let a = _mm_set_ps(1., 2., 10., 20.);
        let b = _mm_set_ps(3., 4., 30., 40.);

        // Mask 0: the low lane is expected to be zero.
        let off = _mm_maskz_add_round_ss::<ROUNDING>(0, a, b);
        assert_eq_m128(off, _mm_set_ps(1., 2., 10., 0.));

        // Mask 0b11111111: the low lane is expected to be 60.
        let on = _mm_maskz_add_round_ss::<ROUNDING>(0b11111111, a, b);
        assert_eq_m128(on, _mm_set_ps(1., 2., 10., 60.));
    }

    // Unmasked add_round on the low f64 lane: 2 + 4 is expected to give 6.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_add_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let a = _mm_set_pd(1., 2.);
        let b = _mm_set_pd(3., 4.);
        let sum = _mm_add_round_sd::<ROUNDING>(a, b);
        assert_eq_m128d(sum, _mm_set_pd(1., 6.));
    }

    // Write-masked add_round on the low f64 lane, with a separate `src` vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_add_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let src = _mm_set_pd(10., 11.);
        let a = _mm_set_pd(1., 2.);
        let b = _mm_set_pd(3., 4.);

        // Mask 0: the low lane is expected to be src's low lane (11.).
        let off = _mm_mask_add_round_sd::<ROUNDING>(src, 0, a, b);
        assert_eq_m128d(off, _mm_set_pd(1., 11.));

        // Mask 0b11111111: the low lane is expected to be 6.
        let on = _mm_mask_add_round_sd::<ROUNDING>(src, 0b11111111, a, b);
        assert_eq_m128d(on, _mm_set_pd(1., 6.));
    }

    // Zero-masked add_round on the low f64 lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_add_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let a = _mm_set_pd(1., 2.);
        let b = _mm_set_pd(3., 4.);

        // Mask 0: the low lane is expected to be zero.
        let off = _mm_maskz_add_round_sd::<ROUNDING>(0, a, b);
        assert_eq_m128d(off, _mm_set_pd(1., 0.));

        // Mask 0b11111111: the low lane is expected to be 6.
        let on = _mm_maskz_add_round_sd::<ROUNDING>(0b11111111, a, b);
        assert_eq_m128d(on, _mm_set_pd(1., 6.));
    }

    // Unmasked sub_round on the low f32 lane: 20 - 40 is expected to give -20.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_sub_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let a = _mm_set_ps(1., 2., 10., 20.);
        let b = _mm_set_ps(3., 4., 30., 40.);
        let difference = _mm_sub_round_ss::<ROUNDING>(a, b);
        assert_eq_m128(difference, _mm_set_ps(1., 2., 10., -20.));
    }

    // Write-masked sub_round on the low f32 lane, with a separate `src` vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_sub_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let src = _mm_set_ps(10., 11., 100., 110.);
        let a = _mm_set_ps(1., 2., 10., 20.);
        let b = _mm_set_ps(3., 4., 30., 40.);

        // Mask 0: the low lane is expected to be src's low lane (110.).
        let off = _mm_mask_sub_round_ss::<ROUNDING>(src, 0, a, b);
        assert_eq_m128(off, _mm_set_ps(1., 2., 10., 110.));

        // Mask 0b11111111: the low lane is expected to be -20.
        let on = _mm_mask_sub_round_ss::<ROUNDING>(src, 0b11111111, a, b);
        assert_eq_m128(on, _mm_set_ps(1., 2., 10., -20.));
    }

    // Zero-masked sub_round on the low f32 lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_sub_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let a = _mm_set_ps(1., 2., 10., 20.);
        let b = _mm_set_ps(3., 4., 30., 40.);

        // Mask 0: the low lane is expected to be zero.
        let off = _mm_maskz_sub_round_ss::<ROUNDING>(0, a, b);
        assert_eq_m128(off, _mm_set_ps(1., 2., 10., 0.));

        // Mask 0b11111111: the low lane is expected to be -20.
        let on = _mm_maskz_sub_round_ss::<ROUNDING>(0b11111111, a, b);
        assert_eq_m128(on, _mm_set_ps(1., 2., 10., -20.));
    }

    // Unmasked sub_round on the low f64 lane: 2 - 4 is expected to give -2.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_sub_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let a = _mm_set_pd(1., 2.);
        let b = _mm_set_pd(3., 4.);
        let difference = _mm_sub_round_sd::<ROUNDING>(a, b);
        assert_eq_m128d(difference, _mm_set_pd(1., -2.));
    }

    // Write-masked sub_round on the low f64 lane, with a separate `src` vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_sub_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let src = _mm_set_pd(10., 11.);
        let a = _mm_set_pd(1., 2.);
        let b = _mm_set_pd(3., 4.);

        // Mask 0: the low lane is expected to be src's low lane (11.).
        let off = _mm_mask_sub_round_sd::<ROUNDING>(src, 0, a, b);
        assert_eq_m128d(off, _mm_set_pd(1., 11.));

        // Mask 0b11111111: the low lane is expected to be -2.
        let on = _mm_mask_sub_round_sd::<ROUNDING>(src, 0b11111111, a, b);
        assert_eq_m128d(on, _mm_set_pd(1., -2.));
    }

    // Zero-masked sub_round on the low f64 lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_sub_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let a = _mm_set_pd(1., 2.);
        let b = _mm_set_pd(3., 4.);

        // Mask 0: the low lane is expected to be zero.
        let off = _mm_maskz_sub_round_sd::<ROUNDING>(0, a, b);
        assert_eq_m128d(off, _mm_set_pd(1., 0.));

        // Mask 0b11111111: the low lane is expected to be -2.
        let on = _mm_maskz_sub_round_sd::<ROUNDING>(0b11111111, a, b);
        assert_eq_m128d(on, _mm_set_pd(1., -2.));
    }

    // Unmasked mul_round on the low f32 lane: 20 * 40 is expected to give 800.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mul_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let a = _mm_set_ps(1., 2., 10., 20.);
        let b = _mm_set_ps(3., 4., 30., 40.);
        let product = _mm_mul_round_ss::<ROUNDING>(a, b);
        assert_eq_m128(product, _mm_set_ps(1., 2., 10., 800.));
    }

    // Write-masked mul_round on the low f32 lane, with a separate `src` vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_mul_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let src = _mm_set_ps(10., 11., 100., 110.);
        let a = _mm_set_ps(1., 2., 10., 20.);
        let b = _mm_set_ps(3., 4., 30., 40.);

        // Mask 0: the low lane is expected to be src's low lane (110.).
        let off = _mm_mask_mul_round_ss::<ROUNDING>(src, 0, a, b);
        assert_eq_m128(off, _mm_set_ps(1., 2., 10., 110.));

        // Mask 0b11111111: the low lane is expected to be 800.
        let on = _mm_mask_mul_round_ss::<ROUNDING>(src, 0b11111111, a, b);
        assert_eq_m128(on, _mm_set_ps(1., 2., 10., 800.));
    }

    // Zero-masked mul_round on the low f32 lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_mul_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let a = _mm_set_ps(1., 2., 10., 20.);
        let b = _mm_set_ps(3., 4., 30., 40.);

        // Mask 0: the low lane is expected to be zero.
        let off = _mm_maskz_mul_round_ss::<ROUNDING>(0, a, b);
        assert_eq_m128(off, _mm_set_ps(1., 2., 10., 0.));

        // Mask 0b11111111: the low lane is expected to be 800.
        let on = _mm_maskz_mul_round_ss::<ROUNDING>(0b11111111, a, b);
        assert_eq_m128(on, _mm_set_ps(1., 2., 10., 800.));
    }

    // Unmasked mul_round on the low f64 lane: 2 * 4 is expected to give 8.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mul_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let a = _mm_set_pd(1., 2.);
        let b = _mm_set_pd(3., 4.);
        let product = _mm_mul_round_sd::<ROUNDING>(a, b);
        assert_eq_m128d(product, _mm_set_pd(1., 8.));
    }

    // Write-masked mul_round on the low f64 lane, with a separate `src` vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_mul_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let src = _mm_set_pd(10., 11.);
        let a = _mm_set_pd(1., 2.);
        let b = _mm_set_pd(3., 4.);

        // Mask 0: the low lane is expected to be src's low lane (11.).
        let off = _mm_mask_mul_round_sd::<ROUNDING>(src, 0, a, b);
        assert_eq_m128d(off, _mm_set_pd(1., 11.));

        // Mask 0b11111111: the low lane is expected to be 8.
        let on = _mm_mask_mul_round_sd::<ROUNDING>(src, 0b11111111, a, b);
        assert_eq_m128d(on, _mm_set_pd(1., 8.));
    }

    // Zero-masked mul_round on the low f64 lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_mul_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let a = _mm_set_pd(1., 2.);
        let b = _mm_set_pd(3., 4.);

        // Mask 0: the low lane is expected to be zero.
        let off = _mm_maskz_mul_round_sd::<ROUNDING>(0, a, b);
        assert_eq_m128d(off, _mm_set_pd(1., 0.));

        // Mask 0b11111111: the low lane is expected to be 8.
        let on = _mm_maskz_mul_round_sd::<ROUNDING>(0b11111111, a, b);
        assert_eq_m128d(on, _mm_set_pd(1., 8.));
    }

    // Unmasked div_round on the low f32 lane: 20 / 40 is expected to give 0.5.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_div_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let a = _mm_set_ps(1., 2., 10., 20.);
        let b = _mm_set_ps(3., 4., 30., 40.);
        let quotient = _mm_div_round_ss::<ROUNDING>(a, b);
        assert_eq_m128(quotient, _mm_set_ps(1., 2., 10., 0.5));
    }

    // Write-masked div_round on the low f32 lane, with a separate `src` vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_div_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let src = _mm_set_ps(10., 11., 100., 110.);
        let a = _mm_set_ps(1., 2., 10., 20.);
        let b = _mm_set_ps(3., 4., 30., 40.);

        // Mask 0: the low lane is expected to be src's low lane (110.).
        let off = _mm_mask_div_round_ss::<ROUNDING>(src, 0, a, b);
        assert_eq_m128(off, _mm_set_ps(1., 2., 10., 110.));

        // Mask 0b11111111: the low lane is expected to be 0.5.
        let on = _mm_mask_div_round_ss::<ROUNDING>(src, 0b11111111, a, b);
        assert_eq_m128(on, _mm_set_ps(1., 2., 10., 0.5));
    }

    // Zero-masked div_round on the low f32 lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_div_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let a = _mm_set_ps(1., 2., 10., 20.);
        let b = _mm_set_ps(3., 4., 30., 40.);

        // Mask 0: the low lane is expected to be zero.
        let off = _mm_maskz_div_round_ss::<ROUNDING>(0, a, b);
        assert_eq_m128(off, _mm_set_ps(1., 2., 10., 0.));

        // Mask 0b11111111: the low lane is expected to be 0.5.
        let on = _mm_maskz_div_round_ss::<ROUNDING>(0b11111111, a, b);
        assert_eq_m128(on, _mm_set_ps(1., 2., 10., 0.5));
    }

    // Unmasked div_round on the low f64 lane: 2 / 4 is expected to give 0.5.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_div_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let a = _mm_set_pd(1., 2.);
        let b = _mm_set_pd(3., 4.);
        let quotient = _mm_div_round_sd::<ROUNDING>(a, b);
        assert_eq_m128d(quotient, _mm_set_pd(1., 0.5));
    }

    // Write-masked div_round on the low f64 lane, with a separate `src` vector.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_div_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let src = _mm_set_pd(10., 11.);
        let a = _mm_set_pd(1., 2.);
        let b = _mm_set_pd(3., 4.);

        // Mask 0: the low lane is expected to be src's low lane (11.).
        let off = _mm_mask_div_round_sd::<ROUNDING>(src, 0, a, b);
        assert_eq_m128d(off, _mm_set_pd(1., 11.));

        // Mask 0b11111111: the low lane is expected to be 0.5.
        let on = _mm_mask_div_round_sd::<ROUNDING>(src, 0b11111111, a, b);
        assert_eq_m128d(on, _mm_set_pd(1., 0.5));
    }

    // Zero-masked div_round on the low f64 lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_div_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let a = _mm_set_pd(1., 2.);
        let b = _mm_set_pd(3., 4.);

        // Mask 0: the low lane is expected to be zero.
        let off = _mm_maskz_div_round_sd::<ROUNDING>(0, a, b);
        assert_eq_m128d(off, _mm_set_pd(1., 0.));

        // Mask 0b11111111: the low lane is expected to be 0.5.
        let on = _mm_maskz_div_round_sd::<ROUNDING>(0b11111111, a, b);
        assert_eq_m128d(on, _mm_set_pd(1., 0.5));
    }

    // Unmasked max_round on the low f32 lane: max(3, 7) is expected to give 7.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_max_round_ss() {
        let (a, b) = (_mm_set_ps(0., 1., 2., 3.), _mm_set_ps(4., 5., 6., 7.));
        let largest = _mm_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b);
        assert_eq_m128(largest, _mm_set_ps(0., 1., 2., 7.));
    }

    // Write-masked max_round on the low f32 lane; `a` doubles as the `src` operand.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_max_round_ss() {
        let (a, b) = (_mm_set_ps(0., 1., 2., 3.), _mm_set_ps(4., 5., 6., 7.));

        // Mask 0: the low lane is expected to stay 3.
        let off = _mm_mask_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b);
        assert_eq_m128(off, _mm_set_ps(0., 1., 2., 3.));

        // Mask 0b11111111: the low lane is expected to be 7.
        let on = _mm_mask_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b);
        assert_eq_m128(on, _mm_set_ps(0., 1., 2., 7.));
    }

    // Zero-masked max_round on the low f32 lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_max_round_ss() {
        let (a, b) = (_mm_set_ps(0., 1., 2., 3.), _mm_set_ps(4., 5., 6., 7.));

        // Mask 0: the low lane is expected to be zero.
        let off = _mm_maskz_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(0, a, b);
        assert_eq_m128(off, _mm_set_ps(0., 1., 2., 0.));

        // Mask 0b11111111: the low lane is expected to be 7.
        let on = _mm_maskz_max_round_ss::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b);
        assert_eq_m128(on, _mm_set_ps(0., 1., 2., 7.));
    }

    // Unmasked max_round on the low f64 lane: max(1, 3) is expected to give 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_max_round_sd() {
        let (a, b) = (_mm_set_pd(0., 1.), _mm_set_pd(2., 3.));
        let largest = _mm_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b);
        assert_eq_m128d(largest, _mm_set_pd(0., 3.));
    }

    // Write-masked max_round on the low f64 lane; `a` doubles as the `src` operand.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_max_round_sd() {
        let (a, b) = (_mm_set_pd(0., 1.), _mm_set_pd(2., 3.));

        // Mask 0: the low lane is expected to stay 1.
        let off = _mm_mask_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b);
        assert_eq_m128d(off, _mm_set_pd(0., 1.));

        // Mask 0b11111111: the low lane is expected to be 3.
        let on = _mm_mask_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b);
        assert_eq_m128d(on, _mm_set_pd(0., 3.));
    }

    // Zero-masked max_round on the low f64 lane.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_max_round_sd() {
        let (a, b) = (_mm_set_pd(0., 1.), _mm_set_pd(2., 3.));

        // Mask 0: the low lane is expected to be zero (the high lane of `a` is 0. as well).
        let off = _mm_maskz_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b);
        assert_eq_m128d(off, _mm_set_pd(0., 0.));

        // Mask 0b11111111: the low lane is expected to be 3.
        let on = _mm_maskz_max_round_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b);
        assert_eq_m128d(on, _mm_set_pd(0., 3.));
    }

    // Scalar f32 min: the last element is expected to be min(3, 7) = 3, so the whole
    // result is expected to equal the values held in `a`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_min_round_ss() {
        let a = _mm_set_ps(0., 1., 2., 3.);
        let b = _mm_set_ps(4., 5., 6., 7.);

        let got = _mm_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b);
        let want = _mm_set_ps(0., 1., 2., 3.);
        assert_eq_m128(got, want);
    }

    // Masked scalar f32 min: both the zero mask and the set mask are expected to give
    // (0, 1, 2, 3), because min(3, 7) = 3 coincides with the fallback value.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_min_round_ss() {
        let a = _mm_set_ps(0., 1., 2., 3.);
        let b = _mm_set_ps(4., 5., 6., 7.);

        let got = _mm_mask_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b);
        let want = _mm_set_ps(0., 1., 2., 3.);
        assert_eq_m128(got, want);

        let got = _mm_mask_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b);
        let want = _mm_set_ps(0., 1., 2., 3.);
        assert_eq_m128(got, want);
    }

    // Zero-masked scalar f32 min: mask 0 is expected to zero the last element; with the
    // mask set, that element is expected to become min(3, 7) = 3.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_min_round_ss() {
        let a = _mm_set_ps(0., 1., 2., 3.);
        let b = _mm_set_ps(4., 5., 6., 7.);

        let got = _mm_maskz_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(0, a, b);
        let want = _mm_set_ps(0., 1., 2., 0.);
        assert_eq_m128(got, want);

        let got = _mm_maskz_min_round_ss::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b);
        let want = _mm_set_ps(0., 1., 2., 3.);
        assert_eq_m128(got, want);
    }

    // Scalar f64 min: the last element is expected to be min(1, 3) = 1, the first
    // element is expected to match `a`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_min_round_sd() {
        let a = _mm_set_pd(0., 1.);
        let b = _mm_set_pd(2., 3.);

        let got = _mm_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b);
        let want = _mm_set_pd(0., 1.);
        assert_eq_m128d(got, want);
    }

    // Masked scalar f64 min: both the zero mask and the set mask are expected to give
    // (0, 1), because min(1, 3) = 1 coincides with the fallback value.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_min_round_sd() {
        let a = _mm_set_pd(0., 1.);
        let b = _mm_set_pd(2., 3.);

        let got = _mm_mask_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b);
        let want = _mm_set_pd(0., 1.);
        assert_eq_m128d(got, want);

        let got = _mm_mask_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b);
        let want = _mm_set_pd(0., 1.);
        assert_eq_m128d(got, want);
    }

    // Zero-masked scalar f64 min: mask 0 is expected to zero the last element; with the
    // mask set, that element is expected to become min(1, 3) = 1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_min_round_sd() {
        let a = _mm_set_pd(0., 1.);
        let b = _mm_set_pd(2., 3.);

        let got = _mm_maskz_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b);
        let want = _mm_set_pd(0., 0.);
        assert_eq_m128d(got, want);

        let got = _mm_maskz_min_round_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b);
        let want = _mm_set_pd(0., 1.);
        assert_eq_m128d(got, want);
    }

    // Scalar f32 square root: the last element is expected to be sqrt(4) = 2 (taken
    // from `b`), the remaining elements are expected to match `a`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_sqrt_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let a = _mm_set_ps(1., 2., 10., 20.);
        let b = _mm_set_ps(3., 4., 30., 4.);

        let got = _mm_sqrt_round_ss::<ROUNDING>(a, b);
        let want = _mm_set_ps(1., 2., 10., 2.);
        assert_eq_m128(got, want);
    }

    // Masked scalar f32 square root: with mask 0 the last element is expected to be
    // 110 (the value in `src`); with the mask set it is expected to be sqrt(4) = 2.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_sqrt_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let src = _mm_set_ps(10., 11., 100., 110.);
        let a = _mm_set_ps(1., 2., 10., 20.);
        let b = _mm_set_ps(3., 4., 30., 4.);

        let got = _mm_mask_sqrt_round_ss::<ROUNDING>(src, 0, a, b);
        let want = _mm_set_ps(1., 2., 10., 110.);
        assert_eq_m128(got, want);

        let got = _mm_mask_sqrt_round_ss::<ROUNDING>(src, 0b11111111, a, b);
        let want = _mm_set_ps(1., 2., 10., 2.);
        assert_eq_m128(got, want);
    }

    // Zero-masked scalar f32 square root: mask 0 is expected to zero the last element;
    // with the mask set, that element is expected to become sqrt(4) = 2.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_sqrt_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let a = _mm_set_ps(1., 2., 10., 20.);
        let b = _mm_set_ps(3., 4., 30., 4.);

        let got = _mm_maskz_sqrt_round_ss::<ROUNDING>(0, a, b);
        let want = _mm_set_ps(1., 2., 10., 0.);
        assert_eq_m128(got, want);

        let got = _mm_maskz_sqrt_round_ss::<ROUNDING>(0b11111111, a, b);
        let want = _mm_set_ps(1., 2., 10., 2.);
        assert_eq_m128(got, want);
    }

    // Scalar f64 square root: the last element is expected to be sqrt(4) = 2 (taken
    // from `b`), the first element is expected to match `a`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_sqrt_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let a = _mm_set_pd(1., 2.);
        let b = _mm_set_pd(3., 4.);

        let got = _mm_sqrt_round_sd::<ROUNDING>(a, b);
        let want = _mm_set_pd(1., 2.);
        assert_eq_m128d(got, want);
    }

    // Masked scalar f64 square root: with mask 0 the last element is expected to be 11
    // (the value in `src`); with the mask set it is expected to be sqrt(4) = 2.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_sqrt_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let src = _mm_set_pd(10., 11.);
        let a = _mm_set_pd(1., 2.);
        let b = _mm_set_pd(3., 4.);

        let got = _mm_mask_sqrt_round_sd::<ROUNDING>(src, 0, a, b);
        let want = _mm_set_pd(1., 11.);
        assert_eq_m128d(got, want);

        let got = _mm_mask_sqrt_round_sd::<ROUNDING>(src, 0b11111111, a, b);
        let want = _mm_set_pd(1., 2.);
        assert_eq_m128d(got, want);
    }

    // Zero-masked scalar f64 square root: mask 0 is expected to zero the last element;
    // with the mask set, that element is expected to become sqrt(4) = 2.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_sqrt_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let a = _mm_set_pd(1., 2.);
        let b = _mm_set_pd(3., 4.);

        let got = _mm_maskz_sqrt_round_sd::<ROUNDING>(0, a, b);
        let want = _mm_set_pd(1., 0.);
        assert_eq_m128d(got, want);

        let got = _mm_maskz_sqrt_round_sd::<ROUNDING>(0b11111111, a, b);
        let want = _mm_set_pd(1., 2.);
        assert_eq_m128d(got, want);
    }

    // Scalar f32 exponent extraction: for `b` = 3.0 the last element is expected to be
    // 1.0, with the other elements matching `a` (2.0).
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_getexp_round_ss() {
        let a = _mm_set1_ps(2.);
        let b = _mm_set1_ps(3.);

        let got = _mm_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, b);
        let want = _mm_set_ps(2., 2., 2., 1.);
        assert_eq_m128(got, want);
    }

    // Masked scalar f32 exponent extraction: with mask 0 every element is expected to
    // stay 2.0; with the mask set, the last element is expected to become 1.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_getexp_round_ss() {
        let a = _mm_set1_ps(2.);
        let b = _mm_set1_ps(3.);

        let got = _mm_mask_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b);
        let want = _mm_set_ps(2., 2., 2., 2.);
        assert_eq_m128(got, want);

        let got = _mm_mask_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b);
        let want = _mm_set_ps(2., 2., 2., 1.);
        assert_eq_m128(got, want);
    }

    // Zero-masked scalar f32 exponent extraction: mask 0 is expected to zero the last
    // element; with the mask set, that element is expected to become 1.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_getexp_round_ss() {
        let a = _mm_set1_ps(2.);
        let b = _mm_set1_ps(3.);

        let got = _mm_maskz_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(0, a, b);
        let want = _mm_set_ps(2., 2., 2., 0.);
        assert_eq_m128(got, want);

        let got = _mm_maskz_getexp_round_ss::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b);
        let want = _mm_set_ps(2., 2., 2., 1.);
        assert_eq_m128(got, want);
    }

    // Scalar f64 exponent extraction: for `b` = 3.0 the last element is expected to be
    // 1.0, with the first element matching `a` (2.0).
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_getexp_round_sd() {
        let a = _mm_set1_pd(2.);
        let b = _mm_set1_pd(3.);

        let got = _mm_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, b);
        let want = _mm_set_pd(2., 1.);
        assert_eq_m128d(got, want);
    }

    // Masked scalar f64 exponent extraction: with mask 0 both elements are expected to
    // stay 2.0; with the mask set, the last element is expected to become 1.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_getexp_round_sd() {
        let a = _mm_set1_pd(2.);
        let b = _mm_set1_pd(3.);

        let got = _mm_mask_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0, a, b);
        let want = _mm_set_pd(2., 2.);
        assert_eq_m128d(got, want);

        let got = _mm_mask_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b);
        let want = _mm_set_pd(2., 1.);
        assert_eq_m128d(got, want);
    }

    // Zero-masked scalar f64 exponent extraction: mask 0 is expected to zero the last
    // element; with the mask set, that element is expected to become 1.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_getexp_round_sd() {
        let a = _mm_set1_pd(2.);
        let b = _mm_set1_pd(3.);

        let got = _mm_maskz_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(0, a, b);
        let want = _mm_set_pd(2., 0.);
        assert_eq_m128d(got, want);

        let got = _mm_maskz_getexp_round_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, a, b);
        let want = _mm_set_pd(2., 1.);
        assert_eq_m128d(got, want);
    }

    // Scalar f32 mantissa extraction with `_MM_MANT_NORM_1_2` / `_MM_MANT_SIGN_SRC`:
    // for `b` = 10.0 the last element is expected to be 1.25, the rest matching `a`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_getmant_round_ss() {
        let a = _mm_set1_ps(20.);
        let b = _mm_set1_ps(10.);

        let got = _mm_getmant_round_ss::<
            _MM_MANT_NORM_1_2,
            _MM_MANT_SIGN_SRC,
            _MM_FROUND_CUR_DIRECTION,
        >(a, b);
        let want = _mm_set_ps(20., 20., 20., 1.25);
        assert_eq_m128(got, want);
    }

    // Masked scalar f32 mantissa extraction: with mask 0 every element is expected to
    // stay 20.0; with the mask set, the last element is expected to become 1.25.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_getmant_round_ss() {
        let a = _mm_set1_ps(20.);
        let b = _mm_set1_ps(10.);

        let got = _mm_mask_getmant_round_ss::<
            _MM_MANT_NORM_1_2,
            _MM_MANT_SIGN_SRC,
            _MM_FROUND_CUR_DIRECTION,
        >(a, 0, a, b);
        let want = _mm_set_ps(20., 20., 20., 20.);
        assert_eq_m128(got, want);

        let got = _mm_mask_getmant_round_ss::<
            _MM_MANT_NORM_1_2,
            _MM_MANT_SIGN_SRC,
            _MM_FROUND_CUR_DIRECTION,
        >(a, 0b11111111, a, b);
        let want = _mm_set_ps(20., 20., 20., 1.25);
        assert_eq_m128(got, want);
    }

    // Zero-masked scalar f32 mantissa extraction: mask 0 is expected to zero the last
    // element; with the mask set, that element is expected to become 1.25.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_getmant_round_ss() {
        let a = _mm_set1_ps(20.);
        let b = _mm_set1_ps(10.);

        let got = _mm_maskz_getmant_round_ss::<
            _MM_MANT_NORM_1_2,
            _MM_MANT_SIGN_SRC,
            _MM_FROUND_CUR_DIRECTION,
        >(0, a, b);
        let want = _mm_set_ps(20., 20., 20., 0.);
        assert_eq_m128(got, want);

        let got = _mm_maskz_getmant_round_ss::<
            _MM_MANT_NORM_1_2,
            _MM_MANT_SIGN_SRC,
            _MM_FROUND_CUR_DIRECTION,
        >(0b11111111, a, b);
        let want = _mm_set_ps(20., 20., 20., 1.25);
        assert_eq_m128(got, want);
    }

    // Scalar f64 mantissa extraction with `_MM_MANT_NORM_1_2` / `_MM_MANT_SIGN_SRC`:
    // for `b` = 10.0 the last element is expected to be 1.25, the first matching `a`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_getmant_round_sd() {
        let a = _mm_set1_pd(20.);
        let b = _mm_set1_pd(10.);

        let got = _mm_getmant_round_sd::<
            _MM_MANT_NORM_1_2,
            _MM_MANT_SIGN_SRC,
            _MM_FROUND_CUR_DIRECTION,
        >(a, b);
        let want = _mm_set_pd(20., 1.25);
        assert_eq_m128d(got, want);
    }

    // Masked scalar f64 mantissa extraction: with mask 0 both elements are expected to
    // stay 20.0; with the mask set, the last element is expected to become 1.25.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_getmant_round_sd() {
        let a = _mm_set1_pd(20.);
        let b = _mm_set1_pd(10.);

        let got = _mm_mask_getmant_round_sd::<
            _MM_MANT_NORM_1_2,
            _MM_MANT_SIGN_SRC,
            _MM_FROUND_CUR_DIRECTION,
        >(a, 0, a, b);
        let want = _mm_set_pd(20., 20.);
        assert_eq_m128d(got, want);

        let got = _mm_mask_getmant_round_sd::<
            _MM_MANT_NORM_1_2,
            _MM_MANT_SIGN_SRC,
            _MM_FROUND_CUR_DIRECTION,
        >(a, 0b11111111, a, b);
        let want = _mm_set_pd(20., 1.25);
        assert_eq_m128d(got, want);
    }

    // Zero-masked scalar f64 mantissa extraction: mask 0 is expected to zero the last
    // element; with the mask set, that element is expected to become 1.25.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_getmant_round_sd() {
        let a = _mm_set1_pd(20.);
        let b = _mm_set1_pd(10.);

        let got = _mm_maskz_getmant_round_sd::<
            _MM_MANT_NORM_1_2,
            _MM_MANT_SIGN_SRC,
            _MM_FROUND_CUR_DIRECTION,
        >(0, a, b);
        let want = _mm_set_pd(20., 0.);
        assert_eq_m128d(got, want);

        let got = _mm_maskz_getmant_round_sd::<
            _MM_MANT_NORM_1_2,
            _MM_MANT_SIGN_SRC,
            _MM_FROUND_CUR_DIRECTION,
        >(0b11111111, a, b);
        let want = _mm_set_pd(20., 1.25);
        assert_eq_m128d(got, want);
    }

    // Scalar f32 roundscale with immediate 0: for `b` = 1.1 the last element is
    // expected to be 1.0, with the other elements matching `a` (2.2).
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_roundscale_round_ss() {
        let a = _mm_set1_ps(2.2);
        let b = _mm_set1_ps(1.1);

        let got = _mm_roundscale_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(a, b);
        let want = _mm_set_ps(2.2, 2.2, 2.2, 1.0);
        assert_eq_m128(got, want);
    }

    // Masked scalar f32 roundscale: with mask 0 every element is expected to stay 2.2;
    // with the mask set, the last element is expected to become 1.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_roundscale_round_ss() {
        let a = _mm_set1_ps(2.2);
        let b = _mm_set1_ps(1.1);

        let got = _mm_mask_roundscale_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(a, 0, a, b);
        let want = _mm_set_ps(2.2, 2.2, 2.2, 2.2);
        assert_eq_m128(got, want);

        let got = _mm_mask_roundscale_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b);
        let want = _mm_set_ps(2.2, 2.2, 2.2, 1.0);
        assert_eq_m128(got, want);
    }

    // Zero-masked scalar f32 roundscale: mask 0 is expected to zero the last element;
    // with the mask set, that element is expected to become 1.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_roundscale_round_ss() {
        let a = _mm_set1_ps(2.2);
        let b = _mm_set1_ps(1.1);

        let got = _mm_maskz_roundscale_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(0, a, b);
        let want = _mm_set_ps(2.2, 2.2, 2.2, 0.0);
        assert_eq_m128(got, want);

        let got = _mm_maskz_roundscale_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(0b11111111, a, b);
        let want = _mm_set_ps(2.2, 2.2, 2.2, 1.0);
        assert_eq_m128(got, want);
    }

    // Scalar f64 roundscale with immediate 0: for `b` = 1.1 the last element is
    // expected to be 1.0, with the first element matching `a` (2.2).
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_roundscale_round_sd() {
        let a = _mm_set1_pd(2.2);
        let b = _mm_set1_pd(1.1);

        let got = _mm_roundscale_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(a, b);
        let want = _mm_set_pd(2.2, 1.0);
        assert_eq_m128d(got, want);
    }

    // Masked scalar f64 roundscale: with mask 0 both elements are expected to stay 2.2;
    // with the mask set, the last element is expected to become 1.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_roundscale_round_sd() {
        let a = _mm_set1_pd(2.2);
        let b = _mm_set1_pd(1.1);

        let got = _mm_mask_roundscale_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(a, 0, a, b);
        let want = _mm_set_pd(2.2, 2.2);
        assert_eq_m128d(got, want);

        let got = _mm_mask_roundscale_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(a, 0b11111111, a, b);
        let want = _mm_set_pd(2.2, 1.0);
        assert_eq_m128d(got, want);
    }

    // Zero-masked scalar f64 roundscale: mask 0 is expected to zero the last element;
    // with the mask set, that element is expected to become 1.0.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_roundscale_round_sd() {
        let a = _mm_set1_pd(2.2);
        let b = _mm_set1_pd(1.1);

        let got = _mm_maskz_roundscale_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(0, a, b);
        let want = _mm_set_pd(2.2, 0.0);
        assert_eq_m128d(got, want);

        let got = _mm_maskz_roundscale_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(0b11111111, a, b);
        let want = _mm_set_pd(2.2, 1.0);
        assert_eq_m128d(got, want);
    }

    // Scalar f32 scalef: with `a` = 1 and `b` = 3 the last element is expected to be
    // 8 (1 * 2^3), the other elements matching `a`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_scalef_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_ps(1.);
        let b = _mm_set1_ps(3.);

        let got = _mm_scalef_round_ss::<ROUNDING>(a, b);
        let want = _mm_set_ps(1., 1., 1., 8.);
        assert_eq_m128(got, want);
    }

    // Masked scalar f32 scalef: with mask 0 every element is expected to stay 1; with
    // the mask set, the last element is expected to become 8.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_scalef_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_ps(1.);
        let b = _mm_set1_ps(3.);

        let got = _mm_mask_scalef_round_ss::<ROUNDING>(a, 0, a, b);
        let want = _mm_set_ps(1., 1., 1., 1.);
        assert_eq_m128(got, want);

        let got = _mm_mask_scalef_round_ss::<ROUNDING>(a, 0b11111111, a, b);
        let want = _mm_set_ps(1., 1., 1., 8.);
        assert_eq_m128(got, want);
    }

    // Zero-masked scalar f32 scalef: mask 0 is expected to zero the last element; with
    // the mask set, that element is expected to become 8.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_scalef_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_ps(1.);
        let b = _mm_set1_ps(3.);

        let got = _mm_maskz_scalef_round_ss::<ROUNDING>(0, a, b);
        let want = _mm_set_ps(1., 1., 1., 0.);
        assert_eq_m128(got, want);

        let got = _mm_maskz_scalef_round_ss::<ROUNDING>(0b11111111, a, b);
        let want = _mm_set_ps(1., 1., 1., 8.);
        assert_eq_m128(got, want);
    }

    // Scalar f64 scalef: with `a` = 1 and `b` = 3 the last element is expected to be
    // 8 (1 * 2^3), the first element matching `a`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_scalef_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_pd(1.);
        let b = _mm_set1_pd(3.);

        let got = _mm_scalef_round_sd::<ROUNDING>(a, b);
        let want = _mm_set_pd(1., 8.);
        assert_eq_m128d(got, want);
    }

    // Masked scalar f64 scalef: with mask 0 both elements are expected to stay 1; with
    // the mask set, the last element is expected to become 8.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_scalef_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_pd(1.);
        let b = _mm_set1_pd(3.);

        let got = _mm_mask_scalef_round_sd::<ROUNDING>(a, 0, a, b);
        let want = _mm_set_pd(1., 1.);
        assert_eq_m128d(got, want);

        let got = _mm_mask_scalef_round_sd::<ROUNDING>(a, 0b11111111, a, b);
        let want = _mm_set_pd(1., 8.);
        assert_eq_m128d(got, want);
    }

    // Zero-masked scalar f64 scalef: mask 0 is expected to zero the last element; with
    // the mask set, that element is expected to become 8.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_scalef_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_pd(1.);
        let b = _mm_set1_pd(3.);

        let got = _mm_maskz_scalef_round_sd::<ROUNDING>(0, a, b);
        let want = _mm_set_pd(1., 0.);
        assert_eq_m128d(got, want);

        let got = _mm_maskz_scalef_round_sd::<ROUNDING>(0b11111111, a, b);
        let want = _mm_set_pd(1., 8.);
        assert_eq_m128d(got, want);
    }

    // Scalar f32 fused multiply-add: the last element is expected to be 1 * 2 + 3 = 5,
    // the other elements matching `a`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_fmadd_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_ps(1.);
        let b = _mm_set1_ps(2.);
        let c = _mm_set1_ps(3.);

        let got = _mm_fmadd_round_ss::<ROUNDING>(a, b, c);
        let want = _mm_set_ps(1., 1., 1., 5.);
        assert_eq_m128(got, want);
    }

    // Masked scalar f32 fused multiply-add: mask 0 is expected to return `a`; with the
    // mask set, the last element is expected to become 1 * 2 + 3 = 5.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_fmadd_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_ps(1.);
        let b = _mm_set1_ps(2.);
        let c = _mm_set1_ps(3.);

        let got = _mm_mask_fmadd_round_ss::<ROUNDING>(a, 0, b, c);
        assert_eq_m128(got, a);

        let got = _mm_mask_fmadd_round_ss::<ROUNDING>(a, 0b11111111, b, c);
        let want = _mm_set_ps(1., 1., 1., 5.);
        assert_eq_m128(got, want);
    }

    // Zero-masked scalar f32 fused multiply-add: mask 0 is expected to zero the last
    // element; with the mask set, that element is expected to become 1 * 2 + 3 = 5.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_fmadd_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_ps(1.);
        let b = _mm_set1_ps(2.);
        let c = _mm_set1_ps(3.);

        let got = _mm_maskz_fmadd_round_ss::<ROUNDING>(0, a, b, c);
        let want = _mm_set_ps(1., 1., 1., 0.);
        assert_eq_m128(got, want);

        let got = _mm_maskz_fmadd_round_ss::<ROUNDING>(0b11111111, a, b, c);
        let want = _mm_set_ps(1., 1., 1., 5.);
        assert_eq_m128(got, want);
    }

    // mask3 scalar f32 fused multiply-add: mask 0 is expected to return `c`; with the
    // mask set, the last element is expected to become 5 while the rest match `c`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask3_fmadd_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_ps(1.);
        let b = _mm_set1_ps(2.);
        let c = _mm_set1_ps(3.);

        let got = _mm_mask3_fmadd_round_ss::<ROUNDING>(a, b, c, 0);
        assert_eq_m128(got, c);

        let got = _mm_mask3_fmadd_round_ss::<ROUNDING>(a, b, c, 0b11111111);
        let want = _mm_set_ps(3., 3., 3., 5.);
        assert_eq_m128(got, want);
    }

    // Scalar f64 fused multiply-add: the last element is expected to be 1 * 2 + 3 = 5,
    // the first element matching `a`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_fmadd_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_pd(1.);
        let b = _mm_set1_pd(2.);
        let c = _mm_set1_pd(3.);

        let got = _mm_fmadd_round_sd::<ROUNDING>(a, b, c);
        let want = _mm_set_pd(1., 5.);
        assert_eq_m128d(got, want);
    }

    // Masked scalar f64 fused multiply-add: mask 0 is expected to return `a`; with the
    // mask set, the last element is expected to become 1 * 2 + 3 = 5.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_fmadd_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_pd(1.);
        let b = _mm_set1_pd(2.);
        let c = _mm_set1_pd(3.);

        let got = _mm_mask_fmadd_round_sd::<ROUNDING>(a, 0, b, c);
        assert_eq_m128d(got, a);

        let got = _mm_mask_fmadd_round_sd::<ROUNDING>(a, 0b11111111, b, c);
        let want = _mm_set_pd(1., 5.);
        assert_eq_m128d(got, want);
    }

    // Zero-masked scalar f64 fused multiply-add: mask 0 is expected to zero the last
    // element; with the mask set, that element is expected to become 1 * 2 + 3 = 5.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_fmadd_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_pd(1.);
        let b = _mm_set1_pd(2.);
        let c = _mm_set1_pd(3.);

        let got = _mm_maskz_fmadd_round_sd::<ROUNDING>(0, a, b, c);
        let want = _mm_set_pd(1., 0.);
        assert_eq_m128d(got, want);

        let got = _mm_maskz_fmadd_round_sd::<ROUNDING>(0b11111111, a, b, c);
        let want = _mm_set_pd(1., 5.);
        assert_eq_m128d(got, want);
    }

    // mask3 scalar f64 fused multiply-add: mask 0 is expected to return `c`; with the
    // mask set, the last element is expected to become 5 while the first matches `c`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask3_fmadd_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_pd(1.);
        let b = _mm_set1_pd(2.);
        let c = _mm_set1_pd(3.);

        let got = _mm_mask3_fmadd_round_sd::<ROUNDING>(a, b, c, 0);
        assert_eq_m128d(got, c);

        let got = _mm_mask3_fmadd_round_sd::<ROUNDING>(a, b, c, 0b11111111);
        let want = _mm_set_pd(3., 5.);
        assert_eq_m128d(got, want);
    }

    // Scalar f32 fused multiply-subtract: the last element is expected to be
    // 1 * 2 - 3 = -1, the other elements matching `a`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_fmsub_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_ps(1.);
        let b = _mm_set1_ps(2.);
        let c = _mm_set1_ps(3.);

        let got = _mm_fmsub_round_ss::<ROUNDING>(a, b, c);
        let want = _mm_set_ps(1., 1., 1., -1.);
        assert_eq_m128(got, want);
    }

    // Masked scalar f32 fused multiply-subtract: mask 0 is expected to return `a`; with
    // the mask set, the last element is expected to become 1 * 2 - 3 = -1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_fmsub_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_ps(1.);
        let b = _mm_set1_ps(2.);
        let c = _mm_set1_ps(3.);

        let got = _mm_mask_fmsub_round_ss::<ROUNDING>(a, 0, b, c);
        assert_eq_m128(got, a);

        let got = _mm_mask_fmsub_round_ss::<ROUNDING>(a, 0b11111111, b, c);
        let want = _mm_set_ps(1., 1., 1., -1.);
        assert_eq_m128(got, want);
    }

    // Zero-masked scalar f32 fused multiply-subtract: mask 0 is expected to zero the
    // last element; with the mask set, that element is expected to become -1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_fmsub_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_ps(1.);
        let b = _mm_set1_ps(2.);
        let c = _mm_set1_ps(3.);

        let got = _mm_maskz_fmsub_round_ss::<ROUNDING>(0, a, b, c);
        let want = _mm_set_ps(1., 1., 1., 0.);
        assert_eq_m128(got, want);

        let got = _mm_maskz_fmsub_round_ss::<ROUNDING>(0b11111111, a, b, c);
        let want = _mm_set_ps(1., 1., 1., -1.);
        assert_eq_m128(got, want);
    }

    // mask3 scalar f32 fused multiply-subtract: mask 0 is expected to return `c`; with
    // the mask set, the last element is expected to become -1 while the rest match `c`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask3_fmsub_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_ps(1.);
        let b = _mm_set1_ps(2.);
        let c = _mm_set1_ps(3.);

        let got = _mm_mask3_fmsub_round_ss::<ROUNDING>(a, b, c, 0);
        assert_eq_m128(got, c);

        let got = _mm_mask3_fmsub_round_ss::<ROUNDING>(a, b, c, 0b11111111);
        let want = _mm_set_ps(3., 3., 3., -1.);
        assert_eq_m128(got, want);
    }

    // Scalar f64 fused multiply-subtract: the last element is expected to be
    // 1 * 2 - 3 = -1, the first element matching `a`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_fmsub_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_pd(1.);
        let b = _mm_set1_pd(2.);
        let c = _mm_set1_pd(3.);

        let got = _mm_fmsub_round_sd::<ROUNDING>(a, b, c);
        let want = _mm_set_pd(1., -1.);
        assert_eq_m128d(got, want);
    }

    // Masked scalar f64 fused multiply-subtract: mask 0 is expected to return `a`; with
    // the mask set, the last element is expected to become 1 * 2 - 3 = -1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_fmsub_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_pd(1.);
        let b = _mm_set1_pd(2.);
        let c = _mm_set1_pd(3.);

        let got = _mm_mask_fmsub_round_sd::<ROUNDING>(a, 0, b, c);
        assert_eq_m128d(got, a);

        let got = _mm_mask_fmsub_round_sd::<ROUNDING>(a, 0b11111111, b, c);
        let want = _mm_set_pd(1., -1.);
        assert_eq_m128d(got, want);
    }

    // Zero-masked scalar f64 fused multiply-subtract: mask 0 is expected to zero the
    // last element; with the mask set, that element is expected to become -1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_fmsub_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_pd(1.);
        let b = _mm_set1_pd(2.);
        let c = _mm_set1_pd(3.);

        let got = _mm_maskz_fmsub_round_sd::<ROUNDING>(0, a, b, c);
        let want = _mm_set_pd(1., 0.);
        assert_eq_m128d(got, want);

        let got = _mm_maskz_fmsub_round_sd::<ROUNDING>(0b11111111, a, b, c);
        let want = _mm_set_pd(1., -1.);
        assert_eq_m128d(got, want);
    }

    // mask3 scalar f64 fused multiply-subtract: mask 0 is expected to return `c`; with
    // the mask set, the last element is expected to become -1 while the first matches `c`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask3_fmsub_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_pd(1.);
        let b = _mm_set1_pd(2.);
        let c = _mm_set1_pd(3.);

        let got = _mm_mask3_fmsub_round_sd::<ROUNDING>(a, b, c, 0);
        assert_eq_m128d(got, c);

        let got = _mm_mask3_fmsub_round_sd::<ROUNDING>(a, b, c, 0b11111111);
        let want = _mm_set_pd(3., -1.);
        assert_eq_m128d(got, want);
    }

    // Scalar f32 fused negated multiply-add: the last element is expected to be
    // -(1 * 2) + 3 = 1, the other elements matching `a`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_fnmadd_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_ps(1.);
        let b = _mm_set1_ps(2.);
        let c = _mm_set1_ps(3.);

        let got = _mm_fnmadd_round_ss::<ROUNDING>(a, b, c);
        let want = _mm_set_ps(1., 1., 1., 1.);
        assert_eq_m128(got, want);
    }

    // Masked scalar f32 fused negated multiply-add: mask 0 is expected to return `a`;
    // with the mask set the last element is expected to be -(1 * 2) + 3 = 1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_fnmadd_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_ps(1.);
        let b = _mm_set1_ps(2.);
        let c = _mm_set1_ps(3.);

        let got = _mm_mask_fnmadd_round_ss::<ROUNDING>(a, 0, b, c);
        assert_eq_m128(got, a);

        let got = _mm_mask_fnmadd_round_ss::<ROUNDING>(a, 0b11111111, b, c);
        let want = _mm_set_ps(1., 1., 1., 1.);
        assert_eq_m128(got, want);
    }

    // Zero-masked scalar f32 fused negated multiply-add: mask 0 is expected to zero the
    // last element; with the mask set, that element is expected to become 1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_fnmadd_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_ps(1.);
        let b = _mm_set1_ps(2.);
        let c = _mm_set1_ps(3.);

        let got = _mm_maskz_fnmadd_round_ss::<ROUNDING>(0, a, b, c);
        let want = _mm_set_ps(1., 1., 1., 0.);
        assert_eq_m128(got, want);

        let got = _mm_maskz_fnmadd_round_ss::<ROUNDING>(0b11111111, a, b, c);
        let want = _mm_set_ps(1., 1., 1., 1.);
        assert_eq_m128(got, want);
    }

    // mask3 scalar f32 fused negated multiply-add: mask 0 is expected to return `c`;
    // with the mask set, the last element is expected to become 1, the rest match `c`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask3_fnmadd_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_ps(1.);
        let b = _mm_set1_ps(2.);
        let c = _mm_set1_ps(3.);

        let got = _mm_mask3_fnmadd_round_ss::<ROUNDING>(a, b, c, 0);
        assert_eq_m128(got, c);

        let got = _mm_mask3_fnmadd_round_ss::<ROUNDING>(a, b, c, 0b11111111);
        let want = _mm_set_ps(3., 3., 3., 1.);
        assert_eq_m128(got, want);
    }

    // Scalar f64 fused negated multiply-add: the last element is expected to be
    // -(1 * 2) + 3 = 1, the first element matching `a`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_fnmadd_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_pd(1.);
        let b = _mm_set1_pd(2.);
        let c = _mm_set1_pd(3.);

        let got = _mm_fnmadd_round_sd::<ROUNDING>(a, b, c);
        let want = _mm_set_pd(1., 1.);
        assert_eq_m128d(got, want);
    }

    // Masked scalar f64 fused negated multiply-add: mask 0 is expected to return `a`;
    // with the mask set the last element is expected to be -(1 * 2) + 3 = 1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_fnmadd_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_pd(1.);
        let b = _mm_set1_pd(2.);
        let c = _mm_set1_pd(3.);

        let got = _mm_mask_fnmadd_round_sd::<ROUNDING>(a, 0, b, c);
        assert_eq_m128d(got, a);

        let got = _mm_mask_fnmadd_round_sd::<ROUNDING>(a, 0b11111111, b, c);
        let want = _mm_set_pd(1., 1.);
        assert_eq_m128d(got, want);
    }

    // Zero-masked scalar f64 fused negated multiply-add: mask 0 is expected to zero the
    // last element; with the mask set, that element is expected to become 1.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_fnmadd_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_pd(1.);
        let b = _mm_set1_pd(2.);
        let c = _mm_set1_pd(3.);

        let got = _mm_maskz_fnmadd_round_sd::<ROUNDING>(0, a, b, c);
        let want = _mm_set_pd(1., 0.);
        assert_eq_m128d(got, want);

        let got = _mm_maskz_fnmadd_round_sd::<ROUNDING>(0b11111111, a, b, c);
        let want = _mm_set_pd(1., 1.);
        assert_eq_m128d(got, want);
    }

    // mask3 scalar f64 fused negated multiply-add: mask 0 is expected to return `c`;
    // with the mask set, the last element is expected to become 1, the first matches `c`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask3_fnmadd_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_pd(1.);
        let b = _mm_set1_pd(2.);
        let c = _mm_set1_pd(3.);

        let got = _mm_mask3_fnmadd_round_sd::<ROUNDING>(a, b, c, 0);
        assert_eq_m128d(got, c);

        let got = _mm_mask3_fnmadd_round_sd::<ROUNDING>(a, b, c, 0b11111111);
        let want = _mm_set_pd(3., 1.);
        assert_eq_m128d(got, want);
    }

    // Scalar f32 fused negated multiply-subtract: the last element is expected to be
    // -(1 * 2) - 3 = -5, the other elements matching `a`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_fnmsub_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_ps(1.);
        let b = _mm_set1_ps(2.);
        let c = _mm_set1_ps(3.);

        let got = _mm_fnmsub_round_ss::<ROUNDING>(a, b, c);
        let want = _mm_set_ps(1., 1., 1., -5.);
        assert_eq_m128(got, want);
    }

    // Masked scalar f32 fused negated multiply-subtract: mask 0 is expected to return
    // `a`; with the mask set, the last element is expected to become -5.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_fnmsub_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_ps(1.);
        let b = _mm_set1_ps(2.);
        let c = _mm_set1_ps(3.);

        let got = _mm_mask_fnmsub_round_ss::<ROUNDING>(a, 0, b, c);
        assert_eq_m128(got, a);

        let got = _mm_mask_fnmsub_round_ss::<ROUNDING>(a, 0b11111111, b, c);
        let want = _mm_set_ps(1., 1., 1., -5.);
        assert_eq_m128(got, want);
    }

    // Zero-masked scalar f32 fused negated multiply-subtract: mask 0 is expected to
    // zero the last element; with the mask set, that element is expected to become -5.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_fnmsub_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_ps(1.);
        let b = _mm_set1_ps(2.);
        let c = _mm_set1_ps(3.);

        let got = _mm_maskz_fnmsub_round_ss::<ROUNDING>(0, a, b, c);
        let want = _mm_set_ps(1., 1., 1., 0.);
        assert_eq_m128(got, want);

        let got = _mm_maskz_fnmsub_round_ss::<ROUNDING>(0b11111111, a, b, c);
        let want = _mm_set_ps(1., 1., 1., -5.);
        assert_eq_m128(got, want);
    }

    // mask3 scalar f32 fused negated multiply-subtract: mask 0 is expected to return
    // `c`; with the mask set, the last element is expected to become -5, the rest match `c`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask3_fnmsub_round_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_ps(1.);
        let b = _mm_set1_ps(2.);
        let c = _mm_set1_ps(3.);

        let got = _mm_mask3_fnmsub_round_ss::<ROUNDING>(a, b, c, 0);
        assert_eq_m128(got, c);

        let got = _mm_mask3_fnmsub_round_ss::<ROUNDING>(a, b, c, 0b11111111);
        let want = _mm_set_ps(3., 3., 3., -5.);
        assert_eq_m128(got, want);
    }

    // Scalar f64 fused negated multiply-subtract: the last element is expected to be
    // -(1 * 2) - 3 = -5, the first element matching `a`.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_fnmsub_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_pd(1.);
        let b = _mm_set1_pd(2.);
        let c = _mm_set1_pd(3.);

        let got = _mm_fnmsub_round_sd::<ROUNDING>(a, b, c);
        let want = _mm_set_pd(1., -5.);
        assert_eq_m128d(got, want);
    }

    // Masked scalar f64 fused negated multiply-subtract: mask 0 is expected to return
    // `a`; with the mask set, the last element is expected to become -5.
    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_fnmsub_round_sd() {
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let a = _mm_set1_pd(1.);
        let b = _mm_set1_pd(2.);
        let c = _mm_set1_pd(3.);

        let got = _mm_mask_fnmsub_round_sd::<ROUNDING>(a, 0, b, c);
        assert_eq_m128d(got, a);

        let got = _mm_mask_fnmsub_round_sd::<ROUNDING>(a, 0b11111111, b, c);
        let want = _mm_set_pd(1., -5.);
        assert_eq_m128d(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_fnmsub_round_sd() {
        // Exercises `_mm_maskz_fnmsub_round_sd` with the mask cleared and with it set.
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let (a, b, c) = (_mm_set1_pd(1.), _mm_set1_pd(2.), _mm_set1_pd(3.));

        // Mask cleared: the lowest lane is expected to be zero, the upper lane stays at 1.
        let zeroed = _mm_maskz_fnmsub_round_sd::<ROUNDING>(0, a, b, c);
        assert_eq_m128d(zeroed, _mm_set_pd(1., 0.));

        // Mask set: the lowest lane is expected to be -(1 * 2) - 3 = -5.
        let computed = _mm_maskz_fnmsub_round_sd::<ROUNDING>(0b11111111, a, b, c);
        assert_eq_m128d(computed, _mm_set_pd(1., -5.));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask3_fnmsub_round_sd() {
        // Exercises `_mm_mask3_fnmsub_round_sd` with the mask cleared and with it set.
        const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC;
        let (a, b, c) = (_mm_set1_pd(1.), _mm_set1_pd(2.), _mm_set1_pd(3.));

        // Mask cleared: the result is expected to be `c` unchanged.
        let untouched = _mm_mask3_fnmsub_round_sd::<ROUNDING>(a, b, c, 0);
        assert_eq_m128d(untouched, c);

        // Mask set: lowest lane -(1 * 2) - 3 = -5, upper lane 3 as in `c`.
        let computed = _mm_mask3_fnmsub_round_sd::<ROUNDING>(a, b, c, 0b11111111);
        assert_eq_m128d(computed, _mm_set_pd(3., -5.));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_fixupimm_ss() {
        // The lowest lane of the first operand is NaN; with immediate 5 the test
        // expects that lane to come back as -0.0 and the upper lanes as 0.
        let nan_low = _mm_set_ps(0., 0., 0., f32::NAN);
        let all_max = _mm_set1_ps(f32::MAX);
        let all_i32_max = _mm_set1_epi32(i32::MAX);
        let got = _mm_fixupimm_ss::<5>(nan_low, all_max, all_i32_max);
        assert_eq_m128(got, _mm_set_ps(0., 0., 0., -0.0));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_fixupimm_ss() {
        // Only the mask-set case is checked: the NaN in the lowest lane is expected
        // to come back as -0.0.
        let nan_low = _mm_set_ps(0., 0., 0., f32::NAN);
        let all_max = _mm_set1_ps(f32::MAX);
        let all_i32_max = _mm_set1_epi32(i32::MAX);
        let got = _mm_mask_fixupimm_ss::<5>(nan_low, 0b11111111, all_max, all_i32_max);
        assert_eq_m128(got, _mm_set_ps(0., 0., 0., -0.0));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_fixupimm_ss() {
        let nan_low = _mm_set_ps(0., 0., 0., f32::NAN);
        let all_max = _mm_set1_ps(f32::MAX);
        let all_i32_max = _mm_set1_epi32(i32::MAX);

        // Mask cleared: the lowest lane is expected to be zeroed.
        let zeroed = _mm_maskz_fixupimm_ss::<5>(0b00000000, nan_low, all_max, all_i32_max);
        assert_eq_m128(zeroed, _mm_set_ps(0., 0., 0., 0.0));

        // Mask set: the lowest lane is expected to be -0.0.
        let fixed = _mm_maskz_fixupimm_ss::<5>(0b11111111, nan_low, all_max, all_i32_max);
        assert_eq_m128(fixed, _mm_set_ps(0., 0., 0., -0.0));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_fixupimm_sd() {
        // The lowest lane of the first operand is NaN; with immediate 5 the test
        // expects that lane to come back as -0.0 and the upper lane as 0.
        let nan_low = _mm_set_pd(0., f64::NAN);
        let all_max = _mm_set1_pd(f64::MAX);
        let all_i32_max = _mm_set1_epi64x(i32::MAX as i64);
        let got = _mm_fixupimm_sd::<5>(nan_low, all_max, all_i32_max);
        assert_eq_m128d(got, _mm_set_pd(0., -0.0));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_fixupimm_sd() {
        // Only the mask-set case is checked: the NaN in the lowest lane is expected
        // to come back as -0.0.
        let nan_low = _mm_set_pd(0., f64::NAN);
        let all_max = _mm_set1_pd(f64::MAX);
        let all_i32_max = _mm_set1_epi64x(i32::MAX as i64);
        let got = _mm_mask_fixupimm_sd::<5>(nan_low, 0b11111111, all_max, all_i32_max);
        assert_eq_m128d(got, _mm_set_pd(0., -0.0));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_fixupimm_sd() {
        let nan_low = _mm_set_pd(0., f64::NAN);
        let all_max = _mm_set1_pd(f64::MAX);
        let all_i32_max = _mm_set1_epi64x(i32::MAX as i64);

        // Mask cleared: the lowest lane is expected to be zeroed.
        let zeroed = _mm_maskz_fixupimm_sd::<5>(0b00000000, nan_low, all_max, all_i32_max);
        assert_eq_m128d(zeroed, _mm_set_pd(0., 0.0));

        // Mask set: the lowest lane is expected to be -0.0.
        let fixed = _mm_maskz_fixupimm_sd::<5>(0b11111111, nan_low, all_max, all_i32_max);
        assert_eq_m128d(fixed, _mm_set_pd(0., -0.0));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_fixupimm_round_ss() {
        // The first operand is (1, 0, 0, NaN) from the highest to the lowest lane; the
        // test expects the NaN to become -0.0 and the other lanes to be preserved.
        let nan_low = _mm_set_ps(1., 0., 0., f32::NAN);
        let all_max = _mm_set1_ps(f32::MAX);
        let all_i32_max = _mm_set1_epi32(i32::MAX);
        let got = _mm_fixupimm_round_ss::<5, _MM_FROUND_CUR_DIRECTION>(
            nan_low,
            all_max,
            all_i32_max,
        );
        assert_eq_m128(got, _mm_set_ps(1., 0., 0., -0.0));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_fixupimm_round_ss() {
        // Only the mask-set case is checked: the NaN in the lowest lane is expected
        // to come back as -0.0.
        let nan_low = _mm_set_ps(0., 0., 0., f32::NAN);
        let all_max = _mm_set1_ps(f32::MAX);
        let all_i32_max = _mm_set1_epi32(i32::MAX);
        let got = _mm_mask_fixupimm_round_ss::<5, _MM_FROUND_CUR_DIRECTION>(
            nan_low,
            0b11111111,
            all_max,
            all_i32_max,
        );
        assert_eq_m128(got, _mm_set_ps(0., 0., 0., -0.0));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_fixupimm_round_ss() {
        let nan_low = _mm_set_ps(0., 0., 0., f32::NAN);
        let all_max = _mm_set1_ps(f32::MAX);
        let all_i32_max = _mm_set1_epi32(i32::MAX);

        // Mask cleared: the lowest lane is expected to be zeroed.
        let zeroed = _mm_maskz_fixupimm_round_ss::<5, _MM_FROUND_CUR_DIRECTION>(
            0b00000000,
            nan_low,
            all_max,
            all_i32_max,
        );
        assert_eq_m128(zeroed, _mm_set_ps(0., 0., 0., 0.0));

        // Mask set: the lowest lane is expected to be -0.0.
        let fixed = _mm_maskz_fixupimm_round_ss::<5, _MM_FROUND_CUR_DIRECTION>(
            0b11111111,
            nan_low,
            all_max,
            all_i32_max,
        );
        assert_eq_m128(fixed, _mm_set_ps(0., 0., 0., -0.0));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_fixupimm_round_sd() {
        // The NaN in the lowest lane is expected to come back as -0.0, the upper
        // lane as 0.
        let nan_low = _mm_set_pd(0., f64::NAN);
        let all_max = _mm_set1_pd(f64::MAX);
        let all_i32_max = _mm_set1_epi64x(i32::MAX as i64);
        let got = _mm_fixupimm_round_sd::<5, _MM_FROUND_CUR_DIRECTION>(
            nan_low,
            all_max,
            all_i32_max,
        );
        assert_eq_m128d(got, _mm_set_pd(0., -0.0));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_fixupimm_round_sd() {
        // Only the mask-set case is checked: the NaN in the lowest lane is expected
        // to come back as -0.0.
        let nan_low = _mm_set_pd(0., f64::NAN);
        let all_max = _mm_set1_pd(f64::MAX);
        let all_i32_max = _mm_set1_epi64x(i32::MAX as i64);
        let got = _mm_mask_fixupimm_round_sd::<5, _MM_FROUND_CUR_DIRECTION>(
            nan_low,
            0b11111111,
            all_max,
            all_i32_max,
        );
        assert_eq_m128d(got, _mm_set_pd(0., -0.0));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_fixupimm_round_sd() {
        let nan_low = _mm_set_pd(0., f64::NAN);
        let all_max = _mm_set1_pd(f64::MAX);
        let all_i32_max = _mm_set1_epi64x(i32::MAX as i64);

        // Mask cleared: the lowest lane is expected to be zeroed.
        let zeroed = _mm_maskz_fixupimm_round_sd::<5, _MM_FROUND_CUR_DIRECTION>(
            0b00000000,
            nan_low,
            all_max,
            all_i32_max,
        );
        assert_eq_m128d(zeroed, _mm_set_pd(0., 0.0));

        // Mask set: the lowest lane is expected to be -0.0.
        let fixed = _mm_maskz_fixupimm_round_sd::<5, _MM_FROUND_CUR_DIRECTION>(
            0b11111111,
            nan_low,
            all_max,
            all_i32_max,
        );
        assert_eq_m128d(fixed, _mm_set_pd(0., -0.0));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_cvtss_sd() {
        let doubles = _mm_set_pd(6., -7.5);
        let singles = _mm_set_ps(0., -0.5, 1., -1.5);

        // Mask cleared: the result is expected to equal the source vector.
        let untouched = _mm_mask_cvtss_sd(doubles, 0, doubles, singles);
        assert_eq_m128d(untouched, doubles);

        // Mask set: the lowest lane takes the lowest single (-1.5), the upper lane stays 6.
        let converted = _mm_mask_cvtss_sd(doubles, 0b11111111, doubles, singles);
        assert_eq_m128d(converted, _mm_set_pd(6., -1.5));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_cvtss_sd() {
        let doubles = _mm_set_pd(6., -7.5);
        let singles = _mm_set_ps(0., -0.5, 1., -1.5);

        // Mask cleared: the lowest lane is expected to be zero, the upper lane stays 6.
        let zeroed = _mm_maskz_cvtss_sd(0, doubles, singles);
        assert_eq_m128d(zeroed, _mm_set_pd(6., 0.));

        // Mask set: the lowest lane takes the lowest single (-1.5).
        let converted = _mm_maskz_cvtss_sd(0b11111111, doubles, singles);
        assert_eq_m128d(converted, _mm_set_pd(6., -1.5));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_cvtsd_ss() {
        let singles = _mm_set_ps(0., -0.5, 1., -1.5);
        let doubles = _mm_set_pd(6., -7.5);

        // Mask cleared: the result is expected to equal the source vector.
        let untouched = _mm_mask_cvtsd_ss(singles, 0, singles, doubles);
        assert_eq_m128(untouched, singles);

        // Mask set: the lowest lane takes the lowest double (-7.5), upper lanes are kept.
        let converted = _mm_mask_cvtsd_ss(singles, 0b11111111, singles, doubles);
        assert_eq_m128(converted, _mm_set_ps(0., -0.5, 1., -7.5));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_cvtsd_ss() {
        let singles = _mm_set_ps(0., -0.5, 1., -1.5);
        let doubles = _mm_set_pd(6., -7.5);

        // Mask cleared: the lowest lane is expected to be zero, upper lanes are kept.
        let zeroed = _mm_maskz_cvtsd_ss(0, singles, doubles);
        assert_eq_m128(zeroed, _mm_set_ps(0., -0.5, 1., 0.));

        // Mask set: the lowest lane takes the lowest double (-7.5).
        let converted = _mm_maskz_cvtsd_ss(0b11111111, singles, doubles);
        assert_eq_m128(converted, _mm_set_ps(0., -0.5, 1., -7.5));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvt_roundss_sd() {
        // The lowest single (-1.5) is expected in the lowest lane; the upper lane stays 6.
        let doubles = _mm_set_pd(6., -7.5);
        let singles = _mm_set_ps(0., -0.5, 1., -1.5);
        let got = _mm_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(doubles, singles);
        assert_eq_m128d(got, _mm_set_pd(6., -1.5));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_cvt_roundss_sd() {
        let doubles = _mm_set_pd(6., -7.5);
        let singles = _mm_set_ps(0., -0.5, 1., -1.5);

        // Mask cleared: the result is expected to equal the source vector.
        let untouched =
            _mm_mask_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(doubles, 0, doubles, singles);
        assert_eq_m128d(untouched, doubles);

        // Mask set: the lowest lane takes the lowest single (-1.5).
        let converted = _mm_mask_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(
            doubles, 0b11111111, doubles, singles,
        );
        assert_eq_m128d(converted, _mm_set_pd(6., -1.5));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_cvt_roundss_sd() {
        let doubles = _mm_set_pd(6., -7.5);
        let singles = _mm_set_ps(0., -0.5, 1., -1.5);

        // Mask cleared: the lowest lane is expected to be zero, the upper lane stays 6.
        let zeroed = _mm_maskz_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(0, doubles, singles);
        assert_eq_m128d(zeroed, _mm_set_pd(6., 0.));

        // Mask set: the lowest lane takes the lowest single (-1.5).
        let converted =
            _mm_maskz_cvt_roundss_sd::<_MM_FROUND_CUR_DIRECTION>(0b11111111, doubles, singles);
        assert_eq_m128d(converted, _mm_set_pd(6., -1.5));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvt_roundsd_ss() {
        // The lowest double (-7.5) is expected in the lowest lane; upper lanes are kept.
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let singles = _mm_set_ps(0., -0.5, 1., -1.5);
        let doubles = _mm_set_pd(6., -7.5);
        let got = _mm_cvt_roundsd_ss::<ROUNDING>(singles, doubles);
        assert_eq_m128(got, _mm_set_ps(0., -0.5, 1., -7.5));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_mask_cvt_roundsd_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let singles = _mm_set_ps(0., -0.5, 1., -1.5);
        let doubles = _mm_set_pd(6., -7.5);

        // Mask cleared: the result is expected to equal the source vector.
        let untouched = _mm_mask_cvt_roundsd_ss::<ROUNDING>(singles, 0, singles, doubles);
        assert_eq_m128(untouched, singles);

        // Mask set: the lowest lane takes the lowest double (-7.5).
        let converted = _mm_mask_cvt_roundsd_ss::<ROUNDING>(singles, 0b11111111, singles, doubles);
        assert_eq_m128(converted, _mm_set_ps(0., -0.5, 1., -7.5));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_maskz_cvt_roundsd_ss() {
        const ROUNDING: i32 = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC;
        let singles = _mm_set_ps(0., -0.5, 1., -1.5);
        let doubles = _mm_set_pd(6., -7.5);

        // Mask cleared: the lowest lane is expected to be zero, upper lanes are kept.
        let zeroed = _mm_maskz_cvt_roundsd_ss::<ROUNDING>(0, singles, doubles);
        assert_eq_m128(zeroed, _mm_set_ps(0., -0.5, 1., 0.));

        // Mask set: the lowest lane takes the lowest double (-7.5).
        let converted = _mm_maskz_cvt_roundsd_ss::<ROUNDING>(0b11111111, singles, doubles);
        assert_eq_m128(converted, _mm_set_ps(0., -0.5, 1., -7.5));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvt_roundss_si32() {
        // The lowest lane is -1.5; with round-toward-zero the test expects -1.
        let input = _mm_set_ps(0., -0.5, 1., -1.5);
        let got = _mm_cvt_roundss_si32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(input);
        assert_eq!(got, -1_i32);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvt_roundss_i32() {
        // The lowest lane is -1.5; with round-toward-zero the test expects -1.
        let input = _mm_set_ps(0., -0.5, 1., -1.5);
        let got = _mm_cvt_roundss_i32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(input);
        assert_eq!(got, -1_i32);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvt_roundss_u32() {
        // The lowest lane is negative (-1.5); the unsigned conversion is expected to
        // return `u32::MAX`.
        let input = _mm_set_ps(0., -0.5, 1., -1.5);
        let got = _mm_cvt_roundss_u32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(input);
        assert_eq!(got, u32::MAX);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvtss_i32() {
        // The lowest lane is -1.5; the test expects the conversion to yield -2.
        let input = _mm_set_ps(0., -0.5, 1., -1.5);
        let got = _mm_cvtss_i32(input);
        assert_eq!(got, -2_i32);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvtss_u32() {
        // The lowest lane is negative (-1.5); the unsigned conversion is expected to
        // return `u32::MAX`.
        let input = _mm_set_ps(0., -0.5, 1., -1.5);
        let got = _mm_cvtss_u32(input);
        assert_eq!(got, u32::MAX);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvt_roundsd_si32() {
        // The lowest lane is -1.5; with round-toward-zero the test expects -1.
        let input = _mm_set_pd(1., -1.5);
        let got = _mm_cvt_roundsd_si32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(input);
        assert_eq!(got, -1_i32);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvt_roundsd_i32() {
        // The lowest lane is -1.5; with round-toward-zero the test expects -1.
        let input = _mm_set_pd(1., -1.5);
        let got = _mm_cvt_roundsd_i32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(input);
        assert_eq!(got, -1_i32);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvt_roundsd_u32() {
        // The lowest lane is negative (-1.5); the unsigned conversion is expected to
        // return `u32::MAX`.
        let input = _mm_set_pd(1., -1.5);
        let got = _mm_cvt_roundsd_u32::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(input);
        assert_eq!(got, u32::MAX);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvtsd_i32() {
        // The lowest lane is -1.5; the test expects the conversion to yield -2.
        let input = _mm_set_pd(1., -1.5);
        let got = _mm_cvtsd_i32(input);
        assert_eq!(got, -2_i32);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvtsd_u32() {
        // The lowest lane is negative (-1.5); the unsigned conversion is expected to
        // return `u32::MAX`.
        let input = _mm_set_pd(1., -1.5);
        let got = _mm_cvtsd_u32(input);
        assert_eq!(got, u32::MAX);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvt_roundi32_ss() {
        // The integer 9 is expected in the lowest lane; the upper lanes are kept.
        let base = _mm_set_ps(0., -0.5, 1., -1.5);
        let value = 9_i32;
        let got = _mm_cvt_roundi32_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(base, value);
        assert_eq_m128(got, _mm_set_ps(0., -0.5, 1., 9.));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvt_roundsi32_ss() {
        // The integer 9 is expected in the lowest lane; the upper lanes are kept.
        let base = _mm_set_ps(0., -0.5, 1., -1.5);
        let value = 9_i32;
        let got = _mm_cvt_roundsi32_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(base, value);
        assert_eq_m128(got, _mm_set_ps(0., -0.5, 1., 9.));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvt_roundu32_ss() {
        // The unsigned integer 9 is expected in the lowest lane; the upper lanes are kept.
        let base = _mm_set_ps(0., -0.5, 1., -1.5);
        let value = 9_u32;
        let got = _mm_cvt_roundu32_ss::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(base, value);
        assert_eq_m128(got, _mm_set_ps(0., -0.5, 1., 9.));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvti32_ss() {
        // The integer 9 is expected in the lowest lane; the upper lanes are kept.
        let base = _mm_set_ps(0., -0.5, 1., -1.5);
        let value = 9_i32;
        let got = _mm_cvti32_ss(base, value);
        assert_eq_m128(got, _mm_set_ps(0., -0.5, 1., 9.));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvti32_sd() {
        // The integer 9 is expected in the lowest lane; the upper lane stays 1.
        let base = _mm_set_pd(1., -1.5);
        let value = 9_i32;
        let got = _mm_cvti32_sd(base, value);
        assert_eq_m128d(got, _mm_set_pd(1., 9.));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvtt_roundss_si32() {
        // The lowest lane is -1.5 and the test expects -2.
        // NOTE(review): truncating -1.5 toward zero would give -1, yet -2 is asserted
        // here — confirm the intended behaviour of the intrinsic.
        let input = _mm_set_ps(0., -0.5, 1., -1.5);
        let got = _mm_cvtt_roundss_si32::<_MM_FROUND_CUR_DIRECTION>(input);
        assert_eq!(got, -2_i32);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvtt_roundss_i32() {
        // The lowest lane is -1.5 and the test expects -2.
        // NOTE(review): truncating -1.5 toward zero would give -1, yet -2 is asserted
        // here — confirm the intended behaviour of the intrinsic.
        let input = _mm_set_ps(0., -0.5, 1., -1.5);
        let got = _mm_cvtt_roundss_i32::<_MM_FROUND_CUR_DIRECTION>(input);
        assert_eq!(got, -2_i32);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvtt_roundss_u32() {
        // The lowest lane is negative (-1.5); the unsigned conversion is expected to
        // return `u32::MAX`.
        let input = _mm_set_ps(0., -0.5, 1., -1.5);
        let got = _mm_cvtt_roundss_u32::<_MM_FROUND_CUR_DIRECTION>(input);
        assert_eq!(got, u32::MAX);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvttss_i32() {
        // The lowest lane is -1.5 and the test expects -2.
        // NOTE(review): truncating -1.5 toward zero would give -1, yet -2 is asserted
        // here — confirm the intended behaviour of the intrinsic.
        let input = _mm_set_ps(0., -0.5, 1., -1.5);
        let got = _mm_cvttss_i32(input);
        assert_eq!(got, -2_i32);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvttss_u32() {
        // The lowest lane is negative (-1.5); the unsigned conversion is expected to
        // return `u32::MAX`.
        let input = _mm_set_ps(0., -0.5, 1., -1.5);
        let got = _mm_cvttss_u32(input);
        assert_eq!(got, u32::MAX);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvtt_roundsd_si32() {
        // The lowest lane is -1.5 and the test expects -2.
        // NOTE(review): truncating -1.5 toward zero would give -1, yet -2 is asserted
        // here — confirm the intended behaviour of the intrinsic.
        let input = _mm_set_pd(1., -1.5);
        let got = _mm_cvtt_roundsd_si32::<_MM_FROUND_CUR_DIRECTION>(input);
        assert_eq!(got, -2_i32);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvtt_roundsd_i32() {
        // The lowest lane is -1.5 and the test expects -2.
        // NOTE(review): truncating -1.5 toward zero would give -1, yet -2 is asserted
        // here — confirm the intended behaviour of the intrinsic.
        let input = _mm_set_pd(1., -1.5);
        let got = _mm_cvtt_roundsd_i32::<_MM_FROUND_CUR_DIRECTION>(input);
        assert_eq!(got, -2_i32);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvtt_roundsd_u32() {
        // The lowest lane is negative (-1.5); the unsigned conversion is expected to
        // return `u32::MAX`.
        let input = _mm_set_pd(1., -1.5);
        let got = _mm_cvtt_roundsd_u32::<_MM_FROUND_CUR_DIRECTION>(input);
        assert_eq!(got, u32::MAX);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvttsd_i32() {
        // The lowest lane is -1.5 and the test expects -2.
        // NOTE(review): truncating -1.5 toward zero would give -1, yet -2 is asserted
        // here — confirm the intended behaviour of the intrinsic.
        let input = _mm_set_pd(1., -1.5);
        let got = _mm_cvttsd_i32(input);
        assert_eq!(got, -2_i32);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvttsd_u32() {
        // The lowest lane is negative (-1.5); the unsigned conversion is expected to
        // return `u32::MAX`.
        let input = _mm_set_pd(1., -1.5);
        let got = _mm_cvttsd_u32(input);
        assert_eq!(got, u32::MAX);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvtu32_ss() {
        // The unsigned integer 9 is expected in the lowest lane; the upper lanes are kept.
        let base = _mm_set_ps(0., -0.5, 1., -1.5);
        let value = 9_u32;
        let got = _mm_cvtu32_ss(base, value);
        assert_eq_m128(got, _mm_set_ps(0., -0.5, 1., 9.));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_cvtu32_sd() {
        // The unsigned integer 9 is expected in the lowest lane; the upper lane stays 1.
        let base = _mm_set_pd(1., -1.5);
        let value = 9_u32;
        let got = _mm_cvtu32_sd(base, value);
        assert_eq_m128d(got, _mm_set_pd(1., 9.));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_comi_round_ss() {
        // Compares 2.2 against 1.1 with predicate immediate 0; the test expects 0.
        let lhs = _mm_set1_ps(2.2);
        let rhs = _mm_set1_ps(1.1);
        let got = _mm_comi_round_ss::<0, _MM_FROUND_CUR_DIRECTION>(lhs, rhs);
        assert_eq!(got, 0_i32);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm_comi_round_sd() {
        // Compares 2.2 against 1.1 with predicate immediate 0; the test expects 0.
        let lhs = _mm_set1_pd(2.2);
        let rhs = _mm_set1_pd(1.1);
        let got = _mm_comi_round_sd::<0, _MM_FROUND_CUR_DIRECTION>(lhs, rhs);
        assert_eq!(got, 0_i32);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_cvtsi512_si32() {
        // The vector is built with `setr`, so its first element is 1; the test expects
        // that element to be returned.
        let vector = _mm512_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
        let got = _mm512_cvtsi512_si32(vector);
        assert_eq!(got, 1_i32);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_shuffle_pd() {
        // With every control bit set, the expected result interleaves the odd-indexed
        // elements of `a` and `b` (4, 3, 8, 7, ...).
        let a = _mm512_setr_pd(1., 4., 5., 8., 1., 4., 5., 8.);
        let b = _mm512_setr_pd(2., 3., 6., 7., 2., 3., 6., 7.);
        let shuffled = _mm512_shuffle_pd::<0b11_11_11_11>(a, b);
        assert_eq_m512d(shuffled, _mm512_setr_pd(4., 3., 8., 7., 4., 3., 8., 7.));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_shuffle_pd() {
        let a = _mm512_setr_pd(1., 4., 5., 8., 1., 4., 5., 8.);
        let b = _mm512_setr_pd(2., 3., 6., 7., 2., 3., 6., 7.);

        // Mask cleared: the result is expected to equal the source vector `a`.
        let untouched = _mm512_mask_shuffle_pd::<0b11_11_11_11>(a, 0, a, b);
        assert_eq_m512d(untouched, a);

        // Full mask: every lane takes the shuffled value.
        let shuffled = _mm512_mask_shuffle_pd::<0b11_11_11_11>(a, 0b11111111, a, b);
        assert_eq_m512d(shuffled, _mm512_setr_pd(4., 3., 8., 7., 4., 3., 8., 7.));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_shuffle_pd() {
        let a = _mm512_setr_pd(1., 4., 5., 8., 1., 4., 5., 8.);
        let b = _mm512_setr_pd(2., 3., 6., 7., 2., 3., 6., 7.);

        // Mask cleared: the whole result is expected to be zero.
        let zeroed = _mm512_maskz_shuffle_pd::<0b11_11_11_11>(0, a, b);
        assert_eq_m512d(zeroed, _mm512_setzero_pd());

        // Low four mask bits set: the first four lanes are shuffled, the rest are zero.
        let half = _mm512_maskz_shuffle_pd::<0b11_11_11_11>(0b00001111, a, b);
        assert_eq_m512d(half, _mm512_setr_pd(4., 3., 8., 7., 0., 0., 0., 0.));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_expandloadu_epi32() {
        // The expected vector places consecutive memory values at the lanes whose mask
        // bit is set and keeps the fill value 42 everywhere else.
        let fill = _mm512_set1_epi32(42);
        let memory = [1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
        let mask = 0b11101000_11001010;
        let got = _mm512_mask_expandloadu_epi32(fill, mask, black_box(memory.as_ptr()));
        let want = _mm512_set_epi32(8, 7, 6, 42, 5, 42, 42, 42, 4, 3, 42, 42, 2, 42, 1, 42);
        assert_eq_m512i(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_expandloadu_epi32() {
        // The expected vector places consecutive memory values at the lanes whose mask
        // bit is set and zero everywhere else.
        let memory = [1_i32, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
        let mask = 0b11101000_11001010;
        let got = _mm512_maskz_expandloadu_epi32(mask, black_box(memory.as_ptr()));
        let want = _mm512_set_epi32(8, 7, 6, 0, 5, 0, 0, 0, 4, 3, 0, 0, 2, 0, 1, 0);
        assert_eq_m512i(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_expandloadu_epi32() {
        // Lanes with a set mask bit receive consecutive memory values; the others are
        // expected to keep the fill value 42.
        let fill = _mm256_set1_epi32(42);
        let memory = [1_i32, 2, 3, 4, 5, 6, 7, 8];
        let mask = 0b11101000;
        let got = _mm256_mask_expandloadu_epi32(fill, mask, black_box(memory.as_ptr()));
        assert_eq_m256i(got, _mm256_set_epi32(4, 3, 2, 42, 1, 42, 42, 42));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_expandloadu_epi32() {
        // Lanes with a set mask bit receive consecutive memory values; the others are
        // expected to be zero.
        let memory = [1_i32, 2, 3, 4, 5, 6, 7, 8];
        let mask = 0b11101000;
        let got = _mm256_maskz_expandloadu_epi32(mask, black_box(memory.as_ptr()));
        assert_eq_m256i(got, _mm256_set_epi32(4, 3, 2, 0, 1, 0, 0, 0));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_expandloadu_epi32() {
        // Of the four low mask bits only bit 3 is set, so the expected vector has the
        // first memory value in the highest lane and the fill value 42 elsewhere.
        let fill = _mm_set1_epi32(42);
        let memory = [1_i32, 2, 3, 4];
        let mask = 0b11111000;
        let got = _mm_mask_expandloadu_epi32(fill, mask, black_box(memory.as_ptr()));
        assert_eq_m128i(got, _mm_set_epi32(1, 42, 42, 42));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_expandloadu_epi32() {
        // Of the four low mask bits only bit 3 is set, so the expected vector has the
        // first memory value in the highest lane and zero elsewhere.
        let memory = [1_i32, 2, 3, 4];
        let mask = 0b11111000;
        let got = _mm_maskz_expandloadu_epi32(mask, black_box(memory.as_ptr()));
        assert_eq_m128i(got, _mm_set_epi32(1, 0, 0, 0));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_expandloadu_epi64() {
        // Lanes with a set mask bit receive consecutive memory values; the others are
        // expected to keep the fill value 42.
        let fill = _mm512_set1_epi64(42);
        let memory = [1_i64, 2, 3, 4, 5, 6, 7, 8];
        let mask = 0b11101000;
        let got = _mm512_mask_expandloadu_epi64(fill, mask, black_box(memory.as_ptr()));
        assert_eq_m512i(got, _mm512_set_epi64(4, 3, 2, 42, 1, 42, 42, 42));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_expandloadu_epi64() {
        // Lanes with a set mask bit receive consecutive memory values; the others are
        // expected to be zero.
        let memory = [1_i64, 2, 3, 4, 5, 6, 7, 8];
        let mask = 0b11101000;
        let got = _mm512_maskz_expandloadu_epi64(mask, black_box(memory.as_ptr()));
        assert_eq_m512i(got, _mm512_set_epi64(4, 3, 2, 0, 1, 0, 0, 0));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_expandloadu_epi64() {
        // Of the four low mask bits only bit 3 is set, so the expected vector has the
        // first memory value in the highest lane and the fill value 42 elsewhere.
        let fill = _mm256_set1_epi64x(42);
        let memory = [1_i64, 2, 3, 4];
        let mask = 0b11101000;
        let got = _mm256_mask_expandloadu_epi64(fill, mask, black_box(memory.as_ptr()));
        assert_eq_m256i(got, _mm256_set_epi64x(1, 42, 42, 42));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_expandloadu_epi64() {
        // Of the four low mask bits only bit 3 is set, so the expected vector has the
        // first memory value in the highest lane and zero elsewhere.
        let memory = [1_i64, 2, 3, 4];
        let mask = 0b11101000;
        let got = _mm256_maskz_expandloadu_epi64(mask, black_box(memory.as_ptr()));
        assert_eq_m256i(got, _mm256_set_epi64x(1, 0, 0, 0));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_expandloadu_epi64() {
        // The two low mask bits are both clear, so the expected vector is the fill
        // value 42 in both lanes.
        // NOTE(review): with this mask no element is taken from memory, so the load
        // path itself is not exercised — consider whether that is intended.
        let fill = _mm_set1_epi64x(42);
        let memory = [1_i64, 2];
        let mask = 0b11101000;
        let got = _mm_mask_expandloadu_epi64(fill, mask, black_box(memory.as_ptr()));
        assert_eq_m128i(got, _mm_set_epi64x(42, 42));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_expandloadu_epi64() {
        // The two low mask bits are both clear, so the expected vector is all zero.
        // NOTE(review): with this mask no element is taken from memory, so the load
        // path itself is not exercised — consider whether that is intended.
        let memory = [1_i64, 2];
        let mask = 0b11101000;
        let got = _mm_maskz_expandloadu_epi64(mask, black_box(memory.as_ptr()));
        assert_eq_m128i(got, _mm_set_epi64x(0, 0));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_expandloadu_ps() {
        // The expected vector places consecutive memory values at the lanes whose mask
        // bit is set and keeps the fill value 42 everywhere else.
        let fill = _mm512_set1_ps(42.);
        let memory = [
            1.0f32, 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        ];
        let mask = 0b11101000_11001010;
        let got = _mm512_mask_expandloadu_ps(fill, mask, black_box(memory.as_ptr()));
        let want = _mm512_set_ps(
            8., 7., 6., 42., 5., 42., 42., 42., 4., 3., 42., 42., 2., 42., 1., 42.,
        );
        assert_eq_m512(got, want);
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_expandloadu_ps() {
        // The expected vector places consecutive memory values at the lanes whose mask
        // bit is set and zero everywhere else.
        let memory = [
            1.0f32, 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.,
        ];
        let mask = 0b11101000_11001010;
        let got = _mm512_maskz_expandloadu_ps(mask, black_box(memory.as_ptr()));
        let want = _mm512_set_ps(
            8., 7., 6., 0., 5., 0., 0., 0., 4., 3., 0., 0., 2., 0., 1., 0.,
        );
        assert_eq_m512(got, want);
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_expandloadu_ps() {
        // Lanes with a set mask bit receive consecutive memory values; the others are
        // expected to keep the fill value 42.
        let fill = _mm256_set1_ps(42.);
        let memory = [1.0f32, 2., 3., 4., 5., 6., 7., 8.];
        let mask = 0b11101000;
        let got = _mm256_mask_expandloadu_ps(fill, mask, black_box(memory.as_ptr()));
        assert_eq_m256(got, _mm256_set_ps(4., 3., 2., 42., 1., 42., 42., 42.));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_expandloadu_ps() {
        // Lanes with a set mask bit receive consecutive memory values; the others are
        // expected to be zero.
        let memory = [1.0f32, 2., 3., 4., 5., 6., 7., 8.];
        let mask = 0b11101000;
        let got = _mm256_maskz_expandloadu_ps(mask, black_box(memory.as_ptr()));
        assert_eq_m256(got, _mm256_set_ps(4., 3., 2., 0., 1., 0., 0., 0.));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_expandloadu_ps() {
        // Of the four low mask bits only bit 3 is set, so the expected vector has the
        // first memory value in the highest lane and the fill value 42 elsewhere.
        let fill = _mm_set1_ps(42.);
        let memory = [1.0f32, 2., 3., 4.];
        let mask = 0b11101000;
        let got = _mm_mask_expandloadu_ps(fill, mask, black_box(memory.as_ptr()));
        assert_eq_m128(got, _mm_set_ps(1., 42., 42., 42.));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_expandloadu_ps() {
        // Of the four low mask bits only bit 3 is set, so the expected vector has the
        // first memory value in the highest lane and zero elsewhere.
        let memory = [1.0f32, 2., 3., 4.];
        let mask = 0b11101000;
        let got = _mm_maskz_expandloadu_ps(mask, black_box(memory.as_ptr()));
        assert_eq_m128(got, _mm_set_ps(1., 0., 0., 0.));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_mask_expandloadu_pd() {
        // Lanes with a set mask bit receive consecutive memory values; the others are
        // expected to keep the fill value 42.
        let fill = _mm512_set1_pd(42.);
        let memory = [1.0f64, 2., 3., 4., 5., 6., 7., 8.];
        let mask = 0b11101000;
        let got = _mm512_mask_expandloadu_pd(fill, mask, black_box(memory.as_ptr()));
        assert_eq_m512d(got, _mm512_set_pd(4., 3., 2., 42., 1., 42., 42., 42.));
    }

    #[simd_test(enable = "avx512f")]
    unsafe fn test_mm512_maskz_expandloadu_pd() {
        // Lanes with a set mask bit receive consecutive memory values; the others are
        // expected to be zero.
        let memory = [1.0f64, 2., 3., 4., 5., 6., 7., 8.];
        let mask = 0b11101000;
        let got = _mm512_maskz_expandloadu_pd(mask, black_box(memory.as_ptr()));
        assert_eq_m512d(got, _mm512_set_pd(4., 3., 2., 0., 1., 0., 0., 0.));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_mask_expandloadu_pd() {
        // Of the four low mask bits only bit 3 is set, so the expected vector has the
        // first memory value in the highest lane and the fill value 42 elsewhere.
        let fill = _mm256_set1_pd(42.);
        let memory = [1.0f64, 2., 3., 4.];
        let mask = 0b11101000;
        let got = _mm256_mask_expandloadu_pd(fill, mask, black_box(memory.as_ptr()));
        assert_eq_m256d(got, _mm256_set_pd(1., 42., 42., 42.));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm256_maskz_expandloadu_pd() {
        // Of the four low mask bits only bit 3 is set, so the expected vector has the
        // first memory value in the highest lane and zero elsewhere.
        let memory = [1.0f64, 2., 3., 4.];
        let mask = 0b11101000;
        let got = _mm256_maskz_expandloadu_pd(mask, black_box(memory.as_ptr()));
        assert_eq_m256d(got, _mm256_set_pd(1., 0., 0., 0.));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_mask_expandloadu_pd() {
        // The two low mask bits are both clear, so the expected vector is the fill
        // value 42 in both lanes.
        // NOTE(review): with this mask no element is taken from memory, so the load
        // path itself is not exercised — consider whether that is intended.
        let fill = _mm_set1_pd(42.);
        let memory = [1.0f64, 2.];
        let mask = 0b11101000;
        let got = _mm_mask_expandloadu_pd(fill, mask, black_box(memory.as_ptr()));
        assert_eq_m128d(got, _mm_set_pd(42., 42.));
    }

    #[simd_test(enable = "avx512f,avx512vl")]
    unsafe fn test_mm_maskz_expandloadu_pd() {
        // The two low mask bits are both clear, so the expected vector is all zero.
        // NOTE(review): with this mask no element is taken from memory, so the load
        // path itself is not exercised — consider whether that is intended.
        let memory = [1.0f64, 2.];
        let mask = 0b11101000;
        let got = _mm_maskz_expandloadu_pd(mask, black_box(memory.as_ptr()));
        assert_eq_m128d(got, _mm_set_pd(0., 0.));
    }
}
