Re: [Minios-devel] [UNIKRAFT/LIBINTEL-INTRINSICS PATCH 2/2] Initial port of Intel Intrinsics on Unikraft



Hi Vlad,

Thanks for the patch. I noticed that the build breaks on older gcc versions 
(e.g., v6); could you send an updated patch (as discussed offline) that fixes 
this?
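
For what it's worth, a quick smoke test along these lines should be enough to
exercise the headers once the respin is ready (just a sketch, untested; it
assumes the library is selected and CONFIG_SIMD_AVX is enabled so that -mavx
ends up on the compiler command line):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
	/* GCC's vector extensions allow brace initialization and indexing. */
	__m256 a = { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f };
	__m256 b = { 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f };
	__m256 c = _mm256_add_ps(a, b);

	printf("lane 0: %f\n", (double)c[0]); /* expect 9.000000 */
	return 0;
}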

Thanks,

-- Felipe

On 03.06.19, 00:15, "Vlad-Andrei BĂDOIU (78692)" <vlad_andrei.badoiu@xxxxxxxxxxxxxxx> wrote:

    This is our initial port of Intel Intrinsics to Unikraft as an external
    library. This release is based on the LLVM headers, which were adapted to
    work on Unikraft with GCC.
    
    Signed-off-by: Vlad-Andrei Badoiu <vlad_andrei.badoiu@xxxxxxxxxxxxxxx>
    ---
     CODING_STYLE.md        |    4 +
     CONTRIBUTING.md        |    4 +
     Config.uk              |   25 +
     MAINTAINERS.md         |   11 +
     Makefile.uk            |   69 +
     README.md              |    5 +
     exportsyms.uk          |    1 +
     include/avxintrin.h    | 5121 ++++++++++++++++++++++++++++++++++++++++
     include/emmintrin.h    | 5021 +++++++++++++++++++++++++++++++++++++++
     include/immintrin.h    |   75 +
     include/mm_malloc.h    |   75 +
     include/mmintrin.h     | 1598 +++++++++++++
     include/nmmintrin.h    |   30 +
     include/pmmintrin.h    |  321 +++
     include/popcntintrin.h |  102 +
     include/smmintrin.h    | 2504 ++++++++++++++++++++
     include/tmmintrin.h    |  790 +++++++
     include/xmmintrin.h    | 3101 ++++++++++++++++++++++++
     18 files changed, 18857 insertions(+)
     create mode 100644 CODING_STYLE.md
     create mode 100644 CONTRIBUTING.md
     create mode 100644 Config.uk
     create mode 100644 MAINTAINERS.md
     create mode 100644 Makefile.uk
     create mode 100644 README.md
     create mode 100644 exportsyms.uk
     create mode 100644 include/avxintrin.h
     create mode 100644 include/emmintrin.h
     create mode 100644 include/immintrin.h
     create mode 100644 include/mm_malloc.h
     create mode 100644 include/mmintrin.h
     create mode 100644 include/nmmintrin.h
     create mode 100644 include/pmmintrin.h
     create mode 100644 include/popcntintrin.h
     create mode 100644 include/smmintrin.h
     create mode 100644 include/tmmintrin.h
     create mode 100644 include/xmmintrin.h
    
    diff --git a/CODING_STYLE.md b/CODING_STYLE.md
    new file mode 100644
    index 0000000..5730041
    --- /dev/null
    +++ b/CODING_STYLE.md
    @@ -0,0 +1,4 @@
    +Coding Style
    +============
    +
    +Please refer to the `CODING_STYLE.md` file in the main Unikraft repository.
    diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
    new file mode 100644
    index 0000000..5f55eca
    --- /dev/null
    +++ b/CONTRIBUTING.md
    @@ -0,0 +1,4 @@
    +Contributing to Unikraft
    +=======================
    +
    +Please refer to the `CONTRIBUTING.md` file in the main Unikraft repository.
    diff --git a/Config.uk b/Config.uk
    new file mode 100644
    index 0000000..413b137
    --- /dev/null
    +++ b/Config.uk
    @@ -0,0 +1,25 @@
    +menuconfig LIBINTEL_INTRINSICS
    +    bool "Intel Intrinsics - C style functions that provide access Intel 
instructions"
    +    default n
    +
    +if LIBINTEL_INTRINSICS
    +config SIMD_SSE
    +    bool "Enable SSE support"
    +    default n
    +
    +config SIMD_SSE2
    +    bool "Enable SSE2 support"
    +    default n
    +
    +config SIMD_SSE3
    +    bool "Enable SSE3 support"
    +    default n
    +
    +config SIMD_SSE4
    +    bool "Enable SSE4 support"
    +    default n
    +
    +config SIMD_AVX
    +    bool "Enable AVX support"
    +    default n
    +endif
    diff --git a/MAINTAINERS.md b/MAINTAINERS.md
    new file mode 100644
    index 0000000..a3d3b0a
    --- /dev/null
    +++ b/MAINTAINERS.md
    @@ -0,0 +1,11 @@
    +Maintainers List
    +================
    +
    +For notes on how to read this information, please refer to `MAINTAINERS.md` in
    +the main Unikraft repository.
    +
    +   LIBINTEL_INTRINSICS-UNIKRAFT
    +   M:      Felipe Huici <felipe.huici@xxxxxxxxx>
    +   M:      Vlad-Andrei Badoiu <vlad_andrei.badoiu@xxxxxxxxxxxxxxx>
    +   L:      minios-devel@xxxxxxxxxxxxx
    +   F: *
    diff --git a/Makefile.uk b/Makefile.uk
    new file mode 100644
    index 0000000..585c4f3
    --- /dev/null
    +++ b/Makefile.uk
    @@ -0,0 +1,69 @@
    +#  libintel_intrinsics Makefile.uk
    +#
    +#  Authors: Vlad-Andrei Badoiu <vlad_andrei.badoiu@xxxxxxxxxxxxxxx>
    +#
    +#  Copyright (c) 2019, University Politehnica of Bucharest. All rights reserved.
    +#
    +#  Redistribution and use in source and binary forms, with or without
    +#  modification, are permitted provided that the following conditions
    +#  are met:
    +#
    +#  1. Redistributions of source code must retain the above copyright
    +#     notice, this list of conditions and the following disclaimer.
    +#  2. Redistributions in binary form must reproduce the above copyright
    +#     notice, this list of conditions and the following disclaimer in the
    +#     documentation and/or other materials provided with the distribution.
    +#  3. Neither the name of the copyright holder nor the names of its
    +#     contributors may be used to endorse or promote products derived from
    +#     this software without specific prior written permission.
    +#
    +#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
    +#  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    +#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    +#  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
    +#  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
    +#  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    +#  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    +#  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
    +#  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
    +#  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    +#  POSSIBILITY OF SUCH DAMAGE.
    +#
    +#  THIS HEADER MAY NOT BE EXTRACTED OR MODIFIED IN ANY WAY.
    +#
    +
    
+################################################################################
    +# Library registration
    
+################################################################################
    +$(eval $(call addlib_s,libintel_intrinsics,$(CONFIG_LIBINTEL_INTRINSICS)))
    +
    
+################################################################################
    +# Library includes
    
+################################################################################
    +CINCLUDES-$(CONFIG_LIBINTEL_INTRINSICS) += 
-I$(LIBINTEL_INTRINSICS_BASE)/include
    +CXXINCLUDES-$(CONFIG_LIBINTEL_INTRINSICS) += 
-I$(LIBINTEL_INTRINSICS_BASE)/include
    +
    +ifdef CONFIG_SIMD_SSE
    +CFLAGS             += -msse -mfpmath=sse
    +CXXFLAGS   += -msse -mfpmath=sse
    +endif
    +
    +ifdef CONFIG_SIMD_SSE2
    +CFLAGS             += -msse2
    +CXXFLAGS   += -msse2
    +endif
    +
    +ifdef CONFIG_SIMD_SSE3
    +CFLAGS             += -msse3
    +CXXFLAGS   += -msse3
    +endif
    +
    +ifdef CONFIG_SIMD_SSE4
    +CFLAGS             += -msse4
    +CXXFLAGS   += -msse4
    +endif
    +
    +ifdef CONFIG_SIMD_AVX
    +CFLAGS             += -mavx
    +CXXFLAGS   += -mavx
    +endif
    diff --git a/README.md b/README.md
    new file mode 100644
    index 0000000..ec8c474
    --- /dev/null
    +++ b/README.md
    @@ -0,0 +1,5 @@
    +libintel\_intrinsics for Unikraft
    +=================================
    +
    +Please refer to the `README.md` as well as the documentation in the `doc/`
    +subdirectory of the main Unikraft repository.
    diff --git a/exportsyms.uk b/exportsyms.uk
    new file mode 100644
    index 0000000..621e94f
    --- /dev/null
    +++ b/exportsyms.uk
    @@ -0,0 +1 @@
    +none
    diff --git a/include/avxintrin.h b/include/avxintrin.h
    new file mode 100644
    index 0000000..6996775
    --- /dev/null
    +++ b/include/avxintrin.h
    @@ -0,0 +1,5121 @@
    +/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
    + *
    + * Permission is hereby granted, free of charge, to any person obtaining a copy
    + * of this software and associated documentation files (the "Software"), to deal
    + * in the Software without restriction, including without limitation the rights
    + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    + * copies of the Software, and to permit persons to whom the Software is
    + * furnished to do so, subject to the following conditions:
    + *
    + * The above copyright notice and this permission notice shall be included in
    + * all copies or substantial portions of the Software.
    + *
    + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    + * THE SOFTWARE.
    + *
    + *===-----------------------------------------------------------------------===
    + */
    +
    +#ifndef __IMMINTRIN_H
    +#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
    +#endif
    +
    +#ifndef __AVXINTRIN_H
    +#define __AVXINTRIN_H
    +
    +typedef double __v4df __attribute__ ((__vector_size__ (32)));
    +typedef float __v8sf __attribute__ ((__vector_size__ (32)));
    +typedef long long __v4di __attribute__ ((__vector_size__ (32)));
    +typedef int __v8si __attribute__ ((__vector_size__ (32)));
    +typedef short __v16hi __attribute__ ((__vector_size__ (32)));
    +typedef char __v32qi __attribute__ ((__vector_size__ (32)));
    +
    +/* Unsigned types */
    +typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
    +typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
    +typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
    +typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
    +
    +/* We need an explicitly signed variant for char. Note that this shouldn't
    + * appear in the interface though. */
    +typedef signed char __v32qs __attribute__((__vector_size__(32)));
    +
    +typedef float __m256 __attribute__ ((__vector_size__ (32)));
    +typedef double __m256d __attribute__((__vector_size__(32)));
    +typedef long long __m256i __attribute__((__vector_size__(32)));
    +
    +/* Define the default attributes for the functions in this file. */
    +#ifdef  __GNUC__
    +#define __DEFAULT_FN_ATTRS __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    +#define __DEFAULT_FN_ATTRS128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    +#else
    +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(256)))
    +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(128)))
    +#endif
    +
    +/* Arithmetic */
    +/// Adds two 256-bit vectors of [4 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [4 x double] containing one of the source operands.
    +/// \param __b
    +///    A 256-bit vector of [4 x double] containing one of the source operands.
    +/// \returns A 256-bit vector of [4 x double] containing the sums of both
    +///    operands.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_add_pd(__m256d __a, __m256d __b)
    +{
    +  return (__m256d)((__v4df)__a+(__v4df)__b);
    +}
    +
    +/// Adds two 256-bit vectors of [8 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float] containing one of the source operands.
    +/// \param __b
    +///    A 256-bit vector of [8 x float] containing one of the source operands.
    +/// \returns A 256-bit vector of [8 x float] containing the sums of both
    +///    operands.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_add_ps(__m256 __a, __m256 __b)
    +{
    +  return (__m256)((__v8sf)__a+(__v8sf)__b);
    +}
    +
    +/// Subtracts two 256-bit vectors of [4 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [4 x double] containing the minuend.
    +/// \param __b
    +///    A 256-bit vector of [4 x double] containing the subtrahend.
    +/// \returns A 256-bit vector of [4 x double] containing the differences 
between
    +///    both operands.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_sub_pd(__m256d __a, __m256d __b)
    +{
    +  return (__m256d)((__v4df)__a-(__v4df)__b);
    +}
    +
    +/// Subtracts two 256-bit vectors of [8 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float] containing the minuend.
    +/// \param __b
    +///    A 256-bit vector of [8 x float] containing the subtrahend.
    +/// \returns A 256-bit vector of [8 x float] containing the differences 
between
    +///    both operands.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_sub_ps(__m256 __a, __m256 __b)
    +{
    +  return (__m256)((__v8sf)__a-(__v8sf)__b);
    +}
    +
    +/// Adds the even-indexed values and subtracts the odd-indexed values of
    +///    two 256-bit vectors of [4 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [4 x double] containing the left source operand.
    +/// \param __b
    +///    A 256-bit vector of [4 x double] containing the right source 
operand.
    +/// \returns A 256-bit vector of [4 x double] containing the alternating 
sums
    +///    and differences between both operands.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_addsub_pd(__m256d __a, __m256d __b)
    +{
    +  return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
    +}
    +
    +/// Adds the even-indexed values and subtracts the odd-indexed values of
    +///    two 256-bit vectors of [8 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float] containing the left source operand.
    +/// \param __b
    +///    A 256-bit vector of [8 x float] containing the right source operand.
    +/// \returns A 256-bit vector of [8 x float] containing the alternating 
sums and
    +///    differences between both operands.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_addsub_ps(__m256 __a, __m256 __b)
    +{
    +  return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
    +}
    +
    +/// Divides two 256-bit vectors of [4 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [4 x double] containing the dividend.
    +/// \param __b
    +///    A 256-bit vector of [4 x double] containing the divisor.
    +/// \returns A 256-bit vector of [4 x double] containing the quotients of 
both
    +///    operands.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_div_pd(__m256d __a, __m256d __b)
    +{
    +  return (__m256d)((__v4df)__a/(__v4df)__b);
    +}
    +
    +/// Divides two 256-bit vectors of [8 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float] containing the dividend.
    +/// \param __b
    +///    A 256-bit vector of [8 x float] containing the divisor.
    +/// \returns A 256-bit vector of [8 x float] containing the quotients of 
both
    +///    operands.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_div_ps(__m256 __a, __m256 __b)
    +{
    +  return (__m256)((__v8sf)__a/(__v8sf)__b);
    +}
    +
    +/// Compares two 256-bit vectors of [4 x double] and returns the greater
    +///    of each pair of values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [4 x double] containing one of the operands.
    +/// \param __b
    +///    A 256-bit vector of [4 x double] containing one of the operands.
    +/// \returns A 256-bit vector of [4 x double] containing the maximum values
    +///    between both operands.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_max_pd(__m256d __a, __m256d __b)
    +{
    +  return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
    +}
    +
    +/// Compares two 256-bit vectors of [8 x float] and returns the greater
    +///    of each pair of values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float] containing one of the operands.
    +/// \param __b
    +///    A 256-bit vector of [8 x float] containing one of the operands.
    +/// \returns A 256-bit vector of [8 x float] containing the maximum values
    +///    between both operands.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_max_ps(__m256 __a, __m256 __b)
    +{
    +  return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
    +}
    +
    +/// Compares two 256-bit vectors of [4 x double] and returns the lesser
    +///    of each pair of values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [4 x double] containing one of the operands.
    +/// \param __b
    +///    A 256-bit vector of [4 x double] containing one of the operands.
    +/// \returns A 256-bit vector of [4 x double] containing the minimum values
    +///    between both operands.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_min_pd(__m256d __a, __m256d __b)
    +{
    +  return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
    +}
    +
    +/// Compares two 256-bit vectors of [8 x float] and returns the lesser
    +///    of each pair of values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float] containing one of the operands.
    +/// \param __b
    +///    A 256-bit vector of [8 x float] containing one of the operands.
    +/// \returns A 256-bit vector of [8 x float] containing the minimum values
    +///    between both operands.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_min_ps(__m256 __a, __m256 __b)
    +{
    +  return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
    +}
    +
    +/// Multiplies two 256-bit vectors of [4 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [4 x double] containing one of the operands.
    +/// \param __b
    +///    A 256-bit vector of [4 x double] containing one of the operands.
    +/// \returns A 256-bit vector of [4 x double] containing the products of 
both
    +///    operands.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_mul_pd(__m256d __a, __m256d __b)
    +{
    +  return (__m256d)((__v4df)__a * (__v4df)__b);
    +}
    +
    +/// Multiplies two 256-bit vectors of [8 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float] containing one of the operands.
    +/// \param __b
    +///    A 256-bit vector of [8 x float] containing one of the operands.
    +/// \returns A 256-bit vector of [8 x float] containing the products of 
both
    +///    operands.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_mul_ps(__m256 __a, __m256 __b)
    +{
    +  return (__m256)((__v8sf)__a * (__v8sf)__b);
    +}
    +
    +/// Calculates the square roots of the values in a 256-bit vector of
    +///    [4 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [4 x double].
    +/// \returns A 256-bit vector of [4 x double] containing the square roots 
of the
    +///    values in the operand.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_sqrt_pd(__m256d __a)
    +{
    +  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
    +}
    +
    +/// Calculates the square roots of the values in a 256-bit vector of
    +///    [8 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float].
    +/// \returns A 256-bit vector of [8 x float] containing the square roots 
of the
    +///    values in the operand.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_sqrt_ps(__m256 __a)
    +{
    +  return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
    +}
    +
    +/// Calculates the reciprocal square roots of the values in a 256-bit
    +///    vector of [8 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float].
    +/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
    +///    roots of the values in the operand.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_rsqrt_ps(__m256 __a)
    +{
    +  return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
    +}
    +
    +/// Calculates the reciprocals of the values in a 256-bit vector of
    +///    [8 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float].
    +/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
    +///    values in the operand.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_rcp_ps(__m256 __a)
    +{
    +  return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
    +}
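
Side note for users of the two intrinsics above: VRCPPS/VRSQRTPS return only
approximations (on the order of 12 bits of precision), so callers that need
more accuracy usually add a Newton-Raphson step. A rough sketch, using only
intrinsics from this header:

static inline __m256 rcp_refined(__m256 a)
{
	/* Refine the ~12-bit estimate: x1 = x0 * (2 - a * x0). */
	__m256 two = { 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f };
	__m256 x0  = _mm256_rcp_ps(a);

	return _mm256_mul_ps(x0, _mm256_sub_ps(two, _mm256_mul_ps(a, x0)));
}
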
    +
    +/// Rounds the values in a 256-bit vector of [4 x double] as specified
    +///    by the byte operand. The source values are rounded to integer 
values and
    +///    returned as 64-bit double-precision floating-point values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m256d _mm256_round_pd(__m256d V, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
    +///
    +/// \param V
    +///    A 256-bit vector of [4 x double].
    +/// \param M
    +///    An integer value that specifies the rounding operation. \n
    +///    Bits [7:4] are reserved. \n
    +///    Bit [3] is a precision exception value: \n
    +///      0: A normal PE exception is used. \n
    +///      1: The PE field is not updated. \n
    +///    Bit [2] is the rounding control source: \n
    +///      0: Use bits [1:0] of \a M. \n
    +///      1: Use the current MXCSR setting. \n
    +///    Bits [1:0] contain the rounding control definition: \n
    +///      00: Nearest. \n
    +///      01: Downward (toward negative infinity). \n
    +///      10: Upward (toward positive infinity). \n
    +///      11: Truncated.
    +/// \returns A 256-bit vector of [4 x double] containing the rounded 
values.
    +#define _mm256_round_pd(V, M) \
    +    (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M))
    +
    +/// Rounds the values stored in a 256-bit vector of [8 x float] as
    +///    specified by the byte operand. The source values are rounded to 
integer
    +///    values and returned as floating-point values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m256 _mm256_round_ps(__m256 V, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
    +///
    +/// \param V
    +///    A 256-bit vector of [8 x float].
    +/// \param M
    +///    An integer value that specifies the rounding operation. \n
    +///    Bits [7:4] are reserved. \n
    +///    Bit [3] is a precision exception value: \n
    +///      0: A normal PE exception is used. \n
    +///      1: The PE field is not updated. \n
    +///    Bit [2] is the rounding control source: \n
    +///      0: Use bits [1:0] of \a M. \n
    +///      1: Use the current MXCSR setting. \n
    +///    Bits [1:0] contain the rounding control definition: \n
    +///      00: Nearest. \n
    +///      01: Downward (toward negative infinity). \n
    +///      10: Upward (toward positive infinity). \n
    +///      11: Truncated.
    +/// \returns A 256-bit vector of [8 x float] containing the rounded values.
    +#define _mm256_round_ps(V, M) \
    +  (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M))
    +
    +/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
    +///    source values are rounded up to integer values and returned as 
64-bit
    +///    double-precision floating-point values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m256d _mm256_ceil_pd(__m256d V);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
    +///
    +/// \param V
    +///    A 256-bit vector of [4 x double].
    +/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
    +#define _mm256_ceil_pd(V)  _mm256_round_pd((V), _MM_FROUND_CEIL)
    +
    +/// Rounds down the values stored in a 256-bit vector of [4 x double].
    +///    The source values are rounded down to integer values and returned as
    +///    64-bit double-precision floating-point values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m256d _mm256_floor_pd(__m256d V);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
    +///
    +/// \param V
    +///    A 256-bit vector of [4 x double].
    +/// \returns A 256-bit vector of [4 x double] containing the rounded down
    +///    values.
    +#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
    +
    +/// Rounds up the values stored in a 256-bit vector of [8 x float]. The
    +///    source values are rounded up to integer values and returned as
    +///    floating-point values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m256 _mm256_ceil_ps(__m256 V);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
    +///
    +/// \param V
    +///    A 256-bit vector of [8 x float].
    +/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
    +#define _mm256_ceil_ps(V)  _mm256_round_ps((V), _MM_FROUND_CEIL)
    +
    +/// Rounds down the values stored in a 256-bit vector of [8 x float]. The
    +///    source values are rounded down to integer values and returned as
    +///    floating-point values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m256 _mm256_floor_ps(__m256 V);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
    +///
    +/// \param V
    +///    A 256-bit vector of [8 x float].
    +/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
    +#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
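
If I read the headers right, the _MM_FROUND_* constants used by these macros
come from smmintrin.h. Usage then looks roughly like this (sketch; the brace
initialization relies on GCC's vector extensions):

__m256d v    = { 1.25, -1.25, 2.5, -2.5 };
__m256d up   = _mm256_ceil_pd(v);   /* { 2.0, -1.0, 3.0, -2.0 } */
__m256d down = _mm256_floor_pd(v);  /* { 1.0, -2.0, 2.0, -3.0 } */
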
    +
    +/* Logical */
    +/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [4 x double] containing one of the source 
operands.
    +/// \param __b
    +///    A 256-bit vector of [4 x double] containing one of the source 
operands.
    +/// \returns A 256-bit vector of [4 x double] containing the bitwise AND 
of the
    +///    values between both operands.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_and_pd(__m256d __a, __m256d __b)
    +{
    +  return (__m256d)((__v4du)__a & (__v4du)__b);
    +}
    +
    +/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float] containing one of the source 
operands.
    +/// \param __b
    +///    A 256-bit vector of [8 x float] containing one of the source 
operands.
    +/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of 
the
    +///    values between both operands.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_and_ps(__m256 __a, __m256 __b)
    +{
    +  return (__m256)((__v8su)__a & (__v8su)__b);
    +}
    +
    +/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
    +///    the one's complement of the values contained in the first source 
operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [4 x double] containing the left source 
operand. The
    +///    one's complement of this value is used in the bitwise AND.
    +/// \param __b
    +///    A 256-bit vector of [4 x double] containing the right source 
operand.
    +/// \returns A 256-bit vector of [4 x double] containing the bitwise AND 
of the
    +///    values of the second operand and the one's complement of the first
    +///    operand.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_andnot_pd(__m256d __a, __m256d __b)
    +{
    +  return (__m256d)(~(__v4du)__a & (__v4du)__b);
    +}
    +
    +/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
    +///    the one's complement of the values contained in the first source 
operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float] containing the left source operand. 
The
    +///    one's complement of this value is used in the bitwise AND.
    +/// \param __b
    +///    A 256-bit vector of [8 x float] containing the right source operand.
    +/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of 
the
    +///    values of the second operand and the one's complement of the first
    +///    operand.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_andnot_ps(__m256 __a, __m256 __b)
    +{
    +  return (__m256)(~(__v8su)__a & (__v8su)__b);
    +}
    +
    +/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VORPD </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [4 x double] containing one of the source 
operands.
    +/// \param __b
    +///    A 256-bit vector of [4 x double] containing one of the source 
operands.
    +/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of 
the
    +///    values between both operands.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_or_pd(__m256d __a, __m256d __b)
    +{
    +  return (__m256d)((__v4du)__a | (__v4du)__b);
    +}
    +
    +/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VORPS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float] containing one of the source 
operands.
    +/// \param __b
    +///    A 256-bit vector of [8 x float] containing one of the source 
operands.
    +/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of 
the
    +///    values between both operands.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_or_ps(__m256 __a, __m256 __b)
    +{
    +  return (__m256)((__v8su)__a | (__v8su)__b);
    +}
    +
    +/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [4 x double] containing one of the source 
operands.
    +/// \param __b
    +///    A 256-bit vector of [4 x double] containing one of the source 
operands.
    +/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR 
of the
    +///    values between both operands.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_xor_pd(__m256d __a, __m256d __b)
    +{
    +  return (__m256d)((__v4du)__a ^ (__v4du)__b);
    +}
    +
    +/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float] containing one of the source operands.
    +/// \param __b
    +///    A 256-bit vector of [8 x float] containing one of the source operands.
    +/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
    +///    values between both operands.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_xor_ps(__m256 __a, __m256 __b)
    +{
    +  return (__m256)((__v8su)__a ^ (__v8su)__b);
    +}
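
The typical use of these bitwise ops is sign-bit manipulation on floating-point
lanes, for example (illustrative sketch, again relying on GCC's brace
initialization of vector types):

__m256d x    = { 1.5, -2.5, 3.5, -4.5 };
__m256d sign = { -0.0, -0.0, -0.0, -0.0 };   /* only the sign bit set */
__m256d neg  = _mm256_xor_pd(sign, x);       /* flip sign:  { -1.5, 2.5, -3.5, 4.5 } */
__m256d mag  = _mm256_andnot_pd(sign, x);    /* clear sign: { 1.5, 2.5, 3.5, 4.5 } */
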
    +
    +/* Horizontal arithmetic */
    +/// Horizontally adds the adjacent pairs of values contained in two
    +///    256-bit vectors of [4 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [4 x double] containing one of the source operands.
    +///    The horizontal sums of the values are returned in the even-indexed
    +///    elements of a vector of [4 x double].
    +/// \param __b
    +///    A 256-bit vector of [4 x double] containing one of the source operands.
    +///    The horizontal sums of the values are returned in the odd-indexed
    +///    elements of a vector of [4 x double].
    +/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
    +///    both operands.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_hadd_pd(__m256d __a, __m256d __b)
    +{
    +  return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
    +}
    +
    +/// Horizontally adds the adjacent pairs of values contained in two
    +///    256-bit vectors of [8 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float] containing one of the source 
operands.
    +///    The horizontal sums of the values are returned in the elements with
    +///    index 0, 1, 4, 5 of a vector of [8 x float].
    +/// \param __b
    +///    A 256-bit vector of [8 x float] containing one of the source 
operands.
    +///    The horizontal sums of the values are returned in the elements with
    +///    index 2, 3, 6, 7 of a vector of [8 x float].
    +/// \returns A 256-bit vector of [8 x float] containing the horizontal 
sums of
    +///    both operands.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_hadd_ps(__m256 __a, __m256 __b)
    +{
    +  return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
    +}
    +
    +/// Horizontally subtracts the adjacent pairs of values contained in two
    +///    256-bit vectors of [4 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [4 x double] containing one of the source 
operands.
    +///    The horizontal differences between the values are returned in the
    +///    even-indexed elements of a vector of [4 x double].
    +/// \param __b
    +///    A 256-bit vector of [4 x double] containing one of the source 
operands.
    +///    The horizontal differences between the values are returned in the
    +///    odd-indexed elements of a vector of [4 x double].
    +/// \returns A 256-bit vector of [4 x double] containing the horizontal
    +///    differences of both operands.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_hsub_pd(__m256d __a, __m256d __b)
    +{
    +  return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
    +}
    +
    +/// Horizontally subtracts the adjacent pairs of values contained in two
    +///    256-bit vectors of [8 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float] containing one of the source operands.
    +///    The horizontal differences between the values are returned in the
    +///    elements with index 0, 1, 4, 5 of a vector of [8 x float].
    +/// \param __b
    +///    A 256-bit vector of [8 x float] containing one of the source operands.
    +///    The horizontal differences between the values are returned in the
    +///    elements with index 2, 3, 6, 7 of a vector of [8 x float].
    +/// \returns A 256-bit vector of [8 x float] containing the horizontal
    +///    differences of both operands.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_hsub_ps(__m256 __a, __m256 __b)
    +{
    +  return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
    +}
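
Since the lane ordering of the horizontal ops regularly trips people up, here
is a small sketch of the result layout (values follow the even/odd description
above):

__m256d a = { 1.0, 2.0, 3.0, 4.0 };
__m256d b = { 10.0, 20.0, 30.0, 40.0 };
__m256d h = _mm256_hadd_pd(a, b);
/* h = { a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3] }
 *   = { 3.0, 30.0, 7.0, 70.0 } */
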
    +
    +/* Vector permutations */
    +/// Copies the values in a 128-bit vector of [2 x double] as specified
    +///    by the 128-bit integer vector operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double].
    +/// \param __c
    +///    A 128-bit integer vector operand specifying how the values are to be
    +///    copied. \n
    +///    Bit [1]: \n
    +///      0: Bits [63:0] of the source are copied to bits [63:0] of the 
returned
    +///         vector. \n
    +///      1: Bits [127:64] of the source are copied to bits [63:0] of the
    +///         returned vector. \n
    +///    Bit [65]: \n
    +///      0: Bits [63:0] of the source are copied to bits [127:64] of the
    +///         returned vector. \n
    +///      1: Bits [127:64] of the source are copied to bits [127:64] of the
    +///         returned vector.
    +/// \returns A 128-bit vector of [2 x double] containing the copied values.
    +static __inline __m128d __DEFAULT_FN_ATTRS128
    +_mm_permutevar_pd(__m128d __a, __m128i __c)
    +{
    +  return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
    +}
    +
    +/// Copies the values in a 256-bit vector of [4 x double] as specified
    +///    by the 256-bit integer vector operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [4 x double].
    +/// \param __c
    +///    A 256-bit integer vector operand specifying how the values are to be
    +///    copied. \n
    +///    Bit [1]: \n
    +///      0: Bits [63:0] of the source are copied to bits [63:0] of the 
returned
    +///         vector. \n
    +///      1: Bits [127:64] of the source are copied to bits [63:0] of the
    +///         returned vector. \n
    +///    Bit [65]: \n
    +///      0: Bits [63:0] of the source are copied to bits [127:64] of the
    +///         returned vector. \n
    +///      1: Bits [127:64] of the source are copied to bits [127:64] of the
    +///         returned vector. \n
    +///    Bit [129]: \n
    +///      0: Bits [191:128] of the source are copied to bits [191:128] of 
the
    +///         returned vector. \n
    +///      1: Bits [255:192] of the source are copied to bits [191:128] of 
the
    +///         returned vector. \n
    +///    Bit [193]: \n
    +///      0: Bits [191:128] of the source are copied to bits [255:192] of 
the
    +///         returned vector. \n
    +///      1: Bits [255:192] of the source are copied to bits [255:192] of 
the
    +///    returned vector.
    +/// \returns A 256-bit vector of [4 x double] containing the copied values.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_permutevar_pd(__m256d __a, __m256i __c)
    +{
    +  return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
    +}
    +
    +/// Copies the values stored in a 128-bit vector of [4 x float] as
    +///    specified by the 128-bit integer vector operand.
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \param __c
    +///    A 128-bit integer vector operand specifying how the values are to be
    +///    copied. \n
    +///    Bits [1:0]: \n
    +///      00: Bits [31:0] of the source are copied to bits [31:0] of the
    +///          returned vector. \n
    +///      01: Bits [63:32] of the source are copied to bits [31:0] of the
    +///          returned vector. \n
    +///      10: Bits [95:64] of the source are copied to bits [31:0] of the
    +///          returned vector. \n
    +///      11: Bits [127:96] of the source are copied to bits [31:0] of the
    +///          returned vector. \n
    +///    Bits [33:32]: \n
    +///      00: Bits [31:0] of the source are copied to bits [63:32] of the
    +///          returned vector. \n
    +///      01: Bits [63:32] of the source are copied to bits [63:32] of the
    +///          returned vector. \n
    +///      10: Bits [95:64] of the source are copied to bits [63:32] of the
    +///          returned vector. \n
    +///      11: Bits [127:96] of the source are copied to bits [63:32] of the
    +///          returned vector. \n
    +///    Bits [65:64]: \n
    +///      00: Bits [31:0] of the source are copied to bits [95:64] of the
    +///          returned vector. \n
    +///      01: Bits [63:32] of the source are copied to bits [95:64] of the
    +///          returned vector. \n
    +///      10: Bits [95:64] of the source are copied to bits [95:64] of the
    +///          returned vector. \n
    +///      11: Bits [127:96] of the source are copied to bits [95:64] of the
    +///          returned vector. \n
    +///    Bits [97:96]: \n
    +///      00: Bits [31:0] of the source are copied to bits [127:96] of the
    +///          returned vector. \n
    +///      01: Bits [63:32] of the source are copied to bits [127:96] of the
    +///          returned vector. \n
    +///      10: Bits [95:64] of the source are copied to bits [127:96] of the
    +///          returned vector. \n
    +///      11: Bits [127:96] of the source are copied to bits [127:96] of the
    +///          returned vector.
    +/// \returns A 128-bit vector of [4 x float] containing the copied values.
    +static __inline __m128 __DEFAULT_FN_ATTRS128
    +_mm_permutevar_ps(__m128 __a, __m128i __c)
    +{
    +  return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
    +}
    +
    +/// Copies the values stored in a 256-bit vector of [8 x float] as
    +///    specified by the 256-bit integer vector operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float].
    +/// \param __c
    +///    A 256-bit integer vector operand specifying how the values are to be
    +///    copied. \n
    +///    Bits [1:0]: \n
    +///      00: Bits [31:0] of the source are copied to bits [31:0] of the
    +///          returned vector. \n
    +///      01: Bits [63:32] of the source are copied to bits [31:0] of the
    +///          returned vector. \n
    +///      10: Bits [95:64] of the source are copied to bits [31:0] of the
    +///          returned vector. \n
    +///      11: Bits [127:96] of the source are copied to bits [31:0] of the
    +///          returned vector. \n
    +///    Bits [33:32]: \n
    +///      00: Bits [31:0] of the source are copied to bits [63:32] of the
    +///          returned vector. \n
    +///      01: Bits [63:32] of the source are copied to bits [63:32] of the
    +///          returned vector. \n
    +///      10: Bits [95:64] of the source are copied to bits [63:32] of the
    +///          returned vector. \n
    +///      11: Bits [127:96] of the source are copied to bits [63:32] of the
    +///          returned vector. \n
    +///    Bits [65:64]: \n
    +///      00: Bits [31:0] of the source are copied to bits [95:64] of the
    +///          returned vector. \n
    +///      01: Bits [63:32] of the source are copied to bits [95:64] of the
    +///          returned vector. \n
    +///      10: Bits [95:64] of the source are copied to bits [95:64] of the
    +///          returned vector. \n
    +///      11: Bits [127:96] of the source are copied to bits [95:64] of the
    +///          returned vector. \n
    +///    Bits [97:96]: \n
    +///      00: Bits [31:0] of the source are copied to bits [127:96] of the
    +///          returned vector. \n
    +///      01: Bits [63:32] of the source are copied to bits [127:96] of the
    +///          returned vector. \n
    +///      10: Bits [95:64] of the source are copied to bits [127:96] of the
    +///          returned vector. \n
    +///      11: Bits [127:96] of the source are copied to bits [127:96] of the
    +///          returned vector. \n
    +///    Bits [129:128]: \n
    +///      00: Bits [159:128] of the source are copied to bits [159:128] of 
the
    +///          returned vector. \n
    +///      01: Bits [191:160] of the source are copied to bits [159:128] of 
the
    +///          returned vector. \n
    +///      10: Bits [223:192] of the source are copied to bits [159:128] of 
the
    +///          returned vector. \n
    +///      11: Bits [255:224] of the source are copied to bits [159:128] of 
the
    +///          returned vector. \n
    +///    Bits [161:160]: \n
    +///      00: Bits [159:128] of the source are copied to bits [191:160] of 
the
    +///          returned vector. \n
    +///      01: Bits [191:160] of the source are copied to bits [191:160] of 
the
    +///          returned vector. \n
    +///      10: Bits [223:192] of the source are copied to bits [191:160] of 
the
    +///          returned vector. \n
    +///      11: Bits [255:224] of the source are copied to bits [191:160] of 
the
    +///          returned vector. \n
    +///    Bits [193:192]: \n
    +///      00: Bits [159:128] of the source are copied to bits [223:192] of 
the
    +///          returned vector. \n
    +///      01: Bits [191:160] of the source are copied to bits [223:192] of 
the
    +///          returned vector. \n
    +///      10: Bits [223:192] of the source are copied to bits [223:192] of 
the
    +///          returned vector. \n
    +///      11: Bits [255:224] of the source are copied to bits [223:192] of 
the
    +///          returned vector. \n
    +///    Bits [225:224]: \n
    +///      00: Bits [159:128] of the source are copied to bits [255:224] of 
the
    +///          returned vector. \n
    +///      01: Bits [191:160] of the source are copied to bits [255:224] of 
the
    +///          returned vector. \n
    +///      10: Bits [223:192] of the source are copied to bits [255:224] of 
the
    +///          returned vector. \n
    +///      11: Bits [255:224] of the source are copied to bits [255:224] of 
the
    +///          returned vector.
    +/// \returns A 256-bit vector of [8 x float] containing the copied values.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_permutevar_ps(__m256 __a, __m256i __c)
    +{
    +  return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
    +}
    +
    +/// Copies the values in a 128-bit vector of [2 x double] as specified
    +///    by the immediate integer operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128d _mm_permute_pd(__m128d A, const int C);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
    +///
    +/// \param A
    +///    A 128-bit vector of [2 x double].
    +/// \param C
    +///    An immediate integer operand specifying how the values are to be
    +///    copied. \n
    +///    Bit [0]: \n
    +///      0: Bits [63:0] of the source are copied to bits [63:0] of the returned
    +///         vector. \n
    +///      1: Bits [127:64] of the source are copied to bits [63:0] of the
    +///         returned vector. \n
    +///    Bit [1]: \n
    +///      0: Bits [63:0] of the source are copied to bits [127:64] of the
    +///         returned vector. \n
    +///      1: Bits [127:64] of the source are copied to bits [127:64] of the
    +///         returned vector.
    +/// \returns A 128-bit vector of [2 x double] containing the copied values.
    +#define _mm_permute_pd(A, C) \
    +  (__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C))
    +
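
A concrete example of the immediate encoding described above (sketch): with
C = 1 only bit 0 is set, so the two lanes swap.

__m128d v       = { 1.0, 2.0 };
__m128d swapped = _mm_permute_pd(v, 1);   /* bit 0 = 1, bit 1 = 0 -> { 2.0, 1.0 } */
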
    +/// Copies the values in a 256-bit vector of [4 x double] as specified by
    +///    the immediate integer operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m256d _mm256_permute_pd(__m256d A, const int C);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
    +///
    +/// \param A
    +///    A 256-bit vector of [4 x double].
    +/// \param C
    +///    An immediate integer operand specifying how the values are to be
    +///    copied. \n
    +///    Bit [0]: \n
    +///      0: Bits [63:0] of the source are copied to bits [63:0] of the 
returned
    +///         vector. \n
    +///      1: Bits [127:64] of the source are copied to bits [63:0] of the
    +///         returned vector. \n
    +///    Bit [1]: \n
    +///      0: Bits [63:0] of the source are copied to bits [127:64] of the
    +///         returned vector. \n
    +///      1: Bits [127:64] of the source are copied to bits [127:64] of the
    +///         returned vector. \n
    +///    Bit [2]: \n
    +///      0: Bits [191:128] of the source are copied to bits [191:128] of 
the
    +///         returned vector. \n
    +///      1: Bits [255:192] of the source are copied to bits [191:128] of 
the
    +///         returned vector. \n
    +///    Bit [3]: \n
    +///      0: Bits [191:128] of the source are copied to bits [255:192] of 
the
    +///         returned vector. \n
    +///      1: Bits [255:192] of the source are copied to bits [255:192] of 
the
    +///         returned vector.
    +/// \returns A 256-bit vector of [4 x double] containing the copied values.
    +#define _mm256_permute_pd(A, C) \
    +  (__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C))
    +
    +/// Copies the values in a 128-bit vector of [4 x float] as specified by
    +///    the immediate integer operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128 _mm_permute_ps(__m128 A, const int C);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
    +///
    +/// \param A
    +///    A 128-bit vector of [4 x float].
    +/// \param C
    +///    An immediate integer operand specifying how the values are to be
    +///    copied. \n
    +///    Bits [1:0]: \n
    +///      00: Bits [31:0] of the source are copied to bits [31:0] of the
    +///          returned vector. \n
    +///      01: Bits [63:32] of the source are copied to bits [31:0] of the
    +///          returned vector. \n
    +///      10: Bits [95:64] of the source are copied to bits [31:0] of the
    +///          returned vector. \n
    +///      11: Bits [127:96] of the source are copied to bits [31:0] of the
    +///          returned vector. \n
    +///    Bits [3:2]: \n
    +///      00: Bits [31:0] of the source are copied to bits [63:32] of the
    +///          returned vector. \n
    +///      01: Bits [63:32] of the source are copied to bits [63:32] of the
    +///          returned vector. \n
    +///      10: Bits [95:64] of the source are copied to bits [63:32] of the
    +///          returned vector. \n
    +///      11: Bits [127:96] of the source are copied to bits [63:32] of the
    +///          returned vector. \n
    +///    Bits [5:4]: \n
    +///      00: Bits [31:0] of the source are copied to bits [95:64] of the
    +///          returned vector. \n
    +///      01: Bits [63:32] of the source are copied to bits [95:64] of the
    +///          returned vector. \n
    +///      10: Bits [95:64] of the source are copied to bits [95:64] of the
    +///          returned vector. \n
    +///      11: Bits [127:96] of the source are copied to bits [95:64] of the
    +///          returned vector. \n
    +///    Bits [7:6]: \n
    +///      00: Bits [31:0] of the source are copied to bits [127:96] of the
    +///          returned vector. \n
    +///      01: Bits [63:32] of the source are copied to bits [127:96] of the
    +///          returned vector. \n
    +///      10: Bits [95:64] of the source are copied to bits [127:96] of the
    +///          returned vector. \n
    +///      11: Bits [127:96] of the source are copied to bits [127:96] of the
    +///          returned vector.
    +/// \returns A 128-bit vector of [4 x float] containing the copied values.
    +#define _mm_permute_ps(A, C) \
    +  (__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C))
    +
    +/// Copies the values in a 256-bit vector of [8 x float] as specified by
    +///    the immediate integer operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m256 _mm256_permute_ps(__m256 A, const int C);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
    +///
    +/// \param A
    +///    A 256-bit vector of [8 x float].
    +/// \param C
    +///    An immediate integer operand specifying how the values are to be
    +///    copied. \n
    +///    Bits [1:0]: \n
    +///      00: Bits [31:0] of the source are copied to bits [31:0] of the
    +///          returned vector. \n
    +///      01: Bits [63:32] of the source are copied to bits [31:0] of the
    +///          returned vector. \n
    +///      10: Bits [95:64] of the source are copied to bits [31:0] of the
    +///          returned vector. \n
    +///      11: Bits [127:96] of the source are copied to bits [31:0] of the
    +///          returned vector. \n
    +///    Bits [3:2]: \n
    +///      00: Bits [31:0] of the source are copied to bits [63:32] of the
    +///          returned vector. \n
    +///      01: Bits [63:32] of the source are copied to bits [63:32] of the
    +///          returned vector. \n
    +///      10: Bits [95:64] of the source are copied to bits [63:32] of the
    +///          returned vector. \n
    +///      11: Bits [127:96] of the source are copied to bits [63:32] of the
    +///          returned vector. \n
    +///    Bits [5:4]: \n
    +///      00: Bits [31:0] of the source are copied to bits [95:64] of the
    +///          returned vector. \n
    +///      01: Bits [63:32] of the source are copied to bits [95:64] of the
    +///          returned vector. \n
    +///      10: Bits [95:64] of the source are copied to bits [95:64] of the
    +///          returned vector. \n
    +///      11: Bits [127:96] of the source are copied to bits [95:64] of the
    +///          returned vector. \n
    +///    Bits [7:6]: \n
    +///      00: Bits [31:0] of the source are copied to bits [127:96] of the
    +///          returned vector. \n
    +///      01: Bits [63:32] of the source are copied to bits [127:96] of the
    +///          returned vector. \n
    +///      10: Bits [95:64] of the source are copied to bits [127:96] of the
    +///          returned vector. \n
    +///      11: Bits [127:96] of the source are copied to bits [127:96] of the
    +///          returned vector. \n
    +///    Bits [1:0]: \n
    +///      00: Bits [159:128] of the source are copied to bits [159:128] of the
    +///          returned vector. \n
    +///      01: Bits [191:160] of the source are copied to bits [159:128] of the
    +///          returned vector. \n
    +///      10: Bits [223:192] of the source are copied to bits [159:128] of the
    +///          returned vector. \n
    +///      11: Bits [255:224] of the source are copied to bits [159:128] of the
    +///          returned vector. \n
    +///    Bits [3:2]: \n
    +///      00: Bits [159:128] of the source are copied to bits [191:160] of the
    +///          returned vector. \n
    +///      01: Bits [191:160] of the source are copied to bits [191:160] of the
    +///          returned vector. \n
    +///      10: Bits [223:192] of the source are copied to bits [191:160] of the
    +///          returned vector. \n
    +///      11: Bits [255:224] of the source are copied to bits [191:160] of the
    +///          returned vector. \n
    +///    Bits [5:4]: \n
    +///      00: Bits [159:128] of the source are copied to bits [223:192] of the
    +///          returned vector. \n
    +///      01: Bits [191:160] of the source are copied to bits [223:192] of the
    +///          returned vector. \n
    +///      10: Bits [223:192] of the source are copied to bits [223:192] of the
    +///          returned vector. \n
    +///      11: Bits [255:224] of the source are copied to bits [223:192] of the
    +///          returned vector. \n
    +///    Bits [7:6]: \n
    +///      00: Bits [159:128] of the source are copied to bits [255:224] of the
    +///          returned vector. \n
    +///      01: Bits [191:160] of the source are copied to bits [255:224] of the
    +///          returned vector. \n
    +///      10: Bits [223:192] of the source are copied to bits [255:224] of the
    +///          returned vector. \n
    +///      11: Bits [255:224] of the source are copied to bits [255:224] of the
    +///          returned vector.
    +/// \returns A 256-bit vector of [8 x float] containing the copied values.
    +#define _mm256_permute_ps(A, C) \
    +  (__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C))
    +
    +/// Permutes 128-bit data values stored in two 256-bit vectors of
    +///    [4 x double], as specified by the immediate integer operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
    +///
    +/// \param V1
    +///    A 256-bit vector of [4 x double].
    +/// \param V2
    +///    A 256-bit vector of [4 x double].
    +/// \param M
    +///    An immediate integer operand specifying how the values are to be
    +///    permuted. \n
    +///    Bits [1:0]: \n
    +///      00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
    +///          destination. \n
    +///      01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
    +///          destination. \n
    +///      10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
    +///          destination. \n
    +///      11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
    +///          destination. \n
    +///    Bits [5:4]: \n
    +///      00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
    +///          destination. \n
    +///      01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
    +///          destination. \n
    +///      10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
    +///          destination. \n
    +///      11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
    +///          destination.
    +/// \returns A 256-bit vector of [4 x double] containing the copied values.
    +#define _mm256_permute2f128_pd(V1, V2, M) \
    +  (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
    +                                           (__v4df)(__m256d)(V2), (int)(M))
    +
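    +/* Usage sketch (illustrative only): swap the two 128-bit halves of a single
    + * [4 x double] vector by passing it as both operands with M = 0x01
    + * (bits [1:0] = 01 selects the upper half of V1 for the low lanes,
    + * bits [5:4] = 00 selects the lower half of V2 for the high lanes).
    + *
    + *   __m256d v = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);  // v = {1, 2, 3, 4}
    + *   __m256d r = _mm256_permute2f128_pd(v, v, 0x01); // r = {3, 4, 1, 2}
    + */
    +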
    +/// Permutes 128-bit data values stored in two 256-bit vectors of
    +///    [8 x float], as specified by the immediate integer operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
    +///
    +/// \param V1
    +///    A 256-bit vector of [8 x float].
    +/// \param V2
    +///    A 256-bit vector of [8 x float].
    +/// \param M
    +///    An immediate integer operand specifying how the values are to be
    +///    permuted. \n
    +///    Bits [1:0]: \n
    +///    00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
    +///    destination. \n
    +///    01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
    +///    destination. \n
    +///    10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
    +///    destination. \n
    +///    11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
    +///    destination. \n
    +///    Bits [5:4]: \n
    +///    00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
    +///    destination. \n
    +///    01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
    +///    destination. \n
    +///    10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
    +///    destination. \n
    +///    11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
    +///    destination.
    +/// \returns A 256-bit vector of [8 x float] containing the copied values.
    +#define _mm256_permute2f128_ps(V1, V2, M) \
    +  (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
    +                                          (__v8sf)(__m256)(V2), (int)(M))
    +
    +/// Permutes 128-bit data values stored in two 256-bit integer vectors,
    +///    as specified by the immediate integer operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
    +///
    +/// \param V1
    +///    A 256-bit integer vector.
    +/// \param V2
    +///    A 256-bit integer vector.
    +/// \param M
    +///    An immediate integer operand specifying how the values are to be copied.
    +///    Bits [1:0]: \n
    +///    00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
    +///    destination. \n
    +///    01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
    +///    destination. \n
    +///    10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
    +///    destination. \n
    +///    11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
    +///    destination. \n
    +///    Bits [5:4]: \n
    +///    00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
    +///    destination. \n
    +///    01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
    +///    destination. \n
    +///    10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
    +///    destination. \n
    +///    11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
    +///    destination.
    +/// \returns A 256-bit integer vector containing the copied values.
    +#define _mm256_permute2f128_si256(V1, V2, M) \
    +  (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
    +                                           (__v8si)(__m256i)(V2), (int)(M))
    +
    +/* Vector Blend */
    +/// Merges 64-bit double-precision data values stored in either of the
    +///    two 256-bit vectors of [4 x double], as specified by the immediate
    +///    integer operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
    +///
    +/// \param V1
    +///    A 256-bit vector of [4 x double].
    +/// \param V2
    +///    A 256-bit vector of [4 x double].
    +/// \param M
    +///    An immediate integer operand, with mask bits [3:0] specifying how the
    +///    values are to be copied. The position of the mask bit corresponds to the
    +///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
    +///    element in operand \a V1 is copied to the same position in the
    +///    destination. When a mask bit is 1, the corresponding 64-bit element in
    +///    operand \a V2 is copied to the same position in the destination.
    +/// \returns A 256-bit vector of [4 x double] containing the copied values.
    +#define _mm256_blend_pd(V1, V2, M) \
    +  (__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
    +                                     (__v4df)(__m256d)(V2), (int)(M))
    +
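    +/* Usage sketch (illustrative only): with M = 0x5 (binary 0101), elements 0
    + * and 2 are taken from the second operand and elements 1 and 3 from the
    + * first.
    + *
    + *   // given __m256d a, b:
    + *   __m256d r = _mm256_blend_pd(a, b, 0x5); // r = {b[0], a[1], b[2], a[3]}
    + */
    +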
    +/// Merges 32-bit single-precision data values stored in either of the
    +///    two 256-bit vectors of [8 x float], as specified by the immediate
    +///    integer operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
    +///
    +/// \param V1
    +///    A 256-bit vector of [8 x float].
    +/// \param V2
    +///    A 256-bit vector of [8 x float].
    +/// \param M
    +///    An immediate integer operand, with mask bits [7:0] specifying how the
    +///    values are to be copied. The position of the mask bit corresponds to the
    +///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
    +///    element in operand \a V1 is copied to the same position in the
    +///    destination. When a mask bit is 1, the corresponding 32-bit element in
    +///    operand \a V2 is copied to the same position in the destination.
    +/// \returns A 256-bit vector of [8 x float] containing the copied values.
    +#define _mm256_blend_ps(V1, V2, M) \
    +  (__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
    +                                    (__v8sf)(__m256)(V2), (int)(M))
    +
    +/// Merges 64-bit double-precision data values stored in either of the
    +///    two 256-bit vectors of [4 x double], as specified by the 256-bit vector
    +///    operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [4 x double].
    +/// \param __b
    +///    A 256-bit vector of [4 x double].
    +/// \param __c
    +///    A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
    +///    how the values are to be copied. The position of the mask bit corresponds
    +///    to the most significant bit of a copied value. When a mask bit is 0, the
    +///    corresponding 64-bit element in operand \a __a is copied to the same
    +///    position in the destination. When a mask bit is 1, the corresponding
    +///    64-bit element in operand \a __b is copied to the same position in the
    +///    destination.
    +/// \returns A 256-bit vector of [4 x double] containing the copied values.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
    +{
    +  return (__m256d)__builtin_ia32_blendvpd256(
    +    (__v4df)__a, (__v4df)__b, (__v4df)__c);
    +}
    +
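    +/* Usage sketch (illustrative only): an element-wise maximum built from a
    + * compare mask, selecting from the second operand wherever a < b. Uses the
    + * _CMP_* constants defined further below in this header.
    + *
    + *   // given __m256d a, b:
    + *   __m256d m  = _mm256_cmp_pd(a, b, _CMP_LT_OS); // all-ones where a < b
    + *   __m256d mx = _mm256_blendv_pd(a, b, m);       // per-element max(a, b)
    + */
    +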
    +/// Merges 32-bit single-precision data values stored in either of the
    +///    two 256-bit vectors of [8 x float], as specified by the 256-bit vector
    +///    operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float].
    +/// \param __b
    +///    A 256-bit vector of [8 x float].
    +/// \param __c
    +///    A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
    +///    and 31 specifying how the values are to be copied. The position of the
    +///    mask bit corresponds to the most significant bit of a copied value. When
    +///    a mask bit is 0, the corresponding 32-bit element in operand \a __a is
    +///    copied to the same position in the destination. When a mask bit is 1, the
    +///    corresponding 32-bit element in operand \a __b is copied to the same
    +///    position in the destination.
    +/// \returns A 256-bit vector of [8 x float] containing the copied values.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
    +{
    +  return (__m256)__builtin_ia32_blendvps256(
    +    (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
    +}
    +
    +/* Vector Dot Product */
    +/// Computes two dot products in parallel, using the lower and upper
    +///    halves of two [8 x float] vectors as input to the two computations, and
    +///    returning the two dot products in the lower and upper halves of the
    +///    [8 x float] result.
    +///
    +///    The immediate integer operand controls which input elements will
    +///    contribute to the dot product, and where the final results are returned.
    +///    In general, for each dot product, the four corresponding elements of the
    +///    input vectors are multiplied; the first two and second two products are
    +///    summed, then the two sums are added to form the final result.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
    +///
    +/// \param V1
    +///    A vector of [8 x float] values, treated as two [4 x float] vectors.
    +/// \param V2
    +///    A vector of [8 x float] values, treated as two [4 x float] vectors.
    +/// \param M
    +///    An immediate integer argument. Bits [7:4] determine which elements of
    +///    the input vectors are used, with bit [4] corresponding to the lowest
    +///    element and bit [7] corresponding to the highest element of each [4 x
    +///    float] subvector. If a bit is set, the corresponding elements from the
    +///    two input vectors are used as an input for dot product; otherwise that
    +///    input is treated as zero. Bits [3:0] determine which elements of the
    +///    result will receive a copy of the final dot product, with bit [0]
    +///    corresponding to the lowest element and bit [3] corresponding to the
    +///    highest element of each [4 x float] subvector. If a bit is set, the dot
    +///    product is returned in the corresponding element; otherwise that element
    +///    is set to zero. The bitmask is applied in the same way to each of the
    +///    two parallel dot product computations.
    +/// \returns A 256-bit vector of [8 x float] containing the two dot products.
    +#define _mm256_dp_ps(V1, V2, M) \
    +  (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
    +                                 (__v8sf)(__m256)(V2), (M))
    +
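    +/* Usage sketch (illustrative only): M = 0xF1 uses all four elements of each
    + * 128-bit half (bits [7:4] = 1111) and stores each dot product in the lowest
    + * element of its half (bits [3:0] = 0001), zeroing the other elements.
    + *
    + *   // given __m256 a, b:
    + *   __m256 r = _mm256_dp_ps(a, b, 0xF1);
    + *   // r[0] = dot(a[0..3], b[0..3]), r[4] = dot(a[4..7], b[4..7])
    + */
    +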
    +/* Vector shuffle */
    +/// Selects 8 float values from the 256-bit operands of [8 x float], as
    +///    specified by the immediate value operand.
    +///
    +///    The four selected elements in each operand are copied to the destination
    +///    according to the bits specified in the immediate operand. The selected
    +///    elements from the first 256-bit operand are copied to bits [63:0] and
    +///    bits [191:128] of the destination, and the selected elements from the
    +///    second 256-bit operand are copied to bits [127:64] and bits [255:192] of
    +///    the destination. For example, if bits [7:0] of the immediate operand
    +///    contain a value of 0xFF, the 256-bit destination vector would contain the
    +///    following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
    +///
    +/// \param a
    +///    A 256-bit vector of [8 x float]. The four selected elements in this
    +///    operand are copied to bits [63:0] and bits [191:128] in the destination,
    +///    according to the bits specified in the immediate operand.
    +/// \param b
    +///    A 256-bit vector of [8 x float]. The four selected elements in this
    +///    operand are copied to bits [127:64] and bits [255:192] in the
    +///    destination, according to the bits specified in the immediate operand.
    +/// \param mask
    +///    An immediate value containing an 8-bit value specifying which elements to
    +///    copy from \a a and \a b. \n
    +///    Bits [3:0] specify the values copied from operand \a a. \n
    +///    Bits [7:4] specify the values copied from operand \a b. \n
    +///    The destinations within the 256-bit destination are assigned values as
    +///    follows, according to the bit value assignments described below: \n
    +///    Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
    +///    destination. \n
    +///    Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
    +///    destination. \n
    +///    Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
    +///    destination. \n
    +///    Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
    +///    the destination. \n
    +///    Bit value assignments: \n
    +///    00: Bits [31:0] and [159:128] are copied from the selected operand. \n
    +///    01: Bits [63:32] and [191:160] are copied from the selected operand. \n
    +///    10: Bits [95:64] and [223:192] are copied from the selected operand. \n
    +///    11: Bits [127:96] and [255:224] are copied from the selected operand.
    +/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
    +#define _mm256_shuffle_ps(a, b, mask) \
    +  (__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
    +                                   (__v8sf)(__m256)(b), (int)(mask))
    +
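    +/* Usage sketch (illustrative only): mask 0x00 selects element 0 of each
    + * 128-bit lane from both operands.
    + *
    + *   // given __m256 a, b:
    + *   __m256 r = _mm256_shuffle_ps(a, b, 0x00);
    + *   // r = {a[0], a[0], b[0], b[0], a[4], a[4], b[4], b[4]}
    + */
    +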
    +/// Selects four double-precision values from the 256-bit operands of
    +///    [4 x double], as specified by the immediate value operand.
    +///
    +///    The selected elements from the first 256-bit operand are copied to bits
    +///    [63:0] and bits [191:128] in the destination, and the selected elements
    +///    from the second 256-bit operand are copied to bits [127:64] and bits
    +///    [255:192] in the destination. For example, if bits [3:0] of the immediate
    +///    operand contain a value of 0xF, the 256-bit destination vector would
    +///    contain the following values: b[3], a[3], b[1], a[1].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
    +///
    +/// \param a
    +///    A 256-bit vector of [4 x double].
    +/// \param b
    +///    A 256-bit vector of [4 x double].
    +/// \param mask
    +///    An immediate value containing 8-bit values specifying which elements to
    +///    copy from \a a and \a b: \n
    +///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
    +///    destination. \n
    +///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
    +///    destination. \n
    +///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
    +///    destination. \n
    +///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
    +///    destination. \n
    +///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
    +///    destination. \n
    +///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
    +///    destination. \n
    +///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
    +///    destination. \n
    +///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
    +///    destination.
    +/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
    +#define _mm256_shuffle_pd(a, b, mask) \
    +  (__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
    +                                    (__v4df)(__m256d)(b), (int)(mask))
    +
    +/* Compare */
    +#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
    +#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
    +#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
    +#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
    +#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
    +#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
    +#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
    +#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
    +#define _CMP_EQ_UQ    0x08 /* Equal (unordered, non-signaling)  */
    +#define _CMP_NGE_US   0x09 /* Not-greater-than-or-equal (unordered, signaling)  */
    +#define _CMP_NGT_US   0x0a /* Not-greater-than (unordered, signaling)  */
    +#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling)  */
    +#define _CMP_NEQ_OQ   0x0c /* Not-equal (ordered, non-signaling)  */
    +#define _CMP_GE_OS    0x0d /* Greater-than-or-equal (ordered, signaling)  */
    +#define _CMP_GT_OS    0x0e /* Greater-than (ordered, signaling)  */
    +#define _CMP_TRUE_UQ  0x0f /* True (unordered, non-signaling)  */
    +#define _CMP_EQ_OS    0x10 /* Equal (ordered, signaling)  */
    +#define _CMP_LT_OQ    0x11 /* Less-than (ordered, non-signaling)  */
    +#define _CMP_LE_OQ    0x12 /* Less-than-or-equal (ordered, non-signaling)  */
    +#define _CMP_UNORD_S  0x13 /* Unordered (signaling)  */
    +#define _CMP_NEQ_US   0x14 /* Not-equal (unordered, signaling)  */
    +#define _CMP_NLT_UQ   0x15 /* Not-less-than (unordered, non-signaling)  */
    +#define _CMP_NLE_UQ   0x16 /* Not-less-than-or-equal (unordered, non-signaling)  */
    +#define _CMP_ORD_S    0x17 /* Ordered (signaling)  */
    +#define _CMP_EQ_US    0x18 /* Equal (unordered, signaling)  */
    +#define _CMP_NGE_UQ   0x19 /* Not-greater-than-or-equal (unordered, non-signaling)  */
    +#define _CMP_NGT_UQ   0x1a /* Not-greater-than (unordered, non-signaling)  */
    +#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling)  */
    +#define _CMP_NEQ_OS   0x1c /* Not-equal (ordered, signaling)  */
    +#define _CMP_GE_OQ    0x1d /* Greater-than-or-equal (ordered, non-signaling)  */
    +#define _CMP_GT_OQ    0x1e /* Greater-than (ordered, non-signaling)  */
    +#define _CMP_TRUE_US  0x1f /* True (unordered, signaling)  */
    +
    +/// Compares each of the corresponding double-precision values of two
    +///    128-bit vectors of [2 x double], using the operation specified by the
    +///    immediate integer operand.
    +///
    +///    Returns a [2 x double] vector consisting of two doubles corresponding to
    +///    the two comparison results: zero if the comparison is false, and all 1's
    +///    if the comparison is true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
    +///
    +/// \param a
    +///    A 128-bit vector of [2 x double].
    +/// \param b
    +///    A 128-bit vector of [2 x double].
    +/// \param c
    +///    An immediate integer operand, with bits [4:0] specifying which comparison
    +///    operation to use: \n
    +///    0x00: Equal (ordered, non-signaling) \n
    +///    0x01: Less-than (ordered, signaling) \n
    +///    0x02: Less-than-or-equal (ordered, signaling) \n
    +///    0x03: Unordered (non-signaling) \n
    +///    0x04: Not-equal (unordered, non-signaling) \n
    +///    0x05: Not-less-than (unordered, signaling) \n
    +///    0x06: Not-less-than-or-equal (unordered, signaling) \n
    +///    0x07: Ordered (non-signaling) \n
    +///    0x08: Equal (unordered, non-signaling) \n
    +///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
    +///    0x0A: Not-greater-than (unordered, signaling) \n
    +///    0x0B: False (ordered, non-signaling) \n
    +///    0x0C: Not-equal (ordered, non-signaling) \n
    +///    0x0D: Greater-than-or-equal (ordered, signaling) \n
    +///    0x0E: Greater-than (ordered, signaling) \n
    +///    0x0F: True (unordered, non-signaling) \n
    +///    0x10: Equal (ordered, signaling) \n
    +///    0x11: Less-than (ordered, non-signaling) \n
    +///    0x12: Less-than-or-equal (ordered, non-signaling) \n
    +///    0x13: Unordered (signaling) \n
    +///    0x14: Not-equal (unordered, signaling) \n
    +///    0x15: Not-less-than (unordered, non-signaling) \n
    +///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
    +///    0x17: Ordered (signaling) \n
    +///    0x18: Equal (unordered, signaling) \n
    +///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
    +///    0x1A: Not-greater-than (unordered, non-signaling) \n
    +///    0x1B: False (ordered, signaling) \n
    +///    0x1C: Not-equal (ordered, signaling) \n
    +///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
    +///    0x1E: Greater-than (ordered, non-signaling) \n
    +///    0x1F: True (unordered, signaling)
    +/// \returns A 128-bit vector of [2 x double] containing the comparison results.
    +#define _mm_cmp_pd(a, b, c) \
    +  (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
    +                                (__v2df)(__m128d)(b), (c))
    +
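    +/* Usage sketch (illustrative only): the _CMP_* constants above are the
    + * intended values for the predicate argument, e.g. an ordered less-than:
    + *
    + *   // given __m128d a, b:
    + *   __m128d m = _mm_cmp_pd(a, b, _CMP_LT_OS); // all-ones where a[i] < b[i]
    + */
    +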
    +/// Compares each of the corresponding values of two 128-bit vectors of
    +///    [4 x float], using the operation specified by the immediate integer
    +///    operand.
    +///
    +///    Returns a [4 x float] vector consisting of four floats corresponding to
    +///    the four comparison results: zero if the comparison is false, and all 1's
    +///    if the comparison is true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
    +///
    +/// \param a
    +///    A 128-bit vector of [4 x float].
    +/// \param b
    +///    A 128-bit vector of [4 x float].
    +/// \param c
    +///    An immediate integer operand, with bits [4:0] specifying which comparison
    +///    operation to use: \n
    +///    0x00: Equal (ordered, non-signaling) \n
    +///    0x01: Less-than (ordered, signaling) \n
    +///    0x02: Less-than-or-equal (ordered, signaling) \n
    +///    0x03: Unordered (non-signaling) \n
    +///    0x04: Not-equal (unordered, non-signaling) \n
    +///    0x05: Not-less-than (unordered, signaling) \n
    +///    0x06: Not-less-than-or-equal (unordered, signaling) \n
    +///    0x07: Ordered (non-signaling) \n
    +///    0x08: Equal (unordered, non-signaling) \n
    +///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
    +///    0x0A: Not-greater-than (unordered, signaling) \n
    +///    0x0B: False (ordered, non-signaling) \n
    +///    0x0C: Not-equal (ordered, non-signaling) \n
    +///    0x0D: Greater-than-or-equal (ordered, signaling) \n
    +///    0x0E: Greater-than (ordered, signaling) \n
    +///    0x0F: True (unordered, non-signaling) \n
    +///    0x10: Equal (ordered, signaling) \n
    +///    0x11: Less-than (ordered, non-signaling) \n
    +///    0x12: Less-than-or-equal (ordered, non-signaling) \n
    +///    0x13: Unordered (signaling) \n
    +///    0x14: Not-equal (unordered, signaling) \n
    +///    0x15: Not-less-than (unordered, non-signaling) \n
    +///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
    +///    0x17: Ordered (signaling) \n
    +///    0x18: Equal (unordered, signaling) \n
    +///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
    +///    0x1A: Not-greater-than (unordered, non-signaling) \n
    +///    0x1B: False (ordered, signaling) \n
    +///    0x1C: Not-equal (ordered, signaling) \n
    +///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
    +///    0x1E: Greater-than (ordered, non-signaling) \n
    +///    0x1F: True (unordered, signaling)
    +/// \returns A 128-bit vector of [4 x float] containing the comparison results.
    +#define _mm_cmp_ps(a, b, c) \
    +  (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
    +                               (__v4sf)(__m128)(b), (c))
    +
    +/// Compares each of the corresponding double-precision values of two
    +///    256-bit vectors of [4 x double], using the operation specified by the
    +///    immediate integer operand.
    +///
    +///    Returns a [4 x double] vector consisting of four doubles corresponding to
    +///    the four comparison results: zero if the comparison is false, and all 1's
    +///    if the comparison is true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
    +///
    +/// \param a
    +///    A 256-bit vector of [4 x double].
    +/// \param b
    +///    A 256-bit vector of [4 x double].
    +/// \param c
    +///    An immediate integer operand, with bits [4:0] specifying which comparison
    +///    operation to use: \n
    +///    0x00: Equal (ordered, non-signaling) \n
    +///    0x01: Less-than (ordered, signaling) \n
    +///    0x02: Less-than-or-equal (ordered, signaling) \n
    +///    0x03: Unordered (non-signaling) \n
    +///    0x04: Not-equal (unordered, non-signaling) \n
    +///    0x05: Not-less-than (unordered, signaling) \n
    +///    0x06: Not-less-than-or-equal (unordered, signaling) \n
    +///    0x07: Ordered (non-signaling) \n
    +///    0x08: Equal (unordered, non-signaling) \n
    +///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
    +///    0x0A: Not-greater-than (unordered, signaling) \n
    +///    0x0B: False (ordered, non-signaling) \n
    +///    0x0C: Not-equal (ordered, non-signaling) \n
    +///    0x0D: Greater-than-or-equal (ordered, signaling) \n
    +///    0x0E: Greater-than (ordered, signaling) \n
    +///    0x0F: True (unordered, non-signaling) \n
    +///    0x10: Equal (ordered, signaling) \n
    +///    0x11: Less-than (ordered, non-signaling) \n
    +///    0x12: Less-than-or-equal (ordered, non-signaling) \n
    +///    0x13: Unordered (signaling) \n
    +///    0x14: Not-equal (unordered, signaling) \n
    +///    0x15: Not-less-than (unordered, non-signaling) \n
    +///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
    +///    0x17: Ordered (signaling) \n
    +///    0x18: Equal (unordered, signaling) \n
    +///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
    +///    0x1A: Not-greater-than (unordered, non-signaling) \n
    +///    0x1B: False (ordered, signaling) \n
    +///    0x1C: Not-equal (ordered, signaling) \n
    +///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
    +///    0x1E: Greater-than (ordered, non-signaling) \n
    +///    0x1F: True (unordered, signaling)
    +/// \returns A 256-bit vector of [4 x double] containing the comparison results.
    +#define _mm256_cmp_pd(a, b, c) \
    +  (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
    +                                   (__v4df)(__m256d)(b), (c))
    +
    +/// Compares each of the corresponding values of two 256-bit vectors of
    +///    [8 x float], using the operation specified by the immediate integer
    +///    operand.
    +///
    +///    Returns a [8 x float] vector consisting of eight floats corresponding to
    +///    the eight comparison results: zero if the comparison is false, and all
    +///    1's if the comparison is true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
    +///
    +/// \param a
    +///    A 256-bit vector of [8 x float].
    +/// \param b
    +///    A 256-bit vector of [8 x float].
    +/// \param c
    +///    An immediate integer operand, with bits [4:0] specifying which comparison
    +///    operation to use: \n
    +///    0x00: Equal (ordered, non-signaling) \n
    +///    0x01: Less-than (ordered, signaling) \n
    +///    0x02: Less-than-or-equal (ordered, signaling) \n
    +///    0x03: Unordered (non-signaling) \n
    +///    0x04: Not-equal (unordered, non-signaling) \n
    +///    0x05: Not-less-than (unordered, signaling) \n
    +///    0x06: Not-less-than-or-equal (unordered, signaling) \n
    +///    0x07: Ordered (non-signaling) \n
    +///    0x08: Equal (unordered, non-signaling) \n
    +///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
    +///    0x0A: Not-greater-than (unordered, signaling) \n
    +///    0x0B: False (ordered, non-signaling) \n
    +///    0x0C: Not-equal (ordered, non-signaling) \n
    +///    0x0D: Greater-than-or-equal (ordered, signaling) \n
    +///    0x0E: Greater-than (ordered, signaling) \n
    +///    0x0F: True (unordered, non-signaling) \n
    +///    0x10: Equal (ordered, signaling) \n
    +///    0x11: Less-than (ordered, non-signaling) \n
    +///    0x12: Less-than-or-equal (ordered, non-signaling) \n
    +///    0x13: Unordered (signaling) \n
    +///    0x14: Not-equal (unordered, signaling) \n
    +///    0x15: Not-less-than (unordered, non-signaling) \n
    +///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
    +///    0x17: Ordered (signaling) \n
    +///    0x18: Equal (unordered, signaling) \n
    +///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
    +///    0x1A: Not-greater-than (unordered, non-signaling) \n
    +///    0x1B: False (ordered, signaling) \n
    +///    0x1C: Not-equal (ordered, signaling) \n
    +///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
    +///    0x1E: Greater-than (ordered, non-signaling) \n
    +///    0x1F: True (unordered, signaling)
    +/// \returns A 256-bit vector of [8 x float] containing the comparison results.
    +#define _mm256_cmp_ps(a, b, c) \
    +  (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
    +                                  (__v8sf)(__m256)(b), (c))
    +
    +/// Compares each of the corresponding scalar double-precision values of
    +///    two 128-bit vectors of [2 x double], using the operation specified by the
    +///    immediate integer operand.
    +///
    +///    If the result is true, all 64 bits of the destination vector are set;
    +///    otherwise they are cleared.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
    +///
    +/// \param a
    +///    A 128-bit vector of [2 x double].
    +/// \param b
    +///    A 128-bit vector of [2 x double].
    +/// \param c
    +///    An immediate integer operand, with bits [4:0] specifying which comparison
    +///    operation to use: \n
    +///    0x00: Equal (ordered, non-signaling) \n
    +///    0x01: Less-than (ordered, signaling) \n
    +///    0x02: Less-than-or-equal (ordered, signaling) \n
    +///    0x03: Unordered (non-signaling) \n
    +///    0x04: Not-equal (unordered, non-signaling) \n
    +///    0x05: Not-less-than (unordered, signaling) \n
    +///    0x06: Not-less-than-or-equal (unordered, signaling) \n
    +///    0x07: Ordered (non-signaling) \n
    +///    0x08: Equal (unordered, non-signaling) \n
    +///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
    +///    0x0A: Not-greater-than (unordered, signaling) \n
    +///    0x0B: False (ordered, non-signaling) \n
    +///    0x0C: Not-equal (ordered, non-signaling) \n
    +///    0x0D: Greater-than-or-equal (ordered, signaling) \n
    +///    0x0E: Greater-than (ordered, signaling) \n
    +///    0x0F: True (unordered, non-signaling) \n
    +///    0x10: Equal (ordered, signaling) \n
    +///    0x11: Less-than (ordered, non-signaling) \n
    +///    0x12: Less-than-or-equal (ordered, non-signaling) \n
    +///    0x13: Unordered (signaling) \n
    +///    0x14: Not-equal (unordered, signaling) \n
    +///    0x15: Not-less-than (unordered, non-signaling) \n
    +///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
    +///    0x17: Ordered (signaling) \n
    +///    0x18: Equal (unordered, signaling) \n
    +///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
    +///    0x1A: Not-greater-than (unordered, non-signaling) \n
    +///    0x1B: False (ordered, signaling) \n
    +///    0x1C: Not-equal (ordered, signaling) \n
    +///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
    +///    0x1E: Greater-than (ordered, non-signaling) \n
    +///    0x1F: True (unordered, signaling)
    +/// \returns A 128-bit vector of [2 x double] containing the comparison results.
    +#define _mm_cmp_sd(a, b, c) \
    +  (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
    +                                (__v2df)(__m128d)(b), (c))
    +
    +/// Compares each of the corresponding scalar values of two 128-bit
    +///    vectors of [4 x float], using the operation specified by the immediate
    +///    integer operand.
    +///
    +///    If the result is true, all 32 bits of the destination vector are set;
    +///    otherwise they are cleared.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
    +///
    +/// \param a
    +///    A 128-bit vector of [4 x float].
    +/// \param b
    +///    A 128-bit vector of [4 x float].
    +/// \param c
    +///    An immediate integer operand, with bits [4:0] specifying which comparison
    +///    operation to use: \n
    +///    0x00: Equal (ordered, non-signaling) \n
    +///    0x01: Less-than (ordered, signaling) \n
    +///    0x02: Less-than-or-equal (ordered, signaling) \n
    +///    0x03: Unordered (non-signaling) \n
    +///    0x04: Not-equal (unordered, non-signaling) \n
    +///    0x05: Not-less-than (unordered, signaling) \n
    +///    0x06: Not-less-than-or-equal (unordered, signaling) \n
    +///    0x07: Ordered (non-signaling) \n
    +///    0x08: Equal (unordered, non-signaling) \n
    +///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
    +///    0x0A: Not-greater-than (unordered, signaling) \n
    +///    0x0B: False (ordered, non-signaling) \n
    +///    0x0C: Not-equal (ordered, non-signaling) \n
    +///    0x0D: Greater-than-or-equal (ordered, signaling) \n
    +///    0x0E: Greater-than (ordered, signaling) \n
    +///    0x0F: True (unordered, non-signaling) \n
    +///    0x10: Equal (ordered, signaling) \n
    +///    0x11: Less-than (ordered, non-signaling) \n
    +///    0x12: Less-than-or-equal (ordered, non-signaling) \n
    +///    0x13: Unordered (signaling) \n
    +///    0x14: Not-equal (unordered, signaling) \n
    +///    0x15: Not-less-than (unordered, non-signaling) \n
    +///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
    +///    0x17: Ordered (signaling) \n
    +///    0x18: Equal (unordered, signaling) \n
    +///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
    +///    0x1A: Not-greater-than (unordered, non-signaling) \n
    +///    0x1B: False (ordered, signaling) \n
    +///    0x1C: Not-equal (ordered, signaling) \n
    +///    0x1D: Greater-than-or-equal (ordered, non-signaling) \n
    +///    0x1E: Greater-than (ordered, non-signaling) \n
    +///    0x1F: True (unordered, signaling)
    +/// \returns A 128-bit vector of [4 x float] containing the comparison results.
    +#define _mm_cmp_ss(a, b, c) \
    +  (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
    +                               (__v4sf)(__m128)(b), (c))
    +
    +/// Takes a [8 x i32] vector and returns the vector element value
    +///    indexed by the immediate constant operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x i32].
    +/// \param __imm
    +///    An immediate integer operand with bits [2:0] determining which vector
    +///    element is extracted and returned.
    +/// \returns A 32-bit integer containing the extracted 32 bits of extended
    +///    packed data.
    +#define _mm256_extract_epi32(X, N) \
    +  (int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N))
    +
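    +/* Usage sketch (illustrative only): read the element at index 3.
    + *
    + *   __m256i v = _mm256_set1_epi32(7);
    + *   int e = _mm256_extract_epi32(v, 3); // e == 7
    + */
    +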
    +/// Takes a [16 x i16] vector and returns the vector element value
    +///    indexed by the immediate constant operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A 256-bit integer vector of [16 x i16].
    +/// \param __imm
    +///    An immediate integer operand with bits [3:0] determining which vector
    +///    element is extracted and returned.
    +/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
    +///    packed data.
    +#define _mm256_extract_epi16(X, N) \
    +  (int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
    +                                                    (int)(N))
    +
    +/// Takes a [32 x i8] vector and returns the vector element value
    +///    indexed by the immediate constant operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A 256-bit integer vector of [32 x i8].
    +/// \param __imm
    +///    An immediate integer operand with bits [4:0] determining which vector
    +///    element is extracted and returned.
    +/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
    +///    packed data.
    +#define _mm256_extract_epi8(X, N) \
    +  (int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
    +                                                   (int)(N))
    +
    +#ifdef __x86_64__
    +/// Takes a [4 x i64] vector and returns the vector element value
    +///    indexed by the immediate constant operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A 256-bit integer vector of [4 x i64].
    +/// \param __imm
    +///    An immediate integer operand with bits [1:0] determining which vector
    +///    element is extracted and returned.
    +/// \returns A 64-bit integer containing the extracted 64 bits of extended
    +///    packed data.
    +#define _mm256_extract_epi64(X, N) \
    +  (long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N))
    +#endif
    +
    +/// Takes a [8 x i32] vector and replaces the vector element value
    +///    indexed by the immediate constant operand by a new value. Returns the
    +///    modified vector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A vector of [8 x i32] to be used by the insert operation.
    +/// \param __b
    +///    An integer value. The replacement value for the insert operation.
    +/// \param __imm
    +///    An immediate integer specifying the index of the vector element to be
    +///    replaced.
    +/// \returns A copy of vector \a __a, after replacing its element indexed by
    +///    \a __imm with \a __b.
    +#define _mm256_insert_epi32(X, I, N) \
    +  (__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
    +                                       (int)(I), (int)(N))
    +
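    +/* Usage sketch (illustrative only): replace element 5 of an [8 x i32] vector.
    + *
    + *   __m256i v = _mm256_set1_epi32(0);
    + *   __m256i r = _mm256_insert_epi32(v, 42, 5); // element 5 becomes 42
    + */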
    +
    +/// Takes a [16 x i16] vector and replaces the vector element value
    +///    indexed by the immediate constant operand with a new value. Returns the
    +///    modified vector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A vector of [16 x i16] to be used by the insert operation.
    +/// \param __b
    +///    An i16 integer value. The replacement value for the insert operation.
    +/// \param __imm
    +///    An immediate integer specifying the index of the vector element to be
    +///    replaced.
    +/// \returns A copy of vector \a __a, after replacing its element indexed by
    +///    \a __imm with \a __b.
    +#define _mm256_insert_epi16(X, I, N) \
    +  (__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
    +                                        (int)(I), (int)(N))
    +
    +/// Takes a [32 x i8] vector and replaces the vector element value
    +///    indexed by the immediate constant operand with a new value. Returns the
    +///    modified vector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A vector of [32 x i8] to be used by the insert operation.
    +/// \param __b
    +///    An i8 integer value. The replacement value for the insert operation.
    +/// \param __imm
    +///    An immediate integer specifying the index of the vector element to be
    +///    replaced.
    +/// \returns A copy of vector \a __a, after replacing its element indexed by
    +///    \a __imm with \a __b.
    +#define _mm256_insert_epi8(X, I, N) \
    +  (__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
    +                                        (int)(I), (int)(N))
    +
    +#ifdef __x86_64__
    +/// Takes a [4 x i64] vector and replaces the vector element value
    +///    indexed by the immediate constant operand with a new value. Returns the
    +///    modified vector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A vector of [4 x i64] to be used by the insert operation.
    +/// \param __b
    +///    A 64-bit integer value. The replacement value for the insert operation.
    +/// \param __imm
    +///    An immediate integer specifying the index of the vector element to be
    +///    replaced.
    +/// \returns A copy of vector \a __a, after replacing its element indexed by
    +///     \a __imm with \a __b.
    +#define _mm256_insert_epi64(X, I, N) \
    +  (__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
    +                                       (long long)(I), (int)(N))
    +#endif
    +
    +/* Conversion */
    +/// Converts a vector of [4 x i32] into a vector of [4 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector of [4 x i32].
    +/// \returns A 256-bit vector of [4 x double] containing the converted values.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_cvtepi32_pd(__m128i __a)
    +{
    +  return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
    +}
    +
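    +/* Usage sketch (illustrative only): widen four 32-bit integers to doubles.
    + *
    + *   __m128i v = _mm_set_epi32(4, 3, 2, 1); // v = {1, 2, 3, 4}
    + *   __m256d d = _mm256_cvtepi32_pd(v);     // d = {1.0, 2.0, 3.0, 4.0}
    + */
    +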
    +/// Converts a vector of [8 x i32] into a vector of [8 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit integer vector.
    +/// \returns A 256-bit vector of [8 x float] containing the converted values.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_cvtepi32_ps(__m256i __a)
    +{
    +  return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
    +}
    +
    +/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
    +///    [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [4 x double].
    +/// \returns A 128-bit vector of [4 x float] containing the converted values.
    +static __inline __m128 __DEFAULT_FN_ATTRS
    +_mm256_cvtpd_ps(__m256d __a)
    +{
    +  return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
    +}
    +
    +/// Converts a vector of [8 x float] into a vector of [8 x i32].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float].
    +/// \returns A 256-bit integer vector containing the converted values.
    +static __inline __m256i __DEFAULT_FN_ATTRS
    +_mm256_cvtps_epi32(__m256 __a)
    +{
    +  return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
    +}
    +
    +/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
    +///    x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \returns A 256-bit vector of [4 x double] containing the converted values.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_cvtps_pd(__m128 __a)
    +{
    +  return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
    +}
    +
    +/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
    +///    x i32], truncating the result by rounding towards zero when it is
    +///    inexact.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [4 x double].
    +/// \returns A 128-bit integer vector containing the converted values.
    +static __inline __m128i __DEFAULT_FN_ATTRS
    +_mm256_cvttpd_epi32(__m256d __a)
    +{
    +  return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
    +}
    +
    +/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
    +///    x i32]. When a conversion is inexact, the value returned is rounded
    +///    according to the rounding control bits in the MXCSR register.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [4 x double].
    +/// \returns A 128-bit integer vector containing the converted values.
    +static __inline __m128i __DEFAULT_FN_ATTRS
    +_mm256_cvtpd_epi32(__m256d __a)
    +{
    +  return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
    +}
    +
    +/// Converts a vector of [8 x float] into a vector of [8 x i32],
    +///    truncating the result by rounding towards zero when it is inexact.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float].
    +/// \returns A 256-bit integer vector containing the converted values.
    +static __inline __m256i __DEFAULT_FN_ATTRS
    +_mm256_cvttps_epi32(__m256 __a)
    +{
    +  return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
    +}
    +
    +/// Returns the first element of the input vector of [4 x double].
    +///
    +/// \headerfile <avxintrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a specific
    +///    instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [4 x double].
    +/// \returns A 64 bit double containing the first element of the input vector.
    +static __inline double __DEFAULT_FN_ATTRS
    +_mm256_cvtsd_f64(__m256d __a)
    +{
    + return __a[0];
    +}
    +
    +/// Returns the first element of the input vector of [8 x i32].
    +///
    +/// \headerfile <avxintrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a specific
    +///    instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x i32].
    +/// \returns A 32 bit integer containing the first element of the input vector.
    +static __inline int __DEFAULT_FN_ATTRS
    +_mm256_cvtsi256_si32(__m256i __a)
    +{
    + __v8si __b = (__v8si)__a;
    + return __b[0];
    +}
    +
    +/// Returns the first element of the input vector of [8 x float].
    +///
    +/// \headerfile <avxintrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a specific
    +///    instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float].
    +/// \returns A 32 bit float containing the first element of the input vector.
    +static __inline float __DEFAULT_FN_ATTRS
    +_mm256_cvtss_f32(__m256 __a)
    +{
    + return __a[0];
    +}
    +
    +/* Vector replicate */
    +/// Moves and duplicates odd-indexed values from a 256-bit vector of
    +///    [8 x float] to float values in a 256-bit vector of [8 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float]. \n
    +///    Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
    +///    the return value. \n
    +///    Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
    +///    the return value. \n
    +///    Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
    +///    return value. \n
    +///    Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
    +///    return value.
    +/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
    +///    values.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_movehdup_ps(__m256 __a)
    +{
    +#ifdef __GNUC__
    +  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__a);
    +#else
    +  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
    +#endif
    +}
    +
    +/// Moves and duplicates even-indexed values from a 256-bit vector of
    +///    [8 x float] to float values in a 256-bit vector of [8 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float]. \n
    +///    Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
    +///    the return value. \n
    +///    Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
    +///    the return value. \n
    +///    Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
    +///    return value. \n
    +///    Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
    +///    return value.
    +/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
    +///    values.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_moveldup_ps(__m256 __a)
    +{
    +#ifdef __GNUC__
    +  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__a);
    +#else
    +  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
    +#endif
    +}
    +
    +/// Moves and duplicates double-precision floating point values from a
    +///    256-bit vector of [4 x double] to double-precision values in a 256-bit
    +///    vector of [4 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [4 x double]. \n
    +///    Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
    +///    return value. \n
    +///    Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
    +///    the return value.
    +/// \returns A 256-bit vector of [4 x double] containing the moved and
    +///    duplicated values.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_movedup_pd(__m256d __a)
    +{
    +#ifdef __GNUC__
    +  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__a, (__v4df)__a);
    +#else
    +  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
    +#endif
    +}
    +
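    /* Illustrative sketch (not part of the header itself): summing adjacent float
     * pairs with the duplicate intrinsics above. Assumes <immintrin.h> is included
     * and that _mm256_add_ps is available from earlier in this header.
     */
    static inline __m256 pair_sums(__m256 x)
    {
      __m256 odd = _mm256_movehdup_ps(x); /* {x1,x1,x3,x3,x5,x5,x7,x7} */
      return _mm256_add_ps(x, odd);       /* even lanes hold x0+x1, x2+x3, ... */
    }
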
    +/* Unpack and Interleave */
    +/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
    +///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit floating-point vector of [4 x double]. \n
    +///    Bits [127:64] are written to bits [63:0] of the return value. \n
    +///    Bits [255:192] are written to bits [191:128] of the return value. \n
    +/// \param __b
    +///    A 256-bit floating-point vector of [4 x double]. \n
    +///    Bits [127:64] are written to bits [127:64] of the return value. \n
    +///    Bits [255:192] are written to bits [255:192] of the return value. \n
    +/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_unpackhi_pd(__m256d __a, __m256d __b)
    +{
    +#ifdef __GNUC__
    +  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__a, (__v4df)__b);
    +#else
    +  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
    +#endif
    +}
    +
    +/// Unpacks the even-indexed vector elements from two 256-bit vectors of
    +///    [4 x double] and interleaves them into a 256-bit vector of [4 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit floating-point vector of [4 x double]. \n
    +///    Bits [63:0] are written to bits [63:0] of the return value. \n
    +///    Bits [191:128] are written to bits [191:128] of the return value.
    +/// \param __b
    +///    A 256-bit floating-point vector of [4 x double]. \n
    +///    Bits [63:0] are written to bits [127:64] of the return value. \n
    +///    Bits [191:128] are written to bits [255:192] of the return value. \n
    +/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_unpacklo_pd(__m256d __a, __m256d __b)
    +{
    +#ifdef __GNUC__
    +  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__a, (__v4df)__b);
    +#else
    +  return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
    +#endif
    +}
    +
    +/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
    +///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
    +///    vector of [8 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float]. \n
    +///    Bits [95:64] are written to bits [31:0] of the return value. \n
    +///    Bits [127:96] are written to bits [95:64] of the return value. \n
    +///    Bits [223:192] are written to bits [159:128] of the return value. \n
    +///    Bits [255:224] are written to bits [223:192] of the return value.
    +/// \param __b
    +///    A 256-bit vector of [8 x float]. \n
    +///    Bits [95:64] are written to bits [63:32] of the return value. \n
    +///    Bits [127:96] are written to bits [127:96] of the return value. \n
    +///    Bits [223:192] are written to bits [191:160] of the return value. \n
    +///    Bits [255:224] are written to bits [255:224] of the return value.
    +/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_unpackhi_ps(__m256 __a, __m256 __b)
    +{
    +#ifdef __GNUC__
    +  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__a, (__v8sf)__b);
    +#else
    +  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
    +#endif
    +}
    +
    +/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
    +///    two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
    +///    vector of [8 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float]. \n
    +///    Bits [31:0] are written to bits [31:0] of the return value. \n
    +///    Bits [63:32] are written to bits [95:64] of the return value. \n
    +///    Bits [159:128] are written to bits [159:128] of the return value. \n
    +///    Bits [191:160] are written to bits [223:192] of the return value.
    +/// \param __b
    +///    A 256-bit vector of [8 x float]. \n
    +///    Bits [31:0] are written to bits [63:32] of the return value. \n
    +///    Bits [63:32] are written to bits [127:96] of the return value. \n
    +///    Bits [159:128] are written to bits [191:160] of the return value. \n
    +///    Bits [191:160] are written to bits [255:224] of the return value.
    +/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_unpacklo_ps(__m256 __a, __m256 __b)
    +{
    +#ifdef __GNUC__
    +  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__a, (__v8sf)__b);
    +#else
    +  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
    +#endif
    +}
    +
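    /* Illustrative sketch of the in-lane element order produced by the unpack
     * intrinsics above (assumes _mm256_set_pd from further down in this header).
     */
    static inline void unpack_demo(__m256d *lo, __m256d *hi)
    {
      __m256d a = _mm256_set_pd(3.0, 2.0, 1.0, 0.0); /* elements {0,1,2,3} */
      __m256d b = _mm256_set_pd(7.0, 6.0, 5.0, 4.0); /* elements {4,5,6,7} */

      *lo = _mm256_unpacklo_pd(a, b);                /* {0,4,2,6} */
      *hi = _mm256_unpackhi_pd(a, b);                /* {1,5,3,7} */
    }
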
    +/* Bit Test */
    +/// Given two 128-bit floating-point vectors of [2 x double], perform an
    +///    element-by-element comparison of the double-precision element in the
    +///    first source vector and the corresponding element in the second source
    +///    vector.
    +///
    +///    The EFLAGS register is updated as follows: \n
    +///    If there is at least one pair of double-precision elements where the
    +///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
    +///    ZF flag is set to 1. \n
    +///    If there is at least one pair of double-precision elements where the
    +///    sign-bit of the first element is 0 and the sign-bit of the second element
    +///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
    +///    This intrinsic returns the value of the ZF flag.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double].
    +/// \param __b
    +///    A 128-bit vector of [2 x double].
    +/// \returns the ZF flag in the EFLAGS register.
    +static __inline int __DEFAULT_FN_ATTRS128
    +_mm_testz_pd(__m128d __a, __m128d __b)
    +{
    +  return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Given two 128-bit floating-point vectors of [2 x double], perform an
    +///    element-by-element comparison of the double-precision element in the
    +///    first source vector and the corresponding element in the second source
    +///    vector.
    +///
    +///    The EFLAGS register is updated as follows: \n
    +///    If there is at least one pair of double-precision elements where the
    +///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
    +///    ZF flag is set to 1. \n
    +///    If there is at least one pair of double-precision elements where the
    +///    sign-bit of the first element is 0 and the sign-bit of the second element
    +///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
    +///    This intrinsic returns the value of the CF flag.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double].
    +/// \param __b
    +///    A 128-bit vector of [2 x double].
    +/// \returns the CF flag in the EFLAGS register.
    +static __inline int __DEFAULT_FN_ATTRS128
    +_mm_testc_pd(__m128d __a, __m128d __b)
    +{
    +  return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Given two 128-bit floating-point vectors of [2 x double], perform an
    +///    element-by-element comparison of the double-precision element in the
    +///    first source vector and the corresponding element in the second source
    +///    vector.
    +///
    +///    The EFLAGS register is updated as follows: \n
    +///    If there is at least one pair of double-precision elements where the
    +///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
    +///    ZF flag is set to 1. \n
    +///    If there is at least one pair of double-precision elements where the
    +///    sign-bit of the first element is 0 and the sign-bit of the second element
    +///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
    +///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
    +///    otherwise it returns 0.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double].
    +/// \param __b
    +///    A 128-bit vector of [2 x double].
    +/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
    +static __inline int __DEFAULT_FN_ATTRS128
    +_mm_testnzc_pd(__m128d __a, __m128d __b)
    +{
    +  return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Given two 128-bit floating-point vectors of [4 x float], perform an
    +///    element-by-element comparison of the single-precision element in the
    +///    first source vector and the corresponding element in the second source
    +///    vector.
    +///
    +///    The EFLAGS register is updated as follows: \n
    +///    If there is at least one pair of single-precision elements where the
    +///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
    +///    ZF flag is set to 1. \n
    +///    If there is at least one pair of single-precision elements where the
    +///    sign-bit of the first element is 0 and the sign-bit of the second element
    +///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
    +///    This intrinsic returns the value of the ZF flag.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \param __b
    +///    A 128-bit vector of [4 x float].
    +/// \returns the ZF flag.
    +static __inline int __DEFAULT_FN_ATTRS128
    +_mm_testz_ps(__m128 __a, __m128 __b)
    +{
    +  return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Given two 128-bit floating-point vectors of [4 x float], perform an
    +///    element-by-element comparison of the single-precision element in the
    +///    first source vector and the corresponding element in the second source
    +///    vector.
    +///
    +///    The EFLAGS register is updated as follows: \n
    +///    If there is at least one pair of single-precision elements where the
    +///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
    +///    ZF flag is set to 1. \n
    +///    If there is at least one pair of single-precision elements where the
    +///    sign-bit of the first element is 0 and the sign-bit of the second element
    +///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
    +///    This intrinsic returns the value of the CF flag.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \param __b
    +///    A 128-bit vector of [4 x float].
    +/// \returns the CF flag.
    +static __inline int __DEFAULT_FN_ATTRS128
    +_mm_testc_ps(__m128 __a, __m128 __b)
    +{
    +  return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Given two 128-bit floating-point vectors of [4 x float], perform an
    +///    element-by-element comparison of the single-precision element in the
    +///    first source vector and the corresponding element in the second source
    +///    vector.
    +///
    +///    The EFLAGS register is updated as follows: \n
    +///    If there is at least one pair of single-precision elements where the
    +///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
    +///    ZF flag is set to 1. \n
    +///    If there is at least one pair of single-precision elements where the
    +///    sign-bit of the first element is 0 and the sign-bit of the second element
    +///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
    +///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
    +///    otherwise it returns 0.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \param __b
    +///    A 128-bit vector of [4 x float].
    +/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
    +static __inline int __DEFAULT_FN_ATTRS128
    +_mm_testnzc_ps(__m128 __a, __m128 __b)
    +{
    +  return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Given two 256-bit floating-point vectors of [4 x double], perform an
    +///    element-by-element comparison of the double-precision elements in the
    +///    first source vector and the corresponding elements in the second source
    +///    vector.
    +///
    +///    The EFLAGS register is updated as follows: \n
    +///    If there is at least one pair of double-precision elements where the
    +///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
    +///    ZF flag is set to 1. \n
    +///    If there is at least one pair of double-precision elements where the
    +///    sign-bit of the first element is 0 and the sign-bit of the second element
    +///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
    +///    This intrinsic returns the value of the ZF flag.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [4 x double].
    +/// \param __b
    +///    A 256-bit vector of [4 x double].
    +/// \returns the ZF flag.
    +static __inline int __DEFAULT_FN_ATTRS
    +_mm256_testz_pd(__m256d __a, __m256d __b)
    +{
    +  return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
    +}
    +
    +/// Given two 256-bit floating-point vectors of [4 x double], perform an
    +///    element-by-element comparison of the double-precision elements in the
    +///    first source vector and the corresponding elements in the second source
    +///    vector.
    +///
    +///    The EFLAGS register is updated as follows: \n
    +///    If there is at least one pair of double-precision elements where the
    +///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
    +///    ZF flag is set to 1. \n
    +///    If there is at least one pair of double-precision elements where the
    +///    sign-bit of the first element is 0 and the sign-bit of the second element
    +///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
    +///    This intrinsic returns the value of the CF flag.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [4 x double].
    +/// \param __b
    +///    A 256-bit vector of [4 x double].
    +/// \returns the CF flag.
    +static __inline int __DEFAULT_FN_ATTRS
    +_mm256_testc_pd(__m256d __a, __m256d __b)
    +{
    +  return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
    +}
    +
    +/// Given two 256-bit floating-point vectors of [4 x double], perform an
    +///    element-by-element comparison of the double-precision elements in the
    +///    first source vector and the corresponding elements in the second source
    +///    vector.
    +///
    +///    The EFLAGS register is updated as follows: \n
    +///    If there is at least one pair of double-precision elements where the
    +///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
    +///    ZF flag is set to 1. \n
    +///    If there is at least one pair of double-precision elements where the
    +///    sign-bit of the first element is 0 and the sign-bit of the second element
    +///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
    +///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
    +///    otherwise it returns 0.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [4 x double].
    +/// \param __b
    +///    A 256-bit vector of [4 x double].
    +/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
    +static __inline int __DEFAULT_FN_ATTRS
    +_mm256_testnzc_pd(__m256d __a, __m256d __b)
    +{
    +  return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
    +}
    +
    +/// Given two 256-bit floating-point vectors of [8 x float], perform an
    +///    element-by-element comparison of the single-precision element in the
    +///    first source vector and the corresponding element in the second source
    +///    vector.
    +///
    +///    The EFLAGS register is updated as follows: \n
    +///    If there is at least one pair of single-precision elements where the
    +///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
    +///    ZF flag is set to 1. \n
    +///    If there is at least one pair of single-precision elements where the
    +///    sign-bit of the first element is 0 and the sign-bit of the second element
    +///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
    +///    This intrinsic returns the value of the ZF flag.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float].
    +/// \param __b
    +///    A 256-bit vector of [8 x float].
    +/// \returns the ZF flag.
    +static __inline int __DEFAULT_FN_ATTRS
    +_mm256_testz_ps(__m256 __a, __m256 __b)
    +{
    +  return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
    +}
    +
    +/// Given two 256-bit floating-point vectors of [8 x float], perform an
    +///    element-by-element comparison of the single-precision element in the
    +///    first source vector and the corresponding element in the second source
    +///    vector.
    +///
    +///    The EFLAGS register is updated as follows: \n
    +///    If there is at least one pair of single-precision elements where the
    +///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
    +///    ZF flag is set to 1. \n
    +///    If there is at least one pair of single-precision elements where the
    +///    sign-bit of the first element is 0 and the sign-bit of the second element
    +///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
    +///    This intrinsic returns the value of the CF flag.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float].
    +/// \param __b
    +///    A 256-bit vector of [8 x float].
    +/// \returns the CF flag.
    +static __inline int __DEFAULT_FN_ATTRS
    +_mm256_testc_ps(__m256 __a, __m256 __b)
    +{
    +  return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
    +}
    +
    +/// Given two 256-bit floating-point vectors of [8 x float], perform an
    +///    element-by-element comparison of the single-precision elements in the
    +///    first source vector and the corresponding elements in the second source
    +///    vector.
    +///
    +///    The EFLAGS register is updated as follows: \n
    +///    If there is at least one pair of single-precision elements where the
    +///    sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
    +///    ZF flag is set to 1. \n
    +///    If there is at least one pair of single-precision elements where the
    +///    sign-bit of the first element is 0 and the sign-bit of the second element
    +///    is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
    +///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
    +///    otherwise it returns 0.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float].
    +/// \param __b
    +///    A 256-bit vector of [8 x float].
    +/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
    +static __inline int __DEFAULT_FN_ATTRS
    +_mm256_testnzc_ps(__m256 __a, __m256 __b)
    +{
    +  return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
    +}
    +
    +/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
    +///    of the two source vectors.
    +///
    +///    The EFLAGS register is updated as follows: \n
    +///    If there is at least one pair of bits where both bits are 1, the ZF flag
    +///    is set to 0. Otherwise the ZF flag is set to 1. \n
    +///    If there is at least one pair of bits where the bit from the first source
    +///    vector is 0 and the bit from the second source vector is 1, the CF flag
    +///    is set to 0. Otherwise the CF flag is set to 1. \n
    +///    This intrinsic returns the value of the ZF flag.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit integer vector.
    +/// \param __b
    +///    A 256-bit integer vector.
    +/// \returns the ZF flag.
    +static __inline int __DEFAULT_FN_ATTRS
    +_mm256_testz_si256(__m256i __a, __m256i __b)
    +{
    +  return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
    +}
    +
    +/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
    +///    of the two source vectors.
    +///
    +///    The EFLAGS register is updated as follows: \n
    +///    If there is at least one pair of bits where both bits are 1, the ZF flag
    +///    is set to 0. Otherwise the ZF flag is set to 1. \n
    +///    If there is at least one pair of bits where the bit from the first source
    +///    vector is 0 and the bit from the second source vector is 1, the CF flag
    +///    is set to 0. Otherwise the CF flag is set to 1. \n
    +///    This intrinsic returns the value of the CF flag.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit integer vector.
    +/// \param __b
    +///    A 256-bit integer vector.
    +/// \returns the CF flag.
    +static __inline int __DEFAULT_FN_ATTRS
    +_mm256_testc_si256(__m256i __a, __m256i __b)
    +{
    +  return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
    +}
    +
    +/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
    +///    of the two source vectors.
    +///
    +///    The EFLAGS register is updated as follows: \n
    +///    If there is at least one pair of bits where both bits are 1, the ZF flag
    +///    is set to 0. Otherwise the ZF flag is set to 1. \n
    +///    If there is at least one pair of bits where the bit from the first source
    +///    vector is 0 and the bit from the second source vector is 1, the CF flag
    +///    is set to 0. Otherwise the CF flag is set to 1. \n
    +///    This intrinsic returns 1 if both the ZF and CF flags are set to 0,
    +///    otherwise it returns 0.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit integer vector.
    +/// \param __b
    +///    A 256-bit integer vector.
    +/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
    +static __inline int __DEFAULT_FN_ATTRS
    +_mm256_testnzc_si256(__m256i __a, __m256i __b)
    +{
    +  return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
    +}
    +
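    /* Illustrative sketch of common uses of the VPTEST helpers above; the helper
     * names and the caller-supplied all-ones vector are hypothetical.
     */
    static inline int is_all_zero(__m256i v)
    {
      return _mm256_testz_si256(v, v);        /* ZF: 1 iff every bit of v is 0 */
    }

    static inline int is_all_ones(__m256i v, __m256i all_ones)
    {
      return _mm256_testc_si256(v, all_ones); /* CF: 1 iff v covers every set bit */
    }
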
    +/* Vector extract sign mask */
    +/// Extracts the sign bits of double-precision floating point elements
    +///    in a 256-bit vector of [4 x double] and writes them to the lower order
    +///    bits of the return value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [4 x double] containing the double-precision
    +///    floating point values with sign bits to be extracted.
    +/// \returns The sign bits from the operand, written to bits [3:0].
    +static __inline int __DEFAULT_FN_ATTRS
    +_mm256_movemask_pd(__m256d __a)
    +{
    +  return __builtin_ia32_movmskpd256((__v4df)__a);
    +}
    +
    +/// Extracts the sign bits of single-precision floating point elements
    +///    in a 256-bit vector of [8 x float] and writes them to the lower order
    +///    bits of the return value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
    +///
    +/// \param __a
    +///    A 256-bit vector of [8 x float] containing the single-precision floating
    +///    point values with sign bits to be extracted.
    +/// \returns The sign bits from the operand, written to bits [7:0].
    +static __inline int __DEFAULT_FN_ATTRS
    +_mm256_movemask_ps(__m256 __a)
    +{
    +  return __builtin_ia32_movmskps256((__v8sf)__a);
    +}
    +
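    /* Illustrative sketch: counting lanes with the sign bit set via the sign-mask
     * extract above, using the compiler's __builtin_popcount on the 8-bit mask.
     */
    static inline int count_sign_set(__m256 v)
    {
      return __builtin_popcount(_mm256_movemask_ps(v) & 0xff);
    }
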
    +/* Vector __zero */
    +/// Zeroes the contents of all XMM or YMM registers.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
    +#ifdef __GNUC__
    +static __inline void __DEFAULT_FN_ATTRS
    +#else
    +static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx")))
    +#endif
    +_mm256_zeroall(void)
    +{
    +  __builtin_ia32_vzeroall();
    +}
    +
    +/// Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
    +//
    +#ifdef __GNUC__
    +static __inline void __DEFAULT_FN_ATTRS
    +#else
    +static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx")))
    +#endif
    +_mm256_zeroupper(void)
    +{
    +  __builtin_ia32_vzeroupper();
    +}
    +
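    /* Illustrative sketch: issuing VZEROUPPER at the end of a 256-bit code region
     * before calling legacy-SSE code, to avoid AVX/SSE transition penalties where
     * they apply. Assumes the unaligned load/store helpers defined later in this
     * header.
     */
    static inline void copy32_then_exit_avx(float *dst, const float *src)
    {
      _mm256_storeu_ps(dst, _mm256_loadu_ps(src)); /* 256-bit work */
      _mm256_zeroupper();                          /* before SSE-only code runs */
    }
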
    +/* Vector load with broadcast */
    +/// Loads a scalar single-precision floating point value from the
    +///    specified address pointed to by \a __a and broadcasts it to the elements
    +///    of a [4 x float] vector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
    +///
    +/// \param __a
    +///    The single-precision floating point value to be broadcast.
    +/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
    +///    equal to the broadcast value.
    +static __inline __m128 __DEFAULT_FN_ATTRS128
    +_mm_broadcast_ss(float const *__a)
    +{
    +  float __f = *__a;
    +  return __extension__ (__m128)(__v4sf){ __f, __f, __f, __f };
    +}
    +
    +/// Loads a scalar double-precision floating point value from the
    +///    specified address pointed to by \a __a and broadcasts it to the elements
    +///    of a [4 x double] vector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
    +///
    +/// \param __a
    +///    The double-precision floating point value to be broadcast.
    +/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
    +///    equal to the broadcast value.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_broadcast_sd(double const *__a)
    +{
    +  double __d = *__a;
    +  return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
    +}
    +
    +/// Loads a scalar single-precision floating point value from the
    +///    specified address pointed to by \a __a and broadcasts it to the elements
    +///    of a [8 x float] vector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
    +///
    +/// \param __a
    +///    The single-precision floating point value to be broadcast.
    +/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
    +///    equal to the broadcast value.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_broadcast_ss(float const *__a)
    +{
    +  float __f = *__a;
    +  return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
    +}
    +
    +/// Loads the data from a 128-bit vector of [2 x double] from the
    +///    specified address pointed to by \a __a and broadcasts it to 128-bit
    +///    elements in a 256-bit vector of [4 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
    +///
    +/// \param __a
    +///    The 128-bit vector of [2 x double] to be broadcast.
    +/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
    +///    equal to the broadcast value.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_broadcast_pd(__m128d const *__a)
    +{
    +#ifdef __GNUC__
    +  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__a);
    +#else
    +  __m128d __b = _mm_loadu_pd((const double *)__a);
    +  return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
    +                                          0, 1, 0, 1);
    +#endif
    +}
    +
    +/// Loads the data from a 128-bit vector of [4 x float] from the
    +///    specified address pointed to by \a __a and broadcasts it to 128-bit
    +///    elements in a 256-bit vector of [8 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
    +///
    +/// \param __a
    +///    The 128-bit vector of [4 x float] to be broadcast.
    +/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
    +///    equal to the broadcast value.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_broadcast_ps(__m128 const *__a)
    +{
    +#ifdef __GNUC__
    +  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__a);
    +#else
    +  __m128 __b = _mm_loadu_ps((const float *)__a);
    +  return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
    +                                         0, 1, 2, 3, 0, 1, 2, 3);
    +#endif
    +}
    +
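    /* Illustrative sketch: splatting a scalar coefficient with the broadcast loads
     * above. Assumes _mm256_mul_ps is available from earlier in this header.
     */
    static inline __m256 scale8(__m256 v, const float *k)
    {
      return _mm256_mul_ps(v, _mm256_broadcast_ss(k));
    }
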
    +/* SIMD load ops */
    +/// Loads 4 double-precision floating point values from a 32-byte aligned
    +///    memory location pointed to by \a __p into a vector of [4 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
    +///
    +/// \param __p
    +///    A 32-byte aligned pointer to a memory location containing
    +///    double-precision floating point values.
    +/// \returns A 256-bit vector of [4 x double] containing the moved values.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_load_pd(double const *__p)
    +{
    +  return *(__m256d *)__p;
    +}
    +
    +/// Loads 8 single-precision floating point values from a 32-byte aligned
    +///    memory location pointed to by \a __p into a vector of [8 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
    +///
    +/// \param __p
    +///    A 32-byte aligned pointer to a memory location containing float values.
    +/// \returns A 256-bit vector of [8 x float] containing the moved values.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_load_ps(float const *__p)
    +{
    +  return *(__m256 *)__p;
    +}
    +
    +/// Loads 4 double-precision floating point values from an unaligned
    +///    memory location pointed to by \a __p into a vector of [4 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a memory location containing double-precision floating
    +///    point values.
    +/// \returns A 256-bit vector of [4 x double] containing the moved values.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_loadu_pd(double const *__p)
    +{
    +  struct __loadu_pd {
    +    __m256d __v;
    +  } __attribute__((__packed__, __may_alias__));
    +  return ((struct __loadu_pd*)__p)->__v;
    +}
    +
    +/// Loads 8 single-precision floating point values from an unaligned
    +///    memory location pointed to by \a __p into a vector of [8 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a memory location containing single-precision floating
    +///    point values.
    +/// \returns A 256-bit vector of [8 x float] containing the moved values.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_loadu_ps(float const *__p)
    +{
    +  struct __loadu_ps {
    +    __m256 __v;
    +  } __attribute__((__packed__, __may_alias__));
    +  return ((struct __loadu_ps*)__p)->__v;
    +}
    +
    +/// Loads 256 bits of integer data from a 32-byte aligned memory
    +///    location pointed to by \a __p into elements of a 256-bit integer vector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
    +///
    +/// \param __p
    +///    A 32-byte aligned pointer to a 256-bit integer vector containing integer
    +///    values.
    +/// \returns A 256-bit integer vector containing the moved values.
    +static __inline __m256i __DEFAULT_FN_ATTRS
    +_mm256_load_si256(__m256i const *__p)
    +{
    +  return *__p;
    +}
    +
    +/// Loads 256 bits of integer data from an unaligned memory location
    +///    pointed to by \a __p into a 256-bit integer vector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a 256-bit integer vector containing integer values.
    +/// \returns A 256-bit integer vector containing the moved values.
    +static __inline __m256i __DEFAULT_FN_ATTRS
    +_mm256_loadu_si256(__m256i const *__p)
    +{
    +  struct __loadu_si256 {
    +    __m256i __v;
    +  } __attribute__((__packed__, __may_alias__));
    +  return ((struct __loadu_si256*)__p)->__v;
    +}
    +
    +/// Loads 256 bits of integer data from an unaligned memory location
    +///    pointed to by \a __p into a 256-bit integer vector. This intrinsic may
    +///    perform better than \c _mm256_loadu_si256 when the data crosses a cache
    +///    line boundary.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a 256-bit integer vector containing integer values.
    +/// \returns A 256-bit integer vector containing the moved values.
    +static __inline __m256i __DEFAULT_FN_ATTRS
    +_mm256_lddqu_si256(__m256i const *__p)
    +{
    +  return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
    +}
    +
    +/* SIMD store ops */
    +/// Stores double-precision floating point values from a 256-bit vector
    +///    of [4 x double] to a 32-byte aligned memory location pointed to by
    +///    \a __p.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
    +///
    +/// \param __p
    +///    A 32-byte aligned pointer to a memory location that will receive the
    +///    double-precision floating point values.
    +/// \param __a
    +///    A 256-bit vector of [4 x double] containing the values to be moved.
    +static __inline void __DEFAULT_FN_ATTRS
    +_mm256_store_pd(double *__p, __m256d __a)
    +{
    +  *(__m256d *)__p = __a;
    +}
    +
    +/// Stores single-precision floating point values from a 256-bit vector
    +///    of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
    +///
    +/// \param __p
    +///    A 32-byte aligned pointer to a memory location that will receive the
    +///    float values.
    +/// \param __a
    +///    A 256-bit vector of [8 x float] containing the values to be moved.
    +static __inline void __DEFAULT_FN_ATTRS
    +_mm256_store_ps(float *__p, __m256 __a)
    +{
    +  *(__m256 *)__p = __a;
    +}
    +
    +/// Stores double-precision floating point values from a 256-bit vector
    +///    of [4 x double] to an unaligned memory location pointed to by \a __p.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a memory location that will receive the double-precision
    +///    floating point values.
    +/// \param __a
    +///    A 256-bit vector of [4 x double] containing the values to be moved.
    +static __inline void __DEFAULT_FN_ATTRS
    +_mm256_storeu_pd(double *__p, __m256d __a)
    +{
    +  struct __storeu_pd {
    +    __m256d __v;
    +  } __attribute__((__packed__, __may_alias__));
    +  ((struct __storeu_pd*)__p)->__v = __a;
    +}
    +
    +/// Stores single-precision floating point values from a 256-bit vector
    +///    of [8 x float] to an unaligned memory location pointed to by \a __p.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a memory location that will receive the float values.
    +/// \param __a
    +///    A 256-bit vector of [8 x float] containing the values to be moved.
    +static __inline void __DEFAULT_FN_ATTRS
    +_mm256_storeu_ps(float *__p, __m256 __a)
    +{
    +  struct __storeu_ps {
    +    __m256 __v;
    +  } __attribute__((__packed__, __may_alias__));
    +  ((struct __storeu_ps*)__p)->__v = __a;
    +}
    +
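    /* Illustrative sketch: an unaligned, 8-floats-at-a-time element-wise add built
     * on the load/store intrinsics above. Assumes <immintrin.h> and <stddef.h> are
     * included and that _mm256_add_ps comes from earlier in this header; the
     * scalar remainder loop is omitted.
     */
    static inline void vadd(float *dst, const float *a, const float *b, size_t n)
    {
      size_t i;

      for (i = 0; i + 8 <= n; i += 8) {
        __m256 va = _mm256_loadu_ps(a + i);
        __m256 vb = _mm256_loadu_ps(b + i);

        _mm256_storeu_ps(dst + i, _mm256_add_ps(va, vb));
      }
    }
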
    +/// Stores integer values from a 256-bit integer vector to a 32-byte
    +///    aligned memory location pointed to by \a __p.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
    +///
    +/// \param __p
    +///    A 32-byte aligned pointer to a memory location that will receive the
    +///    integer values.
    +/// \param __a
    +///    A 256-bit integer vector containing the values to be moved.
    +static __inline void __DEFAULT_FN_ATTRS
    +_mm256_store_si256(__m256i *__p, __m256i __a)
    +{
    +  *__p = __a;
    +}
    +
    +/// Stores integer values from a 256-bit integer vector to an unaligned
    +///    memory location pointed to by \a __p.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a memory location that will receive the integer values.
    +/// \param __a
    +///    A 256-bit integer vector containing the values to be moved.
    +static __inline void __DEFAULT_FN_ATTRS
    +_mm256_storeu_si256(__m256i *__p, __m256i __a)
    +{
    +  struct __storeu_si256 {
    +    __m256i __v;
    +  } __attribute__((__packed__, __may_alias__));
    +  ((struct __storeu_si256*)__p)->__v = __a;
    +}
    +
    +/* Conditional load ops */
    +/// Conditionally loads double-precision floating point elements from a
    +///    memory location pointed to by \a __p into a 128-bit vector of
    +///    [2 x double], depending on the mask bits associated with each data
    +///    element.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a memory location that contains the double-precision
    +///    floating point values.
    +/// \param __m
    +///    A 128-bit integer vector containing the mask. The most significant bit of
    +///    each data element represents the mask bits. If a mask bit is zero, the
    +///    corresponding value in the memory location is not loaded and the
    +///    corresponding field in the return value is set to zero.
    +/// \returns A 128-bit vector of [2 x double] containing the loaded values.
    +static __inline __m128d __DEFAULT_FN_ATTRS128
    +_mm_maskload_pd(double const *__p, __m128i __m)
    +{
    +  return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
    +}
    +
    +/// Conditionally loads double-precision floating point elements from a
    +///    memory location pointed to by \a __p into a 256-bit vector of
    +///    [4 x double], depending on the mask bits associated with each data
    +///    element.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a memory location that contains the double-precision
    +///    floating point values.
    +/// \param __m
    +///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
    +///    significant bit of each quadword element represents the mask bits. If a
    +///    mask bit is zero, the corresponding value in the memory location is not
    +///    loaded and the corresponding field in the return value is set to zero.
    +/// \returns A 256-bit vector of [4 x double] containing the loaded values.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_maskload_pd(double const *__p, __m256i __m)
    +{
    +  return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
    +                                               (__v4di)__m);
    +}
    +
    +/// Conditionally loads single-precision floating point elements from a
    +///    memory location pointed to by \a __p into a 128-bit vector of
    +///    [4 x float], depending on the mask bits associated with each data
    +///    element.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a memory location that contains the single-precision
    +///    floating point values.
    +/// \param __m
    +///    A 128-bit integer vector containing the mask. The most significant bit of
    +///    each data element represents the mask bits. If a mask bit is zero, the
    +///    corresponding value in the memory location is not loaded and the
    +///    corresponding field in the return value is set to zero.
    +/// \returns A 128-bit vector of [4 x float] containing the loaded values.
    +static __inline __m128 __DEFAULT_FN_ATTRS128
    +_mm_maskload_ps(float const *__p, __m128i __m)
    +{
    +  return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
    +}
    +
    +/// Conditionally loads single-precision floating point elements from a
    +///    memory location pointed to by \a __p into a 256-bit vector of
    +///    [8 x float], depending on the mask bits associated with each data
    +///    element.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a memory location that contains the single-precision
    +///    floating point values.
    +/// \param __m
    +///    A 256-bit integer vector of [8 x dword] containing the mask. The most
    +///    significant bit of each dword element represents the mask bits. If a mask
    +///    bit is zero, the corresponding value in the memory location is not loaded
    +///    and the corresponding field in the return value is set to zero.
    +/// \returns A 256-bit vector of [8 x float] containing the loaded values.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_maskload_ps(float const *__p, __m256i __m)
    +{
    +  return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
    +}
    +
    +/* Conditional store ops */
    +/// Moves single-precision floating point values from a 256-bit vector
    +///    of [8 x float] to a memory location pointed to by \a __p, according to
    +///    the specified mask.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a memory location that will receive the float values.
    +/// \param __m
    +///    A 256-bit integer vector of [8 x dword] containing the mask. The most
    +///    significant bit of each dword element in the mask vector represents the
    +///    mask bits. If a mask bit is zero, the corresponding value from vector
    +///    \a __a is not stored and the corresponding field in the memory location
    +///    pointed to by \a __p is not changed.
    +/// \param __a
    +///    A 256-bit vector of [8 x float] containing the values to be stored.
    +static __inline void __DEFAULT_FN_ATTRS
    +_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
    +{
    +  __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
    +}
    +
    +/// Moves double-precision values from a 128-bit vector of [2 x double]
    +///    to a memory location pointed to by \a __p, according to the specified
    +///    mask.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a memory location that will receive the float values.
    +/// \param __m
    +///    A 128-bit integer vector containing the mask. The most significant bit of
    +///    each field in the mask vector represents the mask bits. If a mask bit is
    +///    zero, the corresponding value from vector \a __a is not stored and the
    +///    corresponding field in the memory location pointed to by \a __p is not
    +///    changed.
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing the values to be stored.
    +static __inline void __DEFAULT_FN_ATTRS128
    +_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
    +{
    +  __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
    +}
    +
    +/// Moves double-precision values from a 256-bit vector of [4 x double]
    +///    to a memory location pointed to by \a __p, according to the specified
    +///    mask.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a memory location that will receive the float values.
    +/// \param __m
    +///    A 256-bit integer vector of [4 x quadword] containing the mask. The most
    +///    significant bit of each quadword element in the mask vector represents
    +///    the mask bits. If a mask bit is zero, the corresponding value from vector
    +///    __a is not stored and the corresponding field in the memory location
    +///    pointed to by \a __p is not changed.
    +/// \param __a
    +///    A 256-bit vector of [4 x double] containing the values to be stored.
    +static __inline void __DEFAULT_FN_ATTRS
    +_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
    +{
    +  __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
    +}
    +
    +/// Moves single-precision floating point values from a 128-bit vector
    +///    of [4 x float] to a memory location pointed to by \a __p, according to
    +///    the specified mask.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a memory location that will receive the float values.
    +/// \param __m
    +///    A 128-bit integer vector containing the mask. The most significant bit of
    +///    each field in the mask vector represents the mask bits. If a mask bit is
    +///    zero, the corresponding value from vector __a is not stored and the
    +///    corresponding field in the memory location pointed to by \a __p is not
    +///    changed.
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing the values to be stored.
    +static __inline void __DEFAULT_FN_ATTRS128
    +_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
    +{
    +  __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
    +}
    +
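    /* Illustrative sketch: masking off the tail of an array with the conditional
     * load/store intrinsics above. The mask table and helper are hypothetical;
     * assumes _mm256_add_ps from earlier in this header.
     */
    static const int mask_tail[16] = {
      -1, -1, -1, -1, -1, -1, -1, -1,
       0,  0,  0,  0,  0,  0,  0,  0
    };

    static inline void add_tail(float *dst, const float *src, int rem /* 1..7 */)
    {
      __m256i m = _mm256_loadu_si256((const __m256i *)(mask_tail + 8 - rem));
      __m256  a = _mm256_maskload_ps(dst, m);
      __m256  b = _mm256_maskload_ps(src, m);

      _mm256_maskstore_ps(dst, m, _mm256_add_ps(a, b));
    }
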
    +/* Cacheability support ops */
    +/// Moves integer data from a 256-bit integer vector to a 32-byte
    +///    aligned memory location. To minimize caching, the data is flagged as
    +///    non-temporal (unlikely to be used again soon).
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
    +///
    +/// \param __a
    +///    A pointer to a 32-byte aligned memory location that will receive the
    +///    integer values.
    +/// \param __b
    +///    A 256-bit integer vector containing the values to be moved.
    +static __inline void __DEFAULT_FN_ATTRS
    +_mm256_stream_si256(__m256i *__a, __m256i __b)
    +{
    +#ifdef __GNUC__
    +  __builtin_ia32_movntdq256 ((__v4di *)__a, (__v4di)__b);
    +#else
    +  typedef __v4di __v4di_aligned __attribute__((aligned(32)));
    +  __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
    +#endif
    +}
    +
    +/// Moves double-precision values from a 256-bit vector of [4 x double]
    +///    to a 32-byte aligned memory location. To minimize caching, the data is
    +///    flagged as non-temporal (unlikely to be used again soon).
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
    +///
    +/// \param __a
    +///    A pointer to a 32-byte aligned memory location that will receive the
    +///    double-precision floating-point values.
    +/// \param __b
    +///    A 256-bit vector of [4 x double] containing the values to be moved.
    +static __inline void __DEFAULT_FN_ATTRS
    +_mm256_stream_pd(double *__a, __m256d __b)
    +{
    +#ifdef __GNUC__
    +  __builtin_ia32_movntpd256 (__a, (__v4df)__b);
    +#else
    +  typedef __v4df __v4df_aligned __attribute__((aligned(32)));
    +  __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
    +#endif
    +}
    +
    +/// Moves single-precision floating point values from a 256-bit vector
    +///    of [8 x float] to a 32-byte aligned memory location. To minimize
    +///    caching, the data is flagged as non-temporal (unlikely to be used again
    +///    soon).
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a 32-byte aligned memory location that will receive the
    +///    single-precision floating point values.
    +/// \param __a
    +///    A 256-bit vector of [8 x float] containing the values to be moved.
    +static __inline void __DEFAULT_FN_ATTRS
    +_mm256_stream_ps(float *__p, __m256 __a)
    +{
    +#ifdef __GNUC__
    +  __builtin_ia32_movntps256 (__p, (__v8sf)__a);
    +#else
    +  typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
    +  __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
    +#endif
    +}
    +
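    /* Illustrative sketch: filling a large, 32-byte aligned buffer with the
     * non-temporal stores above so the data bypasses the caches; a store fence
     * such as _mm_sfence (from xmmintrin.h) is issued before the data may be
     * observed by another agent. Assumes <immintrin.h> and <stddef.h>.
     */
    static inline void fill_nt(float *dst /* 32-byte aligned */, __m256 v, size_t n8)
    {
      size_t i;

      for (i = 0; i < n8; i++)
        _mm256_stream_ps(dst + i * 8, v);
      _mm_sfence();
    }
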
    +/* Create vectors */
    +/// Create a 256-bit vector of [4 x double] with undefined values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \returns A 256-bit vector of [4 x double] containing undefined values.
    +static __inline__ __m256d __DEFAULT_FN_ATTRS
    +_mm256_undefined_pd(void)
    +{
    +#ifdef __GNUC__
    +  __m256d __X = __X;
    +  return __X;
    +#else
    +  return (__m256d)__builtin_ia32_undef256();
    +#endif
    +}
    +
    +/// Create a 256-bit vector of [8 x float] with undefined values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \returns A 256-bit vector of [8 x float] containing undefined values.
    +static __inline__ __m256 __DEFAULT_FN_ATTRS
    +_mm256_undefined_ps(void)
    +{
    +#ifdef __GNUC__
    +  __m256 __X = __X;
    +  return __X;
    +#else
    +  return (__m256)__builtin_ia32_undef256();
    +#endif
    +}
    +
    +/// Create a 256-bit integer vector with undefined values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \returns A 256-bit integer vector containing undefined values.
    +static __inline__ __m256i __DEFAULT_FN_ATTRS
    +_mm256_undefined_si256(void)
    +{
    +#ifdef __GNUC__
    +  __m256i __X = __X;
    +  return __X;
    +#else
    +  return (__m256i)__builtin_ia32_undef256();
    +#endif
    +}
    +
    +/// Constructs a 256-bit floating-point vector of [4 x double]
    +///    initialized with the specified double-precision floating-point 
values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A double-precision floating-point value used to initialize bits 
[255:192]
    +///    of the result.
    +/// \param __b
    +///    A double-precision floating-point value used to initialize bits 
[191:128]
    +///    of the result.
    +/// \param __c
    +///    A double-precision floating-point value used to initialize bits 
[127:64]
    +///    of the result.
    +/// \param __d
    +///    A double-precision floating-point value used to initialize bits 
[63:0]
    +///    of the result.
    +/// \returns An initialized 256-bit floating-point vector of [4 x double].
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_set_pd(double __a, double __b, double __c, double __d)
    +{
    +  return __extension__ (__m256d){ __d, __c, __b, __a };
    +}
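    +
    +/* Illustrative sketch (not part of the upstream header): _mm256_set_pd()
    + * takes its arguments from the most significant element downwards, so
    + *
    + *   __m256d v = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);
    + *
    + * places 1.0 in bits [63:0] and 4.0 in bits [255:192] of the result.
    + */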
    +
    +/// Constructs a 256-bit floating-point vector of [8 x float] initialized
    +///    with the specified single-precision floating-point values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a 
specific
    +///   instruction.
    +///
    +/// \param __a
    +///    A single-precision floating-point value used to initialize bits 
[255:224]
    +///    of the result.
    +/// \param __b
    +///    A single-precision floating-point value used to initialize bits 
[223:192]
    +///    of the result.
    +/// \param __c
    +///    A single-precision floating-point value used to initialize bits 
[191:160]
    +///    of the result.
    +/// \param __d
    +///    A single-precision floating-point value used to initialize bits 
[159:128]
    +///    of the result.
    +/// \param __e
    +///    A single-precision floating-point value used to initialize bits 
[127:96]
    +///    of the result.
    +/// \param __f
    +///    A single-precision floating-point value used to initialize bits 
[95:64]
    +///    of the result.
    +/// \param __g
    +///    A single-precision floating-point value used to initialize bits 
[63:32]
    +///    of the result.
    +/// \param __h
    +///    A single-precision floating-point value used to initialize bits 
[31:0]
    +///    of the result.
    +/// \returns An initialized 256-bit floating-point vector of [8 x float].
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_set_ps(float __a, float __b, float __c, float __d,
    +              float __e, float __f, float __g, float __h)
    +{
    +  return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
    +}
    +
    +/// Constructs a 256-bit integer vector initialized with the specified
    +///    32-bit integral values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a 
specific
    +///   instruction.
    +///
    +/// \param __i0
    +///    A 32-bit integral value used to initialize bits [255:224] of the 
result.
    +/// \param __i1
    +///    A 32-bit integral value used to initialize bits [223:192] of the 
result.
    +/// \param __i2
    +///    A 32-bit integral value used to initialize bits [191:160] of the 
result.
    +/// \param __i3
    +///    A 32-bit integral value used to initialize bits [159:128] of the 
result.
    +/// \param __i4
    +///    A 32-bit integral value used to initialize bits [127:96] of the 
result.
    +/// \param __i5
    +///    A 32-bit integral value used to initialize bits [95:64] of the 
result.
    +/// \param __i6
    +///    A 32-bit integral value used to initialize bits [63:32] of the 
result.
    +/// \param __i7
    +///    A 32-bit integral value used to initialize bits [31:0] of the 
result.
    +/// \returns An initialized 256-bit integer vector.
    +static __inline __m256i __DEFAULT_FN_ATTRS
    +_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
    +                 int __i4, int __i5, int __i6, int __i7)
    +{
    +  return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
    +}
    +
    +/// Constructs a 256-bit integer vector initialized with the specified
    +///    16-bit integral values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a 
specific
    +///   instruction.
    +///
    +/// \param __w15
    +///    A 16-bit integral value used to initialize bits [255:240] of the 
result.
    +/// \param __w14
    +///    A 16-bit integral value used to initialize bits [239:224] of the 
result.
    +/// \param __w13
    +///    A 16-bit integral value used to initialize bits [223:208] of the 
result.
    +/// \param __w12
    +///    A 16-bit integral value used to initialize bits [207:192] of the 
result.
    +/// \param __w11
    +///    A 16-bit integral value used to initialize bits [191:176] of the 
result.
    +/// \param __w10
    +///    A 16-bit integral value used to initialize bits [175:160] of the 
result.
    +/// \param __w09
    +///    A 16-bit integral value used to initialize bits [159:144] of the 
result.
    +/// \param __w08
    +///    A 16-bit integral value used to initialize bits [143:128] of the 
result.
    +/// \param __w07
    +///    A 16-bit integral value used to initialize bits [127:112] of the 
result.
    +/// \param __w06
    +///    A 16-bit integral value used to initialize bits [111:96] of the 
result.
    +/// \param __w05
    +///    A 16-bit integral value used to initialize bits [95:80] of the 
result.
    +/// \param __w04
    +///    A 16-bit integral value used to initialize bits [79:64] of the 
result.
    +/// \param __w03
    +///    A 16-bit integral value used to initialize bits [63:48] of the 
result.
    +/// \param __w02
    +///    A 16-bit integral value used to initialize bits [47:32] of the 
result.
    +/// \param __w01
    +///    A 16-bit integral value used to initialize bits [31:16] of the 
result.
    +/// \param __w00
    +///    A 16-bit integral value used to initialize bits [15:0] of the 
result.
    +/// \returns An initialized 256-bit integer vector.
    +static __inline __m256i __DEFAULT_FN_ATTRS
    +_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
    +                 short __w11, short __w10, short __w09, short __w08,
    +                 short __w07, short __w06, short __w05, short __w04,
    +                 short __w03, short __w02, short __w01, short __w00)
    +{
    +  return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
    +    __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
    +}
    +
    +/// Constructs a 256-bit integer vector initialized with the specified
    +///    8-bit integral values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a 
specific
    +///   instruction.
    +///
    +/// \param __b31
    +///    An 8-bit integral value used to initialize bits [255:248] of the 
result.
    +/// \param __b30
    +///    An 8-bit integral value used to initialize bits [247:240] of the 
result.
    +/// \param __b29
    +///    An 8-bit integral value used to initialize bits [239:232] of the 
result.
    +/// \param __b28
    +///    An 8-bit integral value used to initialize bits [231:224] of the 
result.
    +/// \param __b27
    +///    An 8-bit integral value used to initialize bits [223:216] of the 
result.
    +/// \param __b26
    +///    An 8-bit integral value used to initialize bits [215:208] of the 
result.
    +/// \param __b25
    +///    An 8-bit integral value used to initialize bits [207:200] of the 
result.
    +/// \param __b24
    +///    An 8-bit integral value used to initialize bits [199:192] of the 
result.
    +/// \param __b23
    +///    An 8-bit integral value used to initialize bits [191:184] of the 
result.
    +/// \param __b22
    +///    An 8-bit integral value used to initialize bits [183:176] of the 
result.
    +/// \param __b21
    +///    An 8-bit integral value used to initialize bits [175:168] of the 
result.
    +/// \param __b20
    +///    An 8-bit integral value used to initialize bits [167:160] of the 
result.
    +/// \param __b19
    +///    An 8-bit integral value used to initialize bits [159:152] of the 
result.
    +/// \param __b18
    +///    An 8-bit integral value used to initialize bits [151:144] of the 
result.
    +/// \param __b17
    +///    An 8-bit integral value used to initialize bits [143:136] of the 
result.
    +/// \param __b16
    +///    An 8-bit integral value used to initialize bits [135:128] of the 
result.
    +/// \param __b15
    +///    An 8-bit integral value used to initialize bits [127:120] of the 
result.
    +/// \param __b14
    +///    An 8-bit integral value used to initialize bits [119:112] of the 
result.
    +/// \param __b13
    +///    An 8-bit integral value used to initialize bits [111:104] of the 
result.
    +/// \param __b12
    +///    An 8-bit integral value used to initialize bits [103:96] of the 
result.
    +/// \param __b11
    +///    An 8-bit integral value used to initialize bits [95:88] of the 
result.
    +/// \param __b10
    +///    An 8-bit integral value used to initialize bits [87:80] of the 
result.
    +/// \param __b09
    +///    An 8-bit integral value used to initialize bits [79:72] of the 
result.
    +/// \param __b08
    +///    An 8-bit integral value used to initialize bits [71:64] of the 
result.
    +/// \param __b07
    +///    An 8-bit integral value used to initialize bits [63:56] of the 
result.
    +/// \param __b06
    +///    An 8-bit integral value used to initialize bits [55:48] of the 
result.
    +/// \param __b05
    +///    An 8-bit integral value used to initialize bits [47:40] of the 
result.
    +/// \param __b04
    +///    An 8-bit integral value used to initialize bits [39:32] of the 
result.
    +/// \param __b03
    +///    An 8-bit integral value used to initialize bits [31:24] of the 
result.
    +/// \param __b02
    +///    An 8-bit integral value used to initialize bits [23:16] of the 
result.
    +/// \param __b01
    +///    An 8-bit integral value used to initialize bits [15:8] of the 
result.
    +/// \param __b00
    +///    An 8-bit integral value used to initialize bits [7:0] of the result.
    +/// \returns An initialized 256-bit integer vector.
    +static __inline __m256i __DEFAULT_FN_ATTRS
    +_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
    +                char __b27, char __b26, char __b25, char __b24,
    +                char __b23, char __b22, char __b21, char __b20,
    +                char __b19, char __b18, char __b17, char __b16,
    +                char __b15, char __b14, char __b13, char __b12,
    +                char __b11, char __b10, char __b09, char __b08,
    +                char __b07, char __b06, char __b05, char __b04,
    +                char __b03, char __b02, char __b01, char __b00)
    +{
    +  return __extension__ (__m256i)(__v32qi){
    +    __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
    +    __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
    +    __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
    +    __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
    +  };
    +}
    +
    +/// Constructs a 256-bit integer vector initialized with the specified
    +///    64-bit integral values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A 64-bit integral value used to initialize bits [255:192] of the 
result.
    +/// \param __b
    +///    A 64-bit integral value used to initialize bits [191:128] of the 
result.
    +/// \param __c
    +///    A 64-bit integral value used to initialize bits [127:64] of the 
result.
    +/// \param __d
    +///    A 64-bit integral value used to initialize bits [63:0] of the 
result.
    +/// \returns An initialized 256-bit integer vector.
    +static __inline __m256i __DEFAULT_FN_ATTRS
    +_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
    +{
    +  return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
    +}
    +
    +/* Create vectors with elements in reverse order */
    +/// Constructs a 256-bit floating-point vector of [4 x double],
    +///    initialized in reverse order with the specified double-precision
    +///    floating-point values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A double-precision floating-point value used to initialize bits 
[63:0]
    +///    of the result.
    +/// \param __b
    +///    A double-precision floating-point value used to initialize bits 
[127:64]
    +///    of the result.
    +/// \param __c
    +///    A double-precision floating-point value used to initialize bits 
[191:128]
    +///    of the result.
    +/// \param __d
    +///    A double-precision floating-point value used to initialize bits 
[255:192]
    +///    of the result.
    +/// \returns An initialized 256-bit floating-point vector of [4 x double].
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_setr_pd(double __a, double __b, double __c, double __d)
    +{
    +  return _mm256_set_pd(__d, __c, __b, __a);
    +}
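    +
    +/* Illustrative sketch (not part of the upstream header): _mm256_setr_pd() is
    + * simply _mm256_set_pd() with the argument order reversed, so
    + *
    + *   _mm256_setr_pd(1.0, 2.0, 3.0, 4.0)
    + *
    + * places 1.0 in bits [63:0], i.e. the element order you would obtain by
    + * loading the array { 1.0, 2.0, 3.0, 4.0 } from memory.
    + */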
    +
    +/// Constructs a 256-bit floating-point vector of [8 x float],
    +///    initialized in reverse order with the specified single-precision
    +///    floating-point values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a 
specific
    +///   instruction.
    +///
    +/// \param __a
    +///    A single-precision floating-point value used to initialize bits 
[31:0]
    +///    of the result.
    +/// \param __b
    +///    A single-precision floating-point value used to initialize bits 
[63:32]
    +///    of the result.
    +/// \param __c
    +///    A single-precision floating-point value used to initialize bits 
[95:64]
    +///    of the result.
    +/// \param __d
    +///    A single-precision floating-point value used to initialize bits 
[127:96]
    +///    of the result.
    +/// \param __e
    +///    A single-precision floating-point value used to initialize bits 
[159:128]
    +///    of the result.
    +/// \param __f
    +///    A single-precision floating-point value used to initialize bits 
[191:160]
    +///    of the result.
    +/// \param __g
    +///    A single-precision floating-point value used to initialize bits 
[223:192]
    +///    of the result.
    +/// \param __h
    +///    A single-precision floating-point value used to initialize bits 
[255:224]
    +///    of the result.
    +/// \returns An initialized 256-bit floating-point vector of [8 x float].
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_setr_ps(float __a, float __b, float __c, float __d,
    +               float __e, float __f, float __g, float __h)
    +{
    +  return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
    +}
    +
    +/// Constructs a 256-bit integer vector, initialized in reverse order
    +///    with the specified 32-bit integral values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a 
specific
    +///   instruction.
    +///
    +/// \param __i0
    +///    A 32-bit integral value used to initialize bits [31:0] of the 
result.
    +/// \param __i1
    +///    A 32-bit integral value used to initialize bits [63:32] of the 
result.
    +/// \param __i2
    +///    A 32-bit integral value used to initialize bits [95:64] of the 
result.
    +/// \param __i3
    +///    A 32-bit integral value used to initialize bits [127:96] of the 
result.
    +/// \param __i4
    +///    A 32-bit integral value used to initialize bits [159:128] of the 
result.
    +/// \param __i5
    +///    A 32-bit integral value used to initialize bits [191:160] of the 
result.
    +/// \param __i6
    +///    A 32-bit integral value used to initialize bits [223:192] of the 
result.
    +/// \param __i7
    +///    A 32-bit integral value used to initialize bits [255:224] of the 
result.
    +/// \returns An initialized 256-bit integer vector.
    +static __inline __m256i __DEFAULT_FN_ATTRS
    +_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
    +                  int __i4, int __i5, int __i6, int __i7)
    +{
    +  return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
    +}
    +
    +/// Constructs a 256-bit integer vector, initialized in reverse order
    +///    with the specified 16-bit integral values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a 
specific
    +///   instruction.
    +///
    +/// \param __w15
    +///    A 16-bit integral value used to initialize bits [15:0] of the 
result.
    +/// \param __w14
    +///    A 16-bit integral value used to initialize bits [31:16] of the 
result.
    +/// \param __w13
    +///    A 16-bit integral value used to initialize bits [47:32] of the 
result.
    +/// \param __w12
    +///    A 16-bit integral value used to initialize bits [63:48] of the 
result.
    +/// \param __w11
    +///    A 16-bit integral value used to initialize bits [79:64] of the 
result.
    +/// \param __w10
    +///    A 16-bit integral value used to initialize bits [95:80] of the 
result.
    +/// \param __w09
    +///    A 16-bit integral value used to initialize bits [111:96] of the 
result.
    +/// \param __w08
    +///    A 16-bit integral value used to initialize bits [127:112] of the 
result.
    +/// \param __w07
    +///    A 16-bit integral value used to initialize bits [143:128] of the 
result.
    +/// \param __w06
    +///    A 16-bit integral value used to initialize bits [159:144] of the 
result.
    +/// \param __w05
    +///    A 16-bit integral value used to initialize bits [175:160] of the 
result.
    +/// \param __w04
    +///    A 16-bit integral value used to initialize bits [191:176] of the 
result.
    +/// \param __w03
    +///    A 16-bit integral value used to initialize bits [207:192] of the 
result.
    +/// \param __w02
    +///    A 16-bit integral value used to initialize bits [223:208] of the 
result.
    +/// \param __w01
    +///    A 16-bit integral value used to initialize bits [239:224] of the 
result.
    +/// \param __w00
    +///    A 16-bit integral value used to initialize bits [255:240] of the 
result.
    +/// \returns An initialized 256-bit integer vector.
    +static __inline __m256i __DEFAULT_FN_ATTRS
    +_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
    +       short __w11, short __w10, short __w09, short __w08,
    +       short __w07, short __w06, short __w05, short __w04,
    +       short __w03, short __w02, short __w01, short __w00)
    +{
    +  return _mm256_set_epi16(__w00, __w01, __w02, __w03,
    +                          __w04, __w05, __w06, __w07,
    +                          __w08, __w09, __w10, __w11,
    +                          __w12, __w13, __w14, __w15);
    +}
    +
    +/// Constructs a 256-bit integer vector, initialized in reverse order
    +///    with the specified 8-bit integral values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a 
specific
    +///   instruction.
    +///
    +/// \param __b31
    +///    An 8-bit integral value used to initialize bits [7:0] of the result.
    +/// \param __b30
    +///    An 8-bit integral value used to initialize bits [15:8] of the 
result.
    +/// \param __b29
    +///    An 8-bit integral value used to initialize bits [23:16] of the 
result.
    +/// \param __b28
    +///    An 8-bit integral value used to initialize bits [31:24] of the 
result.
    +/// \param __b27
    +///    An 8-bit integral value used to initialize bits [39:32] of the 
result.
    +/// \param __b26
    +///    An 8-bit integral value used to initialize bits [47:40] of the 
result.
    +/// \param __b25
    +///    An 8-bit integral value used to initialize bits [55:48] of the 
result.
    +/// \param __b24
    +///    An 8-bit integral value used to initialize bits [63:56] of the 
result.
    +/// \param __b23
    +///    An 8-bit integral value used to initialize bits [71:64] of the 
result.
    +/// \param __b22
    +///    An 8-bit integral value used to initialize bits [79:72] of the 
result.
    +/// \param __b21
    +///    An 8-bit integral value used to initialize bits [87:80] of the 
result.
    +/// \param __b20
    +///    An 8-bit integral value used to initialize bits [95:88] of the 
result.
    +/// \param __b19
    +///    An 8-bit integral value used to initialize bits [103:96] of the 
result.
    +/// \param __b18
    +///    An 8-bit integral value used to initialize bits [111:104] of the 
result.
    +/// \param __b17
    +///    An 8-bit integral value used to initialize bits [119:112] of the 
result.
    +/// \param __b16
    +///    An 8-bit integral value used to initialize bits [127:120] of the 
result.
    +/// \param __b15
    +///    An 8-bit integral value used to initialize bits [135:128] of the 
result.
    +/// \param __b14
    +///    An 8-bit integral value used to initialize bits [143:136] of the 
result.
    +/// \param __b13
    +///    An 8-bit integral value used to initialize bits [151:144] of the 
result.
    +/// \param __b12
    +///    An 8-bit integral value used to initialize bits [159:152] of the 
result.
    +/// \param __b11
    +///    An 8-bit integral value used to initialize bits [167:160] of the 
result.
    +/// \param __b10
    +///    An 8-bit integral value used to initialize bits [175:168] of the 
result.
    +/// \param __b09
    +///    An 8-bit integral value used to initialize bits [183:176] of the 
result.
    +/// \param __b08
    +///    An 8-bit integral value used to initialize bits [191:184] of the 
result.
    +/// \param __b07
    +///    An 8-bit integral value used to initialize bits [199:192] of the 
result.
    +/// \param __b06
    +///    An 8-bit integral value used to initialize bits [207:200] of the 
result.
    +/// \param __b05
    +///    An 8-bit integral value used to initialize bits [215:208] of the 
result.
    +/// \param __b04
    +///    An 8-bit integral value used to initialize bits [223:216] of the 
result.
    +/// \param __b03
    +///    An 8-bit integral value used to initialize bits [231:224] of the 
result.
    +/// \param __b02
    +///    An 8-bit integral value used to initialize bits [239:232] of the 
result.
    +/// \param __b01
    +///    An 8-bit integral value used to initialize bits [247:240] of the 
result.
    +/// \param __b00
    +///    An 8-bit integral value used to initialize bits [255:248] of the 
result.
    +/// \returns An initialized 256-bit integer vector.
    +static __inline __m256i __DEFAULT_FN_ATTRS
    +_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
    +                 char __b27, char __b26, char __b25, char __b24,
    +                 char __b23, char __b22, char __b21, char __b20,
    +                 char __b19, char __b18, char __b17, char __b16,
    +                 char __b15, char __b14, char __b13, char __b12,
    +                 char __b11, char __b10, char __b09, char __b08,
    +                 char __b07, char __b06, char __b05, char __b04,
    +                 char __b03, char __b02, char __b01, char __b00)
    +{
    +  return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
    +                         __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
    +                         __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
    +                         __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
    +}
    +
    +/// Constructs a 256-bit integer vector, initialized in reverse order
    +///    with the specified 64-bit integral values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A 64-bit integral value used to initialize bits [63:0] of the 
result.
    +/// \param __b
    +///    A 64-bit integral value used to initialize bits [127:64] of the 
result.
    +/// \param __c
    +///    A 64-bit integral value used to initialize bits [191:128] of the 
result.
    +/// \param __d
    +///    A 64-bit integral value used to initialize bits [255:192] of the 
result.
    +/// \returns An initialized 256-bit integer vector.
    +static __inline __m256i __DEFAULT_FN_ATTRS
    +_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
    +{
    +  return _mm256_set_epi64x(__d, __c, __b, __a);
    +}
    +
    +/* Create vectors with repeated elements */
    +/// Constructs a 256-bit floating-point vector of [4 x double], with each
    +///    of the four double-precision floating-point vector elements set to 
the
    +///    specified double-precision floating-point value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> 
instruction.
    +///
    +/// \param __w
    +///    A double-precision floating-point value used to initialize each 
vector
    +///    element of the result.
    +/// \returns An initialized 256-bit floating-point vector of [4 x double].
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_set1_pd(double __w)
    +{
    +  return _mm256_set_pd(__w, __w, __w, __w);
    +}
    +
    +/// Constructs a 256-bit floating-point vector of [8 x float], with each
    +///    of the eight single-precision floating-point vector elements set to 
the
    +///    specified single-precision floating-point value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
    +///   instruction.
    +///
    +/// \param __w
    +///    A single-precision floating-point value used to initialize each 
vector
    +///    element of the result.
    +/// \returns An initialized 256-bit floating-point vector of [8 x float].
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_set1_ps(float __w)
    +{
    +  return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
    +}
    +
    +/// Constructs a 256-bit integer vector of [8 x i32], with each of the
    +///    32-bit integral vector elements set to the specified 32-bit integral
    +///    value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
    +///   instruction.
    +///
    +/// \param __i
    +///    A 32-bit integral value used to initialize each vector element of 
the
    +///    result.
    +/// \returns An initialized 256-bit integer vector of [8 x i32].
    +static __inline __m256i __DEFAULT_FN_ATTRS
    +_mm256_set1_epi32(int __i)
    +{
    +  return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
    +}
    +
    +/// Constructs a 256-bit integer vector of [16 x i16], with each of the
    +///    16-bit integral vector elements set to the specified 16-bit integral
    +///    value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> 
instruction.
    +///
    +/// \param __w
    +///    A 16-bit integral value used to initialize each vector element of 
the
    +///    result.
    +/// \returns An initialized 256-bit integer vector of [16 x i16].
    +static __inline __m256i __DEFAULT_FN_ATTRS
    +_mm256_set1_epi16(short __w)
    +{
    +  return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
    +                          __w, __w, __w, __w, __w, __w, __w, __w);
    +}
    +
    +/// Constructs a 256-bit integer vector of [32 x i8], with each of the
    +///    8-bit integral vector elements set to the specified 8-bit integral 
value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> 
instruction.
    +///
    +/// \param __b
    +///    An 8-bit integral value used to initialize each vector element of 
the
    +///    result.
    +/// \returns An initialized 256-bit integer vector of [32 x i8].
    +static __inline __m256i __DEFAULT_FN_ATTRS
    +_mm256_set1_epi8(char __b)
    +{
    +  return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
    +                         __b, __b, __b, __b, __b, __b, __b, __b,
    +                         __b, __b, __b, __b, __b, __b, __b, __b,
    +                         __b, __b, __b, __b, __b, __b, __b, __b);
    +}
    +
    +/// Constructs a 256-bit integer vector of [4 x i64], with each of the
    +///    64-bit integral vector elements set to the specified 64-bit integral
    +///    value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> 
instruction.
    +///
    +/// \param __q
    +///    A 64-bit integral value used to initialize each vector element of 
the
    +///    result.
    +/// \returns An initialized 256-bit integer vector of [4 x i64].
    +static __inline __m256i __DEFAULT_FN_ATTRS
    +_mm256_set1_epi64x(long long __q)
    +{
    +  return _mm256_set_epi64x(__q, __q, __q, __q);
    +}
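    +
    +/* Illustrative sketch (not part of the upstream header): the _mm256_set1_*()
    + * helpers above broadcast one scalar into every vector element, e.g.:
    + *
    + *   __m256i ones  = _mm256_set1_epi32(1);    // eight 32-bit elements set to 1
    + *   __m256d halfs = _mm256_set1_pd(0.5);     // four elements set to 0.5
    + */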
    +
    +/* Create zeroed vectors */
    +/// Constructs a 256-bit floating-point vector of [4 x double] with all
    +///    vector elements initialized to zero.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
    +///
    +/// \returns A 256-bit vector of [4 x double] with all elements set to 
zero.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_setzero_pd(void)
    +{
    +  return __extension__ (__m256d){ 0, 0, 0, 0 };
    +}
    +
    +/// Constructs a 256-bit floating-point vector of [8 x float] with all
    +///    vector elements initialized to zero.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
    +///
    +/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_setzero_ps(void)
    +{
    +  return __extension__ (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
    +}
    +
    +/// Constructs a 256-bit integer vector initialized to zero.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
    +///
    +/// \returns A 256-bit integer vector initialized to zero.
    +static __inline __m256i __DEFAULT_FN_ATTRS
    +_mm256_setzero_si256(void)
    +{
    +  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
    +}
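    +
    +/* Illustrative sketch (not part of the upstream header): the setzero helpers
    + * are the usual way to obtain an all-zero accumulator, e.g.:
    + *
    + *   __m256d acc = _mm256_setzero_pd();   // { 0.0, 0.0, 0.0, 0.0 }
    + */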
    +
    +/* Cast between vector types */
    +/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
    +///    floating-point vector of [8 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \param __a
    +///    A 256-bit floating-point vector of [4 x double].
    +/// \returns A 256-bit floating-point vector of [8 x float] containing the 
same
    +///    bitwise pattern as the parameter.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_castpd_ps(__m256d __a)
    +{
    +  return (__m256)__a;
    +}
    +
    +/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
    +///    integer vector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \param __a
    +///    A 256-bit floating-point vector of [4 x double].
    +/// \returns A 256-bit integer vector containing the same bitwise pattern 
as the
    +///    parameter.
    +static __inline __m256i __DEFAULT_FN_ATTRS
    +_mm256_castpd_si256(__m256d __a)
    +{
    +  return (__m256i)__a;
    +}
    +
    +/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
    +///    floating-point vector of [4 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \param __a
    +///    A 256-bit floating-point vector of [8 x float].
    +/// \returns A 256-bit floating-point vector of [4 x double] containing 
the same
    +///    bitwise pattern as the parameter.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_castps_pd(__m256 __a)
    +{
    +  return (__m256d)__a;
    +}
    +
    +/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
    +///    integer vector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \param __a
    +///    A 256-bit floating-point vector of [8 x float].
    +/// \returns A 256-bit integer vector containing the same bitwise pattern 
as the
    +///    parameter.
    +static __inline __m256i __DEFAULT_FN_ATTRS
    +_mm256_castps_si256(__m256 __a)
    +{
    +  return (__m256i)__a;
    +}
    +
    +/// Casts a 256-bit integer vector into a 256-bit floating-point vector
    +///    of [8 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \param __a
    +///    A 256-bit integer vector.
    +/// \returns A 256-bit floating-point vector of [8 x float] containing the 
same
    +///    bitwise pattern as the parameter.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_castsi256_ps(__m256i __a)
    +{
    +  return (__m256)__a;
    +}
    +
    +/// Casts a 256-bit integer vector into a 256-bit floating-point vector
    +///    of [4 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \param __a
    +///    A 256-bit integer vector.
    +/// \returns A 256-bit floating-point vector of [4 x double] containing 
the same
    +///    bitwise pattern as the parameter.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_castsi256_pd(__m256i __a)
    +{
    +  return (__m256d)__a;
    +}
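    +
    +/* Illustrative sketch (not part of the upstream header): the casts above only
    + * reinterpret the 256 bits and, as documented, emit no instructions, e.g.:
    + *
    + *   __m256d v    = _mm256_set1_pd(-0.0);     // only the sign bits are set
    + *   __m256i bits = _mm256_castpd_si256(v);   // same 256 bits, integer view
    + */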
    +
    +/// Returns the lower 128 bits of a 256-bit floating-point vector of
    +///    [4 x double] as a 128-bit floating-point vector of [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \param __a
    +///    A 256-bit floating-point vector of [4 x double].
    +/// \returns A 128-bit floating-point vector of [2 x double] containing the
    +///    lower 128 bits of the parameter.
    +static __inline __m128d __DEFAULT_FN_ATTRS
    +_mm256_castpd256_pd128(__m256d __a)
    +{
    +#ifdef __GNUC__
    +  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__a);
    +#else
    +  return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
    +#endif
    +}
    +
    +/// Returns the lower 128 bits of a 256-bit floating-point vector of
    +///    [8 x float] as a 128-bit floating-point vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \param __a
    +///    A 256-bit floating-point vector of [8 x float].
    +/// \returns A 128-bit floating-point vector of [4 x float] containing the
    +///    lower 128 bits of the parameter.
    +static __inline __m128 __DEFAULT_FN_ATTRS
    +_mm256_castps256_ps128(__m256 __a)
    +{
    +#ifdef __GNUC__
    +  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__a);
    +#else
    +  return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
    +#endif
    +}
    +
    +/// Truncates a 256-bit integer vector into a 128-bit integer vector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \param __a
    +///    A 256-bit integer vector.
    +/// \returns A 128-bit integer vector containing the lower 128 bits of the
    +///    parameter.
    +static __inline __m128i __DEFAULT_FN_ATTRS
    +_mm256_castsi256_si128(__m256i __a)
    +{
    +#ifdef __GNUC__
    +  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__a);
    +#else
    +  return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
    +#endif
    +}
    +
    +/// Constructs a 256-bit floating-point vector of [4 x double] from a
    +///    128-bit floating-point vector of [2 x double].
    +///
    +///    The lower 128 bits contain the value of the source vector. The 
contents
    +///    of the upper 128 bits are undefined.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double].
    +/// \returns A 256-bit floating-point vector of [4 x double]. The lower 
128 bits
    +///    contain the value of the parameter. The contents of the upper 128 
bits
    +///    are undefined.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_castpd128_pd256(__m128d __a)
    +{
    +#ifdef __GNUC__
    +  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__a);
    +#else
    +  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
    +#endif
    +}
    +
    +/// Constructs a 256-bit floating-point vector of [8 x float] from a
    +///    128-bit floating-point vector of [4 x float].
    +///
    +///    The lower 128 bits contain the value of the source vector. The 
contents
    +///    of the upper 128 bits are undefined.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 
bits
    +///    contain the value of the parameter. The contents of the upper 128 
bits
    +///    are undefined.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_castps128_ps256(__m128 __a)
    +{
    +#ifdef __GNUC__
    +  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__a);
    +#else
    +  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
    +#endif
    +}
    +
    +/// Constructs a 256-bit integer vector from a 128-bit integer vector.
    +///
    +///    The lower 128 bits contain the value of the source vector. The 
contents
    +///    of the upper 128 bits are undefined.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector.
    +/// \returns A 256-bit integer vector. The lower 128 bits contain the 
value of
    +///    the parameter. The contents of the upper 128 bits are undefined.
    +static __inline __m256i __DEFAULT_FN_ATTRS
    +_mm256_castsi128_si256(__m128i __a)
    +{
    +#ifdef __GNUC__
    +  return (__m256i) __builtin_ia32_si256_si ((__v4si)__a);
    +#else
    +  return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
    +#endif
    +}
    +
    +/*
    +   Vector insert.
    +   We use macros rather than inlines because we only want to accept
    +   invocations where the immediate M is a constant expression.
    +*/
    +/// Constructs a new 256-bit vector of [8 x float] by first duplicating
    +///    a 256-bit vector of [8 x float] given in the first parameter, and 
then
    +///    replacing either the upper or the lower 128 bits with the contents 
of a
    +///    128-bit vector of [4 x float] in the second parameter.
    +///
    +///    The immediate integer parameter determines whether the upper or the
    +///    lower 128 bits of the result are replaced.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
    +///
    +/// \param V1
    +///    A 256-bit vector of [8 x float]. This vector is copied to the result
    +///    first, and then either the upper or the lower 128 bits of the 
result will
    +///    be replaced by the contents of \a V2.
    +/// \param V2
    +///    A 128-bit vector of [4 x float]. The contents of this parameter are
    +///    written to either the upper or the lower 128 bits of the result 
depending
    +///    on the value of parameter \a M.
    +/// \param M
    +///    An immediate integer. The least significant bit determines how the 
values
    +///    from the two parameters are interleaved: \n
    +///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
    +///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
    +///    result. \n
    +///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
    +///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
    +///    result.
    +/// \returns A 256-bit vector of [8 x float] containing the interleaved 
values.
    +#define _mm256_insertf128_ps(V1, V2, M) \
    +  (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
    +                                           (__v4sf)(__m128)(V2), (int)(M))
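    +
    +/* Illustrative sketch (not part of the upstream header): M must be a constant
    + * expression; its low bit selects which half of the copy of V1 is overwritten:
    + *
    + *   __m256 v  = _mm256_setzero_ps();
    + *   __m128 x  = _mm_set1_ps(1.0f);
    + *   __m256 hi = _mm256_insertf128_ps(v, x, 1);   // 1.0f in bits [255:128]
    + */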
    +
    +/// Constructs a new 256-bit vector of [4 x double] by first duplicating
    +///    a 256-bit vector of [4 x double] given in the first parameter, and 
then
    +///    replacing either the upper or the lower 128 bits with the contents 
of a
    +///    128-bit vector of [2 x double] in the second parameter.
    +///
    +///    The immediate integer parameter determines whether the upper or the
    +///    lower 128 bits of the result are replaced.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
    +///
    +/// \param V1
    +///    A 256-bit vector of [4 x double]. This vector is copied to the 
result
    +///    first, and then either the upper or the lower 128 bits of the 
result will
    +///    be replaced by the contents of \a V2.
    +/// \param V2
    +///    A 128-bit vector of [2 x double]. The contents of this parameter are
    +///    written to either the upper or the lower 128 bits of the result 
depending
    +///    on the value of parameter \a M.
    +/// \param M
    +///    An immediate integer. The least significant bit determines how the 
values
    +///    from the two parameters are interleaved: \n
    +///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
    +///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
    +///    result. \n
    +///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
    +///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
    +///    result.
    +/// \returns A 256-bit vector of [4 x double] containing the interleaved 
values.
    +#define _mm256_insertf128_pd(V1, V2, M) \
    +  (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
    +                                            (__v2df)(__m128d)(V2), (int)(M))
    +
    +/// Constructs a new 256-bit integer vector by first duplicating a
    +///    256-bit integer vector given in the first parameter, and then 
replacing
    +///    either the upper or the lower 128 bits with the contents of a 
128-bit
    +///    integer vector in the second parameter.
    +///
    +///    The immediate integer parameter determines whether the upper or the
    +///    lower 128 bits of the result are replaced.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
    +///
    +/// \param V1
    +///    A 256-bit integer vector. This vector is copied to the result 
first, and
    +///    then either the upper or the lower 128 bits of the result will be
    +///    replaced by the contents of \a V2.
    +/// \param V2
    +///    A 128-bit integer vector. The contents of this parameter are 
written to
    +///    either the upper or the lower 128 bits of the result depending on 
the
    +///     value of parameter \a M.
    +/// \param M
    +///    An immediate integer. The least significant bit determines how the 
values
    +///    from the two parameters are interleaved: \n
    +///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
    +///    and bits [255:128] of \a V1 are copied to bits [255:128] of the
    +///    result. \n
    +///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
    +///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
    +///    result.
    +/// \returns A 256-bit integer vector containing the interleaved values.
    +#define _mm256_insertf128_si256(V1, V2, M) \
    +  (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
    +                                            (__v4si)(__m128i)(V2), (int)(M))
    +
    +/*
    +   Vector extract.
    +   We use macros rather than inlines because we only want to accept
    +   invocations where the immediate M is a constant expression.
    +*/
    +/// Extracts either the upper or the lower 128 bits from a 256-bit vector
    +///    of [8 x float], as determined by the immediate integer parameter, 
and
    +///    returns the extracted bits as a 128-bit vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
    +///
    +/// \param V
    +///    A 256-bit vector of [8 x float].
    +/// \param M
    +///    An immediate integer. The least significant bit determines which 
bits are
    +///    extracted from the first parameter: \n
    +///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
    +///    result. \n
    +///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the 
result.
    +/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
    +#define _mm256_extractf128_ps(V, M) \
    +  (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M))
    +
    +/// Extracts either the upper or the lower 128 bits from a 256-bit vector
    +///    of [4 x double], as determined by the immediate integer parameter, 
and
    +///    returns the extracted bits as a 128-bit vector of [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
    +///
    +/// \param V
    +///    A 256-bit vector of [4 x double].
    +/// \param M
    +///    An immediate integer. The least significant bit determines which 
bits are
    +///    extracted from the first parameter: \n
    +///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
    +///    result. \n
    +///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the 
result.
    +/// \returns A 128-bit vector of [2 x double] containing the extracted 
bits.
    +#define _mm256_extractf128_pd(V, M) \
    +  (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M))
    +
    +/// Extracts either the upper or the lower 128 bits from a 256-bit
    +///    integer vector, as determined by the immediate integer parameter, 
and
    +///    returns the extracted bits as a 128-bit integer vector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
    +///
    +/// \param V
    +///    A 256-bit integer vector.
    +/// \param M
    +///    An immediate integer. The least significant bit determines which 
bits are
    +///    extracted from the first parameter:  \n
    +///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
    +///    result. \n
    +///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the 
result.
    +/// \returns A 128-bit integer vector containing the extracted bits.
    +#define _mm256_extractf128_si256(V, M) \
    +  (__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M))
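    +
    +/* Illustrative sketch (not part of the upstream header): as with the insert
    + * macros, M must be a compile-time constant; 0 selects the low half and 1 the
    + * high half. For some __m256i v:
    + *
    + *   __m128i lo = _mm256_extractf128_si256(v, 0);   // bits [127:0] of v
    + *   __m128i hi = _mm256_extractf128_si256(v, 1);   // bits [255:128] of v
    + */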
    +
    +/* SIMD load ops (unaligned) */
    +/// Loads two 128-bit floating-point vectors of [4 x float] from
    +///    unaligned memory locations and constructs a 256-bit floating-point 
vector
    +///    of [8 x float] by concatenating the two 128-bit vectors.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to load instructions followed by the
    +///   <c> VINSERTF128 </c> instruction.
    +///
    +/// \param __addr_hi
    +///    A pointer to a 128-bit memory location containing 4 consecutive
    +///    single-precision floating-point values. These values are to be 
copied to
    +///    bits[255:128] of the result. The address of the memory location 
does not
    +///    have to be aligned.
    +/// \param __addr_lo
    +///    A pointer to a 128-bit memory location containing 4 consecutive
    +///    single-precision floating-point values. These values are to be 
copied to
    +///    bits[127:0] of the result. The address of the memory location does 
not
    +///    have to be aligned.
    +/// \returns A 256-bit floating-point vector of [8 x float] containing the
    +///    concatenated result.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
    +{
    +  __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
    +  return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
    +}
    +
    +/// Loads two 128-bit floating-point vectors of [2 x double] from
    +///    unaligned memory locations and constructs a 256-bit floating-point 
vector
    +///    of [4 x double] by concatenating the two 128-bit vectors.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to load instructions followed by the
    +///   <c> VINSERTF128 </c> instruction.
    +///
    +/// \param __addr_hi
    +///    A pointer to a 128-bit memory location containing two consecutive
    +///    double-precision floating-point values. These values are to be 
copied to
    +///    bits[255:128] of the result. The address of the memory location 
does not
    +///    have to be aligned.
    +/// \param __addr_lo
    +///    A pointer to a 128-bit memory location containing two consecutive
    +///    double-precision floating-point values. These values are to be 
copied to
    +///    bits[127:0] of the result. The address of the memory location does 
not
    +///    have to be aligned.
    +/// \returns A 256-bit floating-point vector of [4 x double] containing the
    +///    concatenated result.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
    +{
    +  __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
    +  return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
    +}
    +
    +/// Loads two 128-bit integer vectors from unaligned memory locations and
    +///    constructs a 256-bit integer vector by concatenating the two 128-bit
    +///    vectors.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to load instructions followed by the
    +///   <c> VINSERTF128 </c> instruction.
    +///
    +/// \param __addr_hi
    +///    A pointer to a 128-bit memory location containing a 128-bit integer
    +///    vector. This vector is to be copied to bits[255:128] of the result. 
The
    +///    address of the memory location does not have to be aligned.
    +/// \param __addr_lo
    +///    A pointer to a 128-bit memory location containing a 128-bit integer
    +///    vector. This vector is to be copied to bits[127:0] of the result. 
The
    +///    address of the memory location does not have to be aligned.
    +/// \returns A 256-bit integer vector containing the concatenated result.
    +static __inline __m256i __DEFAULT_FN_ATTRS
    +_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
    +{
    +  __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
    +  return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
    +}
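    +
    +/* Illustrative sketch (not part of the upstream header): the loadu2 helpers
    + * gather two independent, possibly unaligned 128-bit halves, e.g.:
    + *
    + *   double lo[2] = { 1.0, 2.0 }, hi[2] = { 3.0, 4.0 };
    + *   __m256d v = _mm256_loadu2_m128d(hi, lo);   // { 1.0, 2.0, 3.0, 4.0 }
    + */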
    +
    +/* SIMD store ops (unaligned) */
    +/// Stores the upper and lower 128 bits of a 256-bit floating-point
    +///    vector of [8 x float] into two different unaligned memory locations.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction 
and the
    +///   store instructions.
    +///
    +/// \param __addr_hi
    +///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are 
to be
    +///    copied to this memory location. The address of this memory location 
does
    +///    not have to be aligned.
    +/// \param __addr_lo
    +///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are 
to be
    +///    copied to this memory location. The address of this memory location 
does
    +///    not have to be aligned.
    +/// \param __a
    +///    A 256-bit floating-point vector of [8 x float].
    +static __inline void __DEFAULT_FN_ATTRS
    +_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
    +{
    +  __m128 __v128;
    +
    +  __v128 = _mm256_castps256_ps128(__a);
    +  _mm_storeu_ps(__addr_lo, __v128);
    +  __v128 = _mm256_extractf128_ps(__a, 1);
    +  _mm_storeu_ps(__addr_hi, __v128);
    +}
    +
    +/// Stores the upper and lower 128 bits of a 256-bit floating-point
    +///    vector of [4 x double] into two different unaligned memory 
locations.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction 
and the
    +///   store instructions.
    +///
    +/// \param __addr_hi
    +///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are 
to be
    +///    copied to this memory location. The address of this memory location 
does
    +///    not have to be aligned.
    +/// \param __addr_lo
    +///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are 
to be
    +///    copied to this memory location. The address of this memory location 
does
    +///    not have to be aligned.
    +/// \param __a
    +///    A 256-bit floating-point vector of [4 x double].
    +static __inline void __DEFAULT_FN_ATTRS
    +_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
    +{
    +  __m128d __v128;
    +
    +  __v128 = _mm256_castpd256_pd128(__a);
    +  _mm_storeu_pd(__addr_lo, __v128);
    +  __v128 = _mm256_extractf128_pd(__a, 1);
    +  _mm_storeu_pd(__addr_hi, __v128);
    +}
    +
    +/// Stores the upper and lower 128 bits of a 256-bit integer vector into
    +///    two different unaligned memory locations.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction 
and the
    +///   store instructions.
    +///
    +/// \param __addr_hi
    +///    A pointer to a 128-bit memory location. Bits[255:128] of \a __a are 
to be
    +///    copied to this memory location. The address of this memory location 
does
    +///    not have to be aligned.
    +/// \param __addr_lo
    +///    A pointer to a 128-bit memory location. Bits[127:0] of \a __a are 
to be
    +///    copied to this memory location. The address of this memory location 
does
    +///    not have to be aligned.
    +/// \param __a
    +///    A 256-bit integer vector.
    +static __inline void __DEFAULT_FN_ATTRS
    +_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
    +{
    +  __m128i __v128;
    +
    +  __v128 = _mm256_castsi256_si128(__a);
    +  _mm_storeu_si128(__addr_lo, __v128);
    +  __v128 = _mm256_extractf128_si256(__a, 1);
    +  _mm_storeu_si128(__addr_hi, __v128);
    +}
    +
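    +/* Illustrative usage sketch (not part of the upstream LLVM header): the
    + * unaligned split load/store helpers above pair up naturally. The buffer
    + * names below are hypothetical.
    + *
    + *   int lo[4], hi[4];   // two unrelated, possibly unaligned 128-bit buffers
    + *   __m256i v = _mm256_loadu2_m128i((const __m128i *)hi, (const __m128i *)lo);
    + *   // ... operate on v ...
    + *   _mm256_storeu2_m128i((__m128i *)hi, (__m128i *)lo, v);
    + */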
    +/// Constructs a 256-bit floating-point vector of [8 x float] by
    +///    concatenating two 128-bit floating-point vectors of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
    +///
    +/// \param __hi
    +///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
    +///    128 bits of the result.
    +/// \param __lo
    +///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
    +///    128 bits of the result.
    +/// \returns A 256-bit floating-point vector of [8 x float] containing the
    +///    concatenated result.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_set_m128 (__m128 __hi, __m128 __lo)
    +{
    +#ifdef __GNUC__
    +  return _mm256_insertf128_ps (_mm256_castps128_ps256 (__lo), __hi, 1);
    +#else
    +  return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
    +#endif
    +}
    +
    +/// Constructs a 256-bit floating-point vector of [4 x double] by
    +///    concatenating two 128-bit floating-point vectors of [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
    +///
    +/// \param __hi
    +///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
    +///    128 bits of the result.
    +/// \param __lo
    +///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
    +///    128 bits of the result.
    +/// \returns A 256-bit floating-point vector of [4 x double] containing the
    +///    concatenated result.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_set_m128d (__m128d __hi, __m128d __lo)
    +{
    +#ifdef __GNUC__
    +  return (__m256d) _mm256_insertf128_pd (_mm256_castpd128_pd256 (__lo), __hi, 1);
    +#else
    +  return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
    +#endif
    +}
    +
    +/// Constructs a 256-bit integer vector by concatenating two 128-bit
    +///    integer vectors.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
    +///
    +/// \param __hi
    +///    A 128-bit integer vector to be copied to the upper 128 bits of the
    +///    result.
    +/// \param __lo
    +///    A 128-bit integer vector to be copied to the lower 128 bits of the
    +///    result.
    +/// \returns A 256-bit integer vector containing the concatenated result.
    +static __inline __m256i __DEFAULT_FN_ATTRS
    +_mm256_set_m128i (__m128i __hi, __m128i __lo)
    +{
    +#ifdef __GNUC__
    +  return (__m256i) _mm256_insertf128_si256 (_mm256_castsi128_si256 (__lo), __hi, 1);
    +#else
    +  return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
    +#endif
    +}
    +
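    +/* Illustrative usage sketch (not part of the upstream LLVM header):
    + * _mm256_set_m128* places its first argument in the upper half of the result
    + * and its second argument in the lower half.
    + *
    + *   __m128 lo = _mm_set1_ps(1.0f);
    + *   __m128 hi = _mm_set1_ps(2.0f);
    + *   __m256 v  = _mm256_set_m128(hi, lo);  // bits[127:0] = lo, bits[255:128] = hi
    + */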
    +/// Constructs a 256-bit floating-point vector of [8 x float] by
    +///    concatenating two 128-bit floating-point vectors of [4 x float]. This is
    +///    similar to _mm256_set_m128, but the order of the input parameters is
    +///    swapped.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
    +///
    +/// \param __lo
    +///    A 128-bit floating-point vector of [4 x float] to be copied to the lower
    +///    128 bits of the result.
    +/// \param __hi
    +///    A 128-bit floating-point vector of [4 x float] to be copied to the upper
    +///    128 bits of the result.
    +/// \returns A 256-bit floating-point vector of [8 x float] containing the
    +///    concatenated result.
    +static __inline __m256 __DEFAULT_FN_ATTRS
    +_mm256_setr_m128 (__m128 __lo, __m128 __hi)
    +{
    +  return _mm256_set_m128(__hi, __lo);
    +}
    +
    +/// Constructs a 256-bit floating-point vector of [4 x double] by
    +///    concatenating two 128-bit floating-point vectors of [2 x double]. This is
    +///    similar to _mm256_set_m128d, but the order of the input parameters is
    +///    swapped.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
    +///
    +/// \param __lo
    +///    A 128-bit floating-point vector of [2 x double] to be copied to the lower
    +///    128 bits of the result.
    +/// \param __hi
    +///    A 128-bit floating-point vector of [2 x double] to be copied to the upper
    +///    128 bits of the result.
    +/// \returns A 256-bit floating-point vector of [4 x double] containing the
    +///    concatenated result.
    +static __inline __m256d __DEFAULT_FN_ATTRS
    +_mm256_setr_m128d (__m128d __lo, __m128d __hi)
    +{
    +  return (__m256d)_mm256_set_m128d(__hi, __lo);
    +}
    +
    +/// Constructs a 256-bit integer vector by concatenating two 128-bit
    +///    integer vectors. This is similar to _mm256_set_m128i, but the order of
    +///    the input parameters is swapped.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
    +///
    +/// \param __lo
    +///    A 128-bit integer vector to be copied to the lower 128 bits of the
    +///    result.
    +/// \param __hi
    +///    A 128-bit integer vector to be copied to the upper 128 bits of the
    +///    result.
    +/// \returns A 256-bit integer vector containing the concatenated result.
    +static __inline __m256i __DEFAULT_FN_ATTRS
    +_mm256_setr_m128i (__m128i __lo, __m128i __hi)
    +{
    +  return (__m256i)_mm256_set_m128i(__hi, __lo);
    +}
    +
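    +/* Illustrative usage sketch (not part of the upstream LLVM header): the
    + * _mm256_setr_m128* variants take the same halves in low-to-high order, so,
    + * for some __m128i hi and lo, the two calls below build the same vector.
    + *
    + *   __m256i a = _mm256_set_m128i(hi, lo);
    + *   __m256i b = _mm256_setr_m128i(lo, hi);   // identical to a
    + */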
    +#undef __DEFAULT_FN_ATTRS
    +#undef __DEFAULT_FN_ATTRS128
    +
    +#endif /* __AVXINTRIN_H */
    diff --git a/include/emmintrin.h b/include/emmintrin.h
    new file mode 100644
    index 0000000..4569d98
    --- /dev/null
    +++ b/include/emmintrin.h
    @@ -0,0 +1,5021 @@
    +/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
    + *
    + * Permission is hereby granted, free of charge, to any person obtaining a copy
    + * of this software and associated documentation files (the "Software"), to deal
    + * in the Software without restriction, including without limitation the rights
    + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    + * copies of the Software, and to permit persons to whom the Software is
    + * furnished to do so, subject to the following conditions:
    + *
    + * The above copyright notice and this permission notice shall be included in
    + * all copies or substantial portions of the Software.
    + *
    + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    + * THE SOFTWARE.
    + *
    + *===-----------------------------------------------------------------------===
    + */
    +
    +#ifndef __EMMINTRIN_H
    +#define __EMMINTRIN_H
    +
    +#include <xmmintrin.h>
    +
    +typedef double __m128d __attribute__((__vector_size__(16)));
    +typedef long long __m128i __attribute__((__vector_size__(16)));
    +
    +/* Type defines.  */
    +typedef double __v2df __attribute__ ((__vector_size__ (16)));
    +typedef long long __v2di __attribute__ ((__vector_size__ (16)));
    +typedef short __v8hi __attribute__((__vector_size__(16)));
    +typedef char __v16qi __attribute__((__vector_size__(16)));
    +
    +/* Unsigned types */
    +typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
    +typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
    +typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
    +
    +/* We need an explicitly signed variant for char. Note that this shouldn't
    + * appear in the interface though. */
    +typedef signed char __v16qs __attribute__((__vector_size__(16)));
    +
    +/* Define the default attributes for the functions in this file. */
    +#ifdef  __GNUC__
    +#define __DEFAULT_FN_ATTRS __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    +#define __DEFAULT_FN_ATTRS_MMX __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    +#else
    +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(128)))
    +#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"), __min_vector_width__(64)))
    +#endif
    +
    +#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
    +
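    +/* Illustrative usage sketch (not part of the upstream LLVM header):
    + * _MM_SHUFFLE2(z, y) packs two one-bit lane selectors into the immediate
    + * expected by _mm_shuffle_pd.
    + *
    + *   // r[0] = a[1], r[1] = b[0], for some __m128d a and b
    + *   __m128d r = _mm_shuffle_pd(a, b, _MM_SHUFFLE2(0, 1));
    + */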
    +/// Adds lower double-precision values in both operands and returns the
    +///    sum in the lower 64 bits of the result. The upper 64 bits of the result
    +///    are copied from the upper double-precision value of the first operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing one of the source operands.
    +/// \param __b
    +///    A 128-bit vector of [2 x double] containing one of the source operands.
    +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
    +///    sum of the lower 64 bits of both operands. The upper 64 bits are copied
    +///    from the upper 64 bits of the first source operand.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_add_sd(__m128d __a, __m128d __b)
    +{
    +  __a[0] += __b[0];
    +  return __a;
    +}
    +
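    +/* Illustrative usage sketch (not part of the upstream LLVM header): the "sd"
    + * form only touches the low lane; the high lane of the first operand passes
    + * through unchanged.
    + *
    + *   __m128d a = _mm_set_pd(4.0, 2.0);   // a = {2.0, 4.0}
    + *   __m128d b = _mm_set_pd(9.0, 3.0);   // b = {3.0, 9.0}
    + *   __m128d r = _mm_add_sd(a, b);       // r = {5.0, 4.0}
    + */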
    +/// Adds two 128-bit vectors of [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing one of the source operands.
    +/// \param __b
    +///    A 128-bit vector of [2 x double] containing one of the source operands.
    +/// \returns A 128-bit vector of [2 x double] containing the sums of both
    +///    operands.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_add_pd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)((__v2df)__a + (__v2df)__b);
    +}
    +
    +/// Subtracts the lower double-precision value of the second operand
    +///    from the lower double-precision value of the first operand and returns
    +///    the difference in the lower 64 bits of the result. The upper 64 bits of
    +///    the result are copied from the upper double-precision value of the first
    +///    operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing the minuend.
    +/// \param __b
    +///    A 128-bit vector of [2 x double] containing the subtrahend.
    +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
    +///    difference of the lower 64 bits of both operands. The upper 64 bits are
    +///    copied from the upper 64 bits of the first source operand.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_sub_sd(__m128d __a, __m128d __b)
    +{
    +  __a[0] -= __b[0];
    +  return __a;
    +}
    +
    +/// Subtracts two 128-bit vectors of [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing the minuend.
    +/// \param __b
    +///    A 128-bit vector of [2 x double] containing the subtrahend.
    +/// \returns A 128-bit vector of [2 x double] containing the differences between
    +///    both operands.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_sub_pd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)((__v2df)__a - (__v2df)__b);
    +}
    +
    +/// Multiplies lower double-precision values in both operands and returns
    +///    the product in the lower 64 bits of the result. The upper 64 bits of the
    +///    result are copied from the upper double-precision value of the first
    +///    operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing one of the source operands.
    +/// \param __b
    +///    A 128-bit vector of [2 x double] containing one of the source operands.
    +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
    +///    product of the lower 64 bits of both operands. The upper 64 bits are
    +///    copied from the upper 64 bits of the first source operand.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_mul_sd(__m128d __a, __m128d __b)
    +{
    +  __a[0] *= __b[0];
    +  return __a;
    +}
    +
    +/// Multiplies two 128-bit vectors of [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing one of the operands.
    +/// \param __b
    +///    A 128-bit vector of [2 x double] containing one of the operands.
    +/// \returns A 128-bit vector of [2 x double] containing the products of both
    +///    operands.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_mul_pd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)((__v2df)__a * (__v2df)__b);
    +}
    +
    +/// Divides the lower double-precision value of the first operand by the
    +///    lower double-precision value of the second operand and returns the
    +///    quotient in the lower 64 bits of the result. The upper 64 bits of the
    +///    result are copied from the upper double-precision value of the first
    +///    operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing the dividend.
    +/// \param __b
    +///    A 128-bit vector of [2 x double] containing the divisor.
    +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
    +///    quotient of the lower 64 bits of both operands. The upper 64 bits are
    +///    copied from the upper 64 bits of the first source operand.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_div_sd(__m128d __a, __m128d __b)
    +{
    +  __a[0] /= __b[0];
    +  return __a;
    +}
    +
    +/// Performs an element-by-element division of two 128-bit vectors of
    +///    [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing the dividend.
    +/// \param __b
    +///    A 128-bit vector of [2 x double] containing the divisor.
    +/// \returns A 128-bit vector of [2 x double] containing the quotients of both
    +///    operands.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_div_pd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)((__v2df)__a / (__v2df)__b);
    +}
    +
    +/// Calculates the square root of the lower double-precision value of
    +///    the second operand and returns it in the lower 64 bits of the result.
    +///    The upper 64 bits of the result are copied from the upper
    +///    double-precision value of the first operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing one of the operands. The
    +///    upper 64 bits of this operand are copied to the upper 64 bits of the
    +///    result.
    +/// \param __b
    +///    A 128-bit vector of [2 x double] containing one of the operands. The
    +///    square root is calculated using the lower 64 bits of this operand.
    +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
    +///    square root of the lower 64 bits of operand \a __b, and whose upper 64
    +///    bits are copied from the upper 64 bits of operand \a __a.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_sqrt_sd(__m128d __a, __m128d __b)
    +{
    +  __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
    +  return __extension__ (__m128d) { __c[0], __a[1] };
    +}
    +
    +/// Calculates the square root of each of the two values stored in a
    +///    128-bit vector of [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double].
    +/// \returns A 128-bit vector of [2 x double] containing the square roots of the
    +///    values in the operand.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_sqrt_pd(__m128d __a)
    +{
    +  return __builtin_ia32_sqrtpd((__v2df)__a);
    +}
    +
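    +/* Illustrative usage sketch (not part of the upstream LLVM header):
    + * _mm_sqrt_pd operates on both lanes, while _mm_sqrt_sd takes the square root
    + * of __b's low lane and keeps __a's high lane.
    + *
    + *   __m128d v = _mm_set_pd(16.0, 4.0);   // v = {4.0, 16.0}
    + *   __m128d p = _mm_sqrt_pd(v);          // p = {2.0, 4.0}
    + *   __m128d s = _mm_sqrt_sd(v, v);       // s = {2.0, 16.0}
    + */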
    +/// Compares the lower 64-bit double-precision values of both operands, and
    +///    returns the lesser of the pair of values in the lower 64 bits of the
    +///    result. The upper 64 bits of the result are copied from the upper
    +///    double-precision value of the first operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing one of the operands. The
    +///    lower 64 bits of this operand are used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [2 x double] containing one of the operands. The
    +///    lower 64 bits of this operand are used in the comparison.
    +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
    +///    minimum value between both operands. The upper 64 bits are copied from
    +///    the upper 64 bits of the first source operand.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_min_sd(__m128d __a, __m128d __b)
    +{
    +  return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Performs element-by-element comparison of the two 128-bit vectors of
    +///    [2 x double] and returns the vector containing the lesser of each pair of
    +///    values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing one of the operands.
    +/// \param __b
    +///    A 128-bit vector of [2 x double] containing one of the operands.
    +/// \returns A 128-bit vector of [2 x double] containing the minimum values
    +///    between both operands.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_min_pd(__m128d __a, __m128d __b)
    +{
    +  return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares the lower 64-bit double-precision values of both operands, and
    +///    returns the greater of the pair of values in the lower 64 bits of the
    +///    result. The upper 64 bits of the result are copied from the upper
    +///    double-precision value of the first operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing one of the operands. The
    +///    lower 64 bits of this operand are used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [2 x double] containing one of the operands. The
    +///    lower 64 bits of this operand are used in the comparison.
    +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
    +///    maximum value between both operands. The upper 64 bits are copied from
    +///    the upper 64 bits of the first source operand.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_max_sd(__m128d __a, __m128d __b)
    +{
    +  return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Performs element-by-element comparison of the two 128-bit vectors of
    +///    [2 x double] and returns the vector containing the greater of each pair
    +///    of values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing one of the operands.
    +/// \param __b
    +///    A 128-bit vector of [2 x double] containing one of the operands.
    +/// \returns A 128-bit vector of [2 x double] containing the maximum values
    +///    between both operands.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_max_pd(__m128d __a, __m128d __b)
    +{
    +  return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
    +}
    +
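    +/* Illustrative usage sketch (not part of the upstream LLVM header): the
    + * packed min/max forms compare lane by lane.
    + *
    + *   __m128d a  = _mm_set_pd(7.0, 1.0);   // a = {1.0, 7.0}
    + *   __m128d b  = _mm_set_pd(2.0, 5.0);   // b = {5.0, 2.0}
    + *   __m128d lo = _mm_min_pd(a, b);       // lo = {1.0, 2.0}
    + *   __m128d hi = _mm_max_pd(a, b);       // hi = {5.0, 7.0}
    + */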
    +/// Performs a bitwise AND of two 128-bit vectors of [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing one of the source operands.
    +/// \param __b
    +///    A 128-bit vector of [2 x double] containing one of the source operands.
    +/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
    +///    values between both operands.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_and_pd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)((__v2du)__a & (__v2du)__b);
    +}
    +
    +/// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
    +///    the one's complement of the values contained in the first source operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing the left source operand. The
    +///    one's complement of this value is used in the bitwise AND.
    +/// \param __b
    +///    A 128-bit vector of [2 x double] containing the right source operand.
    +/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
    +///    values in the second operand and the one's complement of the first
    +///    operand.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_andnot_pd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)(~(__v2du)__a & (__v2du)__b);
    +}
    +
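    +/* Illustrative usage sketch (not part of the upstream LLVM header):
    + * _mm_andnot_pd computes (~__a) & __b bitwise, which is handy for clearing
    + * mask-selected bits, e.g. the sign bits (a fabs() over both lanes of some
    + * __m128d x):
    + *
    + *   __m128d sign = _mm_set1_pd(-0.0);        // only the sign bits set
    + *   __m128d absx = _mm_andnot_pd(sign, x);
    + */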
    +/// Performs a bitwise OR of two 128-bit vectors of [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing one of the source operands.
    +/// \param __b
    +///    A 128-bit vector of [2 x double] containing one of the source operands.
    +/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
    +///    values between both operands.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_or_pd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)((__v2du)__a | (__v2du)__b);
    +}
    +
    +/// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing one of the source operands.
    +/// \param __b
    +///    A 128-bit vector of [2 x double] containing one of the source operands.
    +/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
    +///    values between both operands.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_xor_pd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)((__v2du)__a ^ (__v2du)__b);
    +}
    +
    +/// Compares each of the corresponding double-precision values of the
    +///    128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
    +///    for false, 0xFFFFFFFFFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double].
    +/// \param __b
    +///    A 128-bit vector of [2 x double].
    +/// \returns A 128-bit vector containing the comparison results.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cmpeq_pd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
    +}
    +
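    +/* Illustrative usage sketch (not part of the upstream LLVM header): packed
    + * compares produce an all-ones/all-zeros mask per lane, typically consumed
    + * with a bitwise blend or a movemask (for some __m128d a and b):
    + *
    + *   __m128d eq = _mm_cmpeq_pd(a, b);
    + *   int lanes  = _mm_movemask_pd(eq);   // bit i set where a[i] == b[i]
    + */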
    +/// Compares each of the corresponding double-precision values of the
    +///    128-bit vectors of [2 x double] to determine if the values in the first
    +///    operand are less than those in the second operand. Each comparison
    +///    yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double].
    +/// \param __b
    +///    A 128-bit vector of [2 x double].
    +/// \returns A 128-bit vector containing the comparison results.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cmplt_pd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares each of the corresponding double-precision values of the
    +///    128-bit vectors of [2 x double] to determine if the values in the first
    +///    operand are less than or equal to those in the second operand.
    +///
    +///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double].
    +/// \param __b
    +///    A 128-bit vector of [2 x double].
    +/// \returns A 128-bit vector containing the comparison results.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cmple_pd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares each of the corresponding double-precision values of the
    +///    128-bit vectors of [2 x double] to determine if the values in the first
    +///    operand are greater than those in the second operand.
    +///
    +///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double].
    +/// \param __b
    +///    A 128-bit vector of [2 x double].
    +/// \returns A 128-bit vector containing the comparison results.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cmpgt_pd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
    +}
    +
    +/// Compares each of the corresponding double-precision values of the
    +///    128-bit vectors of [2 x double] to determine if the values in the first
    +///    operand are greater than or equal to those in the second operand.
    +///
    +///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double].
    +/// \param __b
    +///    A 128-bit vector of [2 x double].
    +/// \returns A 128-bit vector containing the comparison results.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cmpge_pd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
    +}
    +
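    +/* Illustrative note (not part of the upstream LLVM header): as the bodies
    + * above show, the packed "gt"/"ge" compares are expressed as "lt"/"le" with
    + * the operands swapped, so, for some __m128d a and b:
    + *
    + *   __m128d r1 = _mm_cmpgt_pd(a, b);
    + *   __m128d r2 = _mm_cmplt_pd(b, a);   // same mask as r1
    + */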
    +/// Compares each of the corresponding double-precision values of the
    +///    128-bit vectors of [2 x double] to determine if the values in the 
first
    +///    operand are ordered with respect to those in the second operand.
    +///
    +///    A pair of double-precision values are "ordered" with respect to each
    +///    other if neither value is a NaN. Each comparison yields 0x0 for 
false,
    +///    0xFFFFFFFFFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double].
    +/// \param __b
    +///    A 128-bit vector of [2 x double].
    +/// \returns A 128-bit vector containing the comparison results.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cmpord_pd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares each of the corresponding double-precision values of the
    +///    128-bit vectors of [2 x double] to determine if the values in the 
first
    +///    operand are unordered with respect to those in the second operand.
    +///
    +///    A pair of double-precision values are "unordered" with respect to 
each
    +///    other if one or both values are NaN. Each comparison yields 0x0 for
    +///    false, 0xFFFFFFFFFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double].
    +/// \param __b
    +///    A 128-bit vector of [2 x double].
    +/// \returns A 128-bit vector containing the comparison results.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cmpunord_pd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares each of the corresponding double-precision values of the
    +///    128-bit vectors of [2 x double] to determine if the values in the 
first
    +///    operand are unequal to those in the second operand.
    +///
    +///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double].
    +/// \param __b
    +///    A 128-bit vector of [2 x double].
    +/// \returns A 128-bit vector containing the comparison results.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cmpneq_pd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares each of the corresponding double-precision values of the
    +///    128-bit vectors of [2 x double] to determine if the values in the 
first
    +///    operand are not less than those in the second operand.
    +///
    +///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double].
    +/// \param __b
    +///    A 128-bit vector of [2 x double].
    +/// \returns A 128-bit vector containing the comparison results.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cmpnlt_pd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares each of the corresponding double-precision values of the
    +///    128-bit vectors of [2 x double] to determine if the values in the 
first
    +///    operand are not less than or equal to those in the second operand.
    +///
    +///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double].
    +/// \param __b
    +///    A 128-bit vector of [2 x double].
    +/// \returns A 128-bit vector containing the comparison results.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cmpnle_pd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares each of the corresponding double-precision values of the
    +///    128-bit vectors of [2 x double] to determine if the values in the 
first
    +///    operand are not greater than those in the second operand.
    +///
    +///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double].
    +/// \param __b
    +///    A 128-bit vector of [2 x double].
    +/// \returns A 128-bit vector containing the comparison results.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cmpngt_pd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
    +}
    +
    +/// Compares each of the corresponding double-precision values of the
    +///    128-bit vectors of [2 x double] to determine if the values in the 
first
    +///    operand are not greater than or equal to those in the second 
operand.
    +///
    +///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double].
    +/// \param __b
    +///    A 128-bit vector of [2 x double].
    +/// \returns A 128-bit vector containing the comparison results.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cmpnge_pd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
    +}
    +
    +/// Compares the lower double-precision floating-point values in each of
    +///    the two 128-bit floating-point vectors of [2 x double] for equality.
    +///
    +///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __b.
    +/// \param __b
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __a.
    +/// \returns A 128-bit vector. The lower 64 bits contains the comparison
    +///    results. The upper 64 bits are copied from the upper 64 bits of \a 
__a.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cmpeq_sd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares the lower double-precision floating-point values in each of
    +///    the two 128-bit floating-point vectors of [2 x double] to determine 
if
    +///    the value in the first parameter is less than the corresponding 
value in
    +///    the second parameter.
    +///
    +///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __b.
    +/// \param __b
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __a.
    +/// \returns A 128-bit vector. The lower 64 bits contains the comparison
    +///    results. The upper 64 bits are copied from the upper 64 bits of \a 
__a.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cmplt_sd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares the lower double-precision floating-point values in each of
    +///    the two 128-bit floating-point vectors of [2 x double] to determine 
if
    +///    the value in the first parameter is less than or equal to the
    +///    corresponding value in the second parameter.
    +///
    +///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __b.
    +/// \param __b
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __a.
    +/// \returns A 128-bit vector. The lower 64 bits contains the comparison
    +///    results. The upper 64 bits are copied from the upper 64 bits of \a 
__a.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cmple_sd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares the lower double-precision floating-point values in each of
    +///    the two 128-bit floating-point vectors of [2 x double] to determine if
    +///    the value in the first parameter is greater than the corresponding value
    +///    in the second parameter.
    +///
    +///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
    +///
    +/// \param __a
    +///     A 128-bit vector of [2 x double]. The lower double-precision value is
    +///     compared to the lower double-precision value of \a __b.
    +/// \param __b
    +///     A 128-bit vector of [2 x double]. The lower double-precision value is
    +///     compared to the lower double-precision value of \a __a.
    +/// \returns A 128-bit vector. The lower 64 bits contains the comparison
    +///     results. The upper 64 bits are copied from the upper 64 bits of \a __a.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cmpgt_sd(__m128d __a, __m128d __b)
    +{
    +  __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
    +  return __extension__ (__m128d) { __c[0], __a[1] };
    +}
    +
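    +/* Illustrative usage sketch (not part of the upstream LLVM header): the
    + * scalar ("sd") compares only report on the low lane; the high lane of the
    + * result is copied from __a.
    + *
    + *   __m128d a = _mm_set_pd(8.0, 3.0);   // a = {3.0, 8.0}
    + *   __m128d b = _mm_set_pd(1.0, 2.0);   // b = {2.0, 1.0}
    + *   __m128d m = _mm_cmpgt_sd(a, b);     // low lane: all ones (3.0 > 2.0),
    + *                                       // high lane: 8.0 (copied from a)
    + */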
    +/// Compares the lower double-precision floating-point values in each of
    +///    the two 128-bit floating-point vectors of [2 x double] to determine 
if
    +///    the value in the first parameter is greater than or equal to the
    +///    corresponding value in the second parameter.
    +///
    +///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __b.
    +/// \param __b
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __a.
    +/// \returns A 128-bit vector. The lower 64 bits contains the comparison
    +///    results. The upper 64 bits are copied from the upper 64 bits of \a 
__a.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cmpge_sd(__m128d __a, __m128d __b)
    +{
    +  __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
    +  return __extension__ (__m128d) { __c[0], __a[1] };
    +}
    +
    +/// Compares the lower double-precision floating-point values in each of
    +///    the two 128-bit floating-point vectors of [2 x double] to determine 
if
    +///    the value in the first parameter is "ordered" with respect to the
    +///    corresponding value in the second parameter.
    +///
    +///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A 
pair
    +///    of double-precision values are "ordered" with respect to each other 
if
    +///    neither value is a NaN.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __b.
    +/// \param __b
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __a.
    +/// \returns A 128-bit vector. The lower 64 bits contains the comparison
    +///    results. The upper 64 bits are copied from the upper 64 bits of \a 
__a.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cmpord_sd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares the lower double-precision floating-point values in each of
    +///    the two 128-bit floating-point vectors of [2 x double] to determine 
if
    +///    the value in the first parameter is "unordered" with respect to the
    +///    corresponding value in the second parameter.
    +///
    +///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A 
pair
    +///    of double-precision values are "unordered" with respect to each 
other if
    +///    one or both values are NaN.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __b.
    +/// \param __b
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __a.
    +/// \returns A 128-bit vector. The lower 64 bits contains the comparison
    +///    results. The upper 64 bits are copied from the upper 64 bits of \a 
__a.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cmpunord_sd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares the lower double-precision floating-point values in each of
    +///    the two 128-bit floating-point vectors of [2 x double] to determine 
if
    +///    the value in the first parameter is unequal to the corresponding 
value in
    +///    the second parameter.
    +///
    +///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __b.
    +/// \param __b
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __a.
    +/// \returns A 128-bit vector. The lower 64 bits contains the comparison
    +///    results. The upper 64 bits are copied from the upper 64 bits of \a 
__a.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cmpneq_sd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares the lower double-precision floating-point values in each of
    +///    the two 128-bit floating-point vectors of [2 x double] to determine 
if
    +///    the value in the first parameter is not less than the corresponding
    +///    value in the second parameter.
    +///
    +///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __b.
    +/// \param __b
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __a.
    +/// \returns A 128-bit vector. The lower 64 bits contains the comparison
    +///    results. The upper 64 bits are copied from the upper 64 bits of \a 
__a.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cmpnlt_sd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares the lower double-precision floating-point values in each of
    +///    the two 128-bit floating-point vectors of [2 x double] to determine 
if
    +///    the value in the first parameter is not less than or equal to the
    +///    corresponding value in the second parameter.
    +///
    +///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __b.
    +/// \param __b
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __a.
    +/// \returns  A 128-bit vector. The lower 64 bits contains the comparison
    +///    results. The upper 64 bits are copied from the upper 64 bits of \a 
__a.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cmpnle_sd(__m128d __a, __m128d __b)
    +{
    +  return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares the lower double-precision floating-point values in each of
    +///    the two 128-bit floating-point vectors of [2 x double] to determine 
if
    +///    the value in the first parameter is not greater than the 
corresponding
    +///    value in the second parameter.
    +///
    +///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __b.
    +/// \param __b
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __a.
    +/// \returns A 128-bit vector. The lower 64 bits contains the comparison
    +///    results. The upper 64 bits are copied from the upper 64 bits of \a 
__a.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cmpngt_sd(__m128d __a, __m128d __b)
    +{
    +  __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
    +  return __extension__ (__m128d) { __c[0], __a[1] };
    +}
    +
    +/// Compares the lower double-precision floating-point values in each of
    +///    the two 128-bit floating-point vectors of [2 x double] to determine 
if
    +///    the value in the first parameter is not greater than or equal to the
    +///    corresponding value in the second parameter.
    +///
    +///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __b.
    +/// \param __b
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __a.
    +/// \returns A 128-bit vector. The lower 64 bits contains the comparison
    +///    results. The upper 64 bits are copied from the upper 64 bits of \a 
__a.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cmpnge_sd(__m128d __a, __m128d __b)
    +{
    +  __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
    +  return __extension__ (__m128d) { __c[0], __a[1] };
    +}
    +
    +/// Compares the lower double-precision floating-point values in each of
    +///    the two 128-bit floating-point vectors of [2 x double] for equality.
    +///
    +///    The comparison yields 0 for false, 1 for true. If either of the two
    +///    lower double-precision values is NaN, 0 is returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower double-precision value is
    +///    compared to the lower double-precision value of \a __b.
    +/// \param __b
    +///    A 128-bit vector of [2 x double]. The lower double-precision value is
    +///    compared to the lower double-precision value of \a __a.
    +/// \returns An integer containing the comparison results. If either of the two
    +///    lower double-precision values is NaN, 0 is returned.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_comieq_sd(__m128d __a, __m128d __b)
    +{
    +  return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
    +}
    +
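    +/* Illustrative usage sketch (not part of the upstream LLVM header): unlike
    + * the _mm_cmp*_sd mask compares, the _mm_comi*_sd family returns a plain int,
    + * so it can be used directly in control flow (for some __m128d a and b):
    + *
    + *   int lt = _mm_comilt_sd(a, b);   // 1 when a[0] < b[0], 0 otherwise or on NaN
    + */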
    +/// Compares the lower double-precision floating-point values in each of
    +///    the two 128-bit floating-point vectors of [2 x double] to determine 
if
    +///    the value in the first parameter is less than the corresponding 
value in
    +///    the second parameter.
    +///
    +///    The comparison yields 0 for false, 1 for true. If either of the two
    +///    lower double-precision values is NaN, 0 is returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __b.
    +/// \param __b
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __a.
    +/// \returns An integer containing the comparison results. If either of 
the two
    +///     lower double-precision values is NaN, 0 is returned.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_comilt_sd(__m128d __a, __m128d __b)
    +{
    +  return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares the lower double-precision floating-point values in each of
    +///    the two 128-bit floating-point vectors of [2 x double] to determine 
if
    +///    the value in the first parameter is less than or equal to the
    +///    corresponding value in the second parameter.
    +///
    +///    The comparison yields 0 for false, 1 for true. If either of the two
    +///    lower double-precision values is NaN, 0 is returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __b.
    +/// \param __b
    +///     A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///     compared to the lower double-precision value of \a __a.
    +/// \returns An integer containing the comparison results. If either of 
the two
    +///     lower double-precision values is NaN, 0 is returned.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_comile_sd(__m128d __a, __m128d __b)
    +{
    +  return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares the lower double-precision floating-point values in each of
    +///    the two 128-bit floating-point vectors of [2 x double] to determine 
if
    +///    the value in the first parameter is greater than the corresponding 
value
    +///    in the second parameter.
    +///
    +///    The comparison yields 0 for false, 1 for true. If either of the two
    +///    lower double-precision values is NaN, 0 is returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __b.
    +/// \param __b
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __a.
    +/// \returns An integer containing the comparison results. If either of 
the two
    +///     lower double-precision values is NaN, 0 is returned.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_comigt_sd(__m128d __a, __m128d __b)
    +{
    +  return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares the lower double-precision floating-point values in each of
    +///    the two 128-bit floating-point vectors of [2 x double] to determine 
if
    +///    the value in the first parameter is greater than or equal to the
    +///    corresponding value in the second parameter.
    +///
    +///    The comparison yields 0 for false, 1 for true. If either of the two
    +///    lower double-precision values is NaN, 0 is returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __b.
    +/// \param __b
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __a.
    +/// \returns An integer containing the comparison results. If either of 
the two
    +///    lower double-precision values is NaN, 0 is returned.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_comige_sd(__m128d __a, __m128d __b)
    +{
    +  return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares the lower double-precision floating-point values in each of
    +///    the two 128-bit floating-point vectors of [2 x double] to determine 
if
    +///    the value in the first parameter is unequal to the corresponding 
value in
    +///    the second parameter.
    +///
    +///    The comparison yields 0 for false, 1 for true. If either of the two
    +///    lower double-precision values is NaN, 1 is returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __b.
    +/// \param __b
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __a.
    +/// \returns An integer containing the comparison results. If either of 
the two
    +///     lower double-precision values is NaN, 1 is returned.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_comineq_sd(__m128d __a, __m128d __b)
    +{
    +  return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares the lower double-precision floating-point values in each of
    +///    the two 128-bit floating-point vectors of [2 x double] for 
equality. The
    +///    comparison yields 0 for false, 1 for true.
    +///
    +///    If either of the two lower double-precision values is NaN, 0 is 
returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __b.
    +/// \param __b
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __a.
    +/// \returns An integer containing the comparison results. If either of 
the two
    +///    lower double-precision values is NaN, 0 is returned.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_ucomieq_sd(__m128d __a, __m128d __b)
    +{
    +  return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares the lower double-precision floating-point values in each of
    +///    the two 128-bit floating-point vectors of [2 x double] to determine 
if
    +///    the value in the first parameter is less than the corresponding 
value in
    +///    the second parameter.
    +///
    +///    The comparison yields 0 for false, 1 for true. If either of the two 
lower
    +///    double-precision values is NaN, 0 is returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __b.
    +/// \param __b
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __a.
    +/// \returns An integer containing the comparison results. If either of 
the two
    +///    lower double-precision values is NaN, 0 is returned.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_ucomilt_sd(__m128d __a, __m128d __b)
    +{
    +  return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares the lower double-precision floating-point values in each of
    +///    the two 128-bit floating-point vectors of [2 x double] to determine 
if
    +///    the value in the first parameter is less than or equal to the
    +///    corresponding value in the second parameter.
    +///
    +///    The comparison yields 0 for false, 1 for true. If either of the two 
lower
    +///    double-precision values is NaN, 0 is returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __b.
    +/// \param __b
    +///     A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///     compared to the lower double-precision value of \a __a.
    +/// \returns An integer containing the comparison results. If either of 
the two
    +///     lower double-precision values is NaN, 0 is returned.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_ucomile_sd(__m128d __a, __m128d __b)
    +{
    +  return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares the lower double-precision floating-point values in each of
    +///    the two 128-bit floating-point vectors of [2 x double] to determine 
if
    +///    the value in the first parameter is greater than the corresponding 
value
    +///    in the second parameter.
    +///
    +///    The comparison yields 0 for false, 1 for true. If either of the two 
lower
    +///    double-precision values is NaN, 0 is returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __b.
    +/// \param __b
    +///     A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///     compared to the lower double-precision value of \a __a.
    +/// \returns An integer containing the comparison results. If either of 
the two
    +///     lower double-precision values is NaN, 0 is returned.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_ucomigt_sd(__m128d __a, __m128d __b)
    +{
    +  return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares the lower double-precision floating-point values in each of
    +///    the two 128-bit floating-point vectors of [2 x double] to determine 
if
    +///    the value in the first parameter is greater than or equal to the
    +///    corresponding value in the second parameter.
    +///
    +///    The comparison yields 0 for false, 1 for true.  If either of the two
    +///    lower double-precision values is NaN, 0 is returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __b.
    +/// \param __b
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __a.
    +/// \returns An integer containing the comparison results. If either of 
the two
    +///    lower double-precision values is NaN, 0 is returned.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_ucomige_sd(__m128d __a, __m128d __b)
    +{
    +  return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Compares the lower double-precision floating-point values in each of
    +///    the two 128-bit floating-point vectors of [2 x double] to determine 
if
    +///    the value in the first parameter is unequal to the corresponding 
value in
    +///    the second parameter.
    +///
    +///    The comparison yields 0 for false, 1 for true. If either of the two 
lower
    +///    double-precision values is NaN, 1 is returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __b.
    +/// \param __b
    +///    A 128-bit vector of [2 x double]. The lower double-precision value 
is
    +///    compared to the lower double-precision value of \a __a.
    +/// \returns An integer containing the comparison result. If either of the 
two
    +///    lower double-precision values is NaN, 1 is returned.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_ucomineq_sd(__m128d __a, __m128d __b)
    +{
    +  return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
    +}
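
    For illustration only (not part of the patch): a minimal sketch of the
    ordered/unordered scalar comparisons above, assuming the library is
    enabled in menuconfig and the header is pulled in as <emmintrin.h>.

        #include <emmintrin.h>
        #include <math.h>
        #include <stdio.h>

        int main(void)
        {
                __m128d one = _mm_set_sd(1.0);
                __m128d nan = _mm_set_sd(NAN);  /* low lane is NaN */

                printf("%d\n", _mm_ucomieq_sd(one, one));  /* 1: equal       */
                printf("%d\n", _mm_ucomieq_sd(one, nan));  /* 0: NaN operand */
                printf("%d\n", _mm_ucomineq_sd(one, nan)); /* 1: NaN operand */
                return 0;
        }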
    +
    +/// Converts the two double-precision floating-point elements of a
    +///    128-bit vector of [2 x double] into two single-precision 
floating-point
    +///    values, returned in the lower 64 bits of a 128-bit vector of [4 x 
float].
    +///    The upper 64 bits of the result vector are set to zero.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double].
    +/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain 
the
    +///    converted values. The upper 64 bits are set to zero.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cvtpd_ps(__m128d __a)
    +{
    +  return __builtin_ia32_cvtpd2ps((__v2df)__a);
    +}
    +
    +/// Converts the lower two single-precision floating-point elements of a
    +///    128-bit vector of [4 x float] into two double-precision 
floating-point
    +///    values, returned in a 128-bit vector of [2 x double]. The upper two
    +///    elements of the input vector are unused.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. The lower two single-precision
    +///    floating-point elements are converted to double-precision values. 
The
    +///    upper two elements are unused.
    +/// \returns A 128-bit vector of [2 x double] containing the converted 
values.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cvtps_pd(__m128 __a)
    +{
    +#ifdef __GNUC__
    +  return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __a);
    +#else
    +  return (__m128d) __builtin_convertvector(
    +      __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
    +#endif
    +}
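
    As a quick, non-authoritative sketch, the two conversions above can
    round-trip the low pair of lanes:

        #include <emmintrin.h>

        /* Narrow [2 x double] to the low half of [4 x float], then widen the
         * low two floats back; the upper half of the intermediate is zero. */
        static __m128d narrow_then_widen(__m128d v)
        {
                __m128 s = _mm_cvtpd_ps(v);
                return _mm_cvtps_pd(s);
        }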
    +
    +/// Converts the lower two integer elements of a 128-bit vector of
    +///    [4 x i32] into two double-precision floating-point values, returned 
in a
    +///    128-bit vector of [2 x double].
    +///
    +///    The upper two elements of the input vector are unused.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector of [4 x i32]. The lower two integer 
elements are
    +///    converted to double-precision values.
    +///
    +///    The upper two elements are unused.
    +/// \returns A 128-bit vector of [2 x double] containing the converted 
values.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cvtepi32_pd(__m128i __a)
    +{
    +#ifdef __GNUC__
    +  return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __a);
    +#else
    +  return (__m128d) __builtin_convertvector(
    +      __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
    +#endif
    +}
    +
    +/// Converts the two double-precision floating-point elements of a
    +///    128-bit vector of [2 x double] into two signed 32-bit integer 
values,
    +///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The 
upper
    +///    64 bits of the result vector are set to zero.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double].
    +/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
    +///    converted values. The upper 64 bits are set to zero.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cvtpd_epi32(__m128d __a)
    +{
    +#ifdef __GNUC__
    +  return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __a);
    +#else
    +  return __builtin_ia32_cvtpd2dq((__v2df)__a);
    +#endif
    +}
    +
    +/// Converts the low-order element of a 128-bit vector of [2 x double]
    +///    into a 32-bit signed integer value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
    +///    conversion.
    +/// \returns A 32-bit signed integer containing the converted value.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_cvtsd_si32(__m128d __a)
    +{
    +  return __builtin_ia32_cvtsd2si((__v2df)__a);
    +}
    +
    +/// Converts the lower double-precision floating-point element of a
    +///    128-bit vector of [2 x double], in the second parameter, into a
    +///    single-precision floating-point value, returned in the lower 32 
bits of a
    +///    128-bit vector of [4 x float]. The upper 96 bits of the result 
vector are
    +///    copied from the upper 96 bits of the first parameter.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. The upper 96 bits of this 
parameter are
    +///    copied to the upper 96 bits of the result.
    +/// \param __b
    +///    A 128-bit vector of [2 x double]. The lower double-precision
    +///    floating-point element is used in the conversion.
    +/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
    +///    converted value from the second parameter. The upper 96 bits are 
copied
    +///    from the upper 96 bits of the first parameter.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cvtsd_ss(__m128 __a, __m128d __b)
    +{
    +  return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
    +}
    +
    +/// Converts a 32-bit signed integer value, in the second parameter, into
    +///    a double-precision floating-point value, returned in the lower 64 
bits of
    +///    a 128-bit vector of [2 x double]. The upper 64 bits of the result 
vector
    +///    are copied from the upper 64 bits of the first parameter.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The upper 64 bits of this 
parameter are
    +///    copied to the upper 64 bits of the result.
    +/// \param __b
    +///    A 32-bit signed integer containing the value to be converted.
    +/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain 
the
    +///    converted value from the second parameter. The upper 64 bits are 
copied
    +///    from the upper 64 bits of the first parameter.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cvtsi32_sd(__m128d __a, int __b)
    +{
    +  __a[0] = __b;
    +  return __a;
    +}
    +
    +/// Converts the lower single-precision floating-point element of a
    +///    128-bit vector of [4 x float], in the second parameter, into a
    +///    double-precision floating-point value, returned in the lower 64 
bits of
    +///    a 128-bit vector of [2 x double]. The upper 64 bits of the result 
vector
    +///    are copied from the upper 64 bits of the first parameter.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The upper 64 bits of this 
parameter are
    +///    copied to the upper 64 bits of the result.
    +/// \param __b
    +///    A 128-bit vector of [4 x float]. The lower single-precision
    +///    floating-point element is used in the conversion.
    +/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain 
the
    +///    converted value from the second parameter. The upper 64 bits are 
copied
    +///    from the upper 64 bits of the first parameter.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cvtss_sd(__m128d __a, __m128 __b)
    +{
    +  __a[0] = __b[0];
    +  return __a;
    +}
    +
    +/// Converts the two double-precision floating-point elements of a
    +///    128-bit vector of [2 x double] into two signed 32-bit integer 
values,
    +///    returned in the lower 64 bits of a 128-bit vector of [4 x i32].
    +///
    +///    If the result of either conversion is inexact, the result is 
truncated
    +///    (rounded towards zero) regardless of the current MXCSR setting. The 
upper
    +///    64 bits of the result vector are set to zero.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double].
    +/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
    +///    converted values. The upper 64 bits are set to zero.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cvttpd_epi32(__m128d __a)
    +{
    +  return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
    +}
    +
    +/// Converts the low-order element of a [2 x double] vector into a 32-bit
    +///    signed integer value, truncating the result when it is inexact.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
    +///    conversion.
    +/// \returns A 32-bit signed integer containing the converted value.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_cvttsd_si32(__m128d __a)
    +{
    +  return __builtin_ia32_cvttsd2si((__v2df)__a);
    +}
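
    A small illustrative example of the difference between the rounding and
    truncating conversions, assuming the default MXCSR round-to-nearest mode:

        #include <emmintrin.h>
        #include <stdio.h>

        int main(void)
        {
                __m128d v = _mm_set_sd(2.7);

                /* _mm_cvtsd_si32 honours MXCSR (prints 3 with the default
                 * round-to-nearest mode); _mm_cvttsd_si32 truncates (2). */
                printf("%d %d\n", _mm_cvtsd_si32(v), _mm_cvttsd_si32(v));
                return 0;
        }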
    +
    +/// Converts the two double-precision floating-point elements of a
    +///    128-bit vector of [2 x double] into two signed 32-bit integer 
values,
    +///    returned in a 64-bit vector of [2 x i32].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double].
    +/// \returns A 64-bit vector of [2 x i32] containing the converted values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_cvtpd_pi32(__m128d __a)
    +{
    +  return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
    +}
    +
    +/// Converts the two double-precision floating-point elements of a
    +///    128-bit vector of [2 x double] into two signed 32-bit integer 
values,
    +///    returned in a 64-bit vector of [2 x i32].
    +///
    +///    If the result of either conversion is inexact, the result is 
truncated
    +///    (rounded towards zero) regardless of the current MXCSR setting.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double].
    +/// \returns A 64-bit vector of [2 x i32] containing the converted values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_cvttpd_pi32(__m128d __a)
    +{
    +  return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
    +}
    +
    +/// Converts the two signed 32-bit integer elements of a 64-bit vector of
    +///    [2 x i32] into two double-precision floating-point values, returned 
in a
    +///    128-bit vector of [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
    +///
    +/// \param __a
    +///    A 64-bit vector of [2 x i32].
    +/// \returns A 128-bit vector of [2 x double] containing the converted 
values.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX
    +_mm_cvtpi32_pd(__m64 __a)
    +{
    +  return __builtin_ia32_cvtpi2pd((__v2si)__a);
    +}
    +
    +/// Returns the low-order element of a 128-bit vector of [2 x double] as
    +///    a double-precision floating-point value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower 64 bits are returned.
    +/// \returns A double-precision floating-point value copied from the lower 
64
    +///    bits of \a __a.
    +static __inline__ double __DEFAULT_FN_ATTRS
    +_mm_cvtsd_f64(__m128d __a)
    +{
    +  return __a[0];
    +}
    +
    +/// Loads a 128-bit floating-point vector of [2 x double] from an aligned
    +///    memory location.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> 
instruction.
    +///
    +/// \param __dp
    +///    A pointer to a 128-bit memory location. The address of the memory
    +///    location has to be 16-byte aligned.
    +/// \returns A 128-bit vector of [2 x double] containing the loaded values.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_load_pd(double const *__dp)
    +{
    +  return *(__m128d*)__dp;
    +}
    +
    +/// Loads a double-precision floating-point value from a specified memory
    +///    location and duplicates it to both vector elements of a 128-bit 
vector of
    +///    [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> 
instruction.
    +///
    +/// \param __dp
    +///    A pointer to a memory location containing a double-precision value.
    +/// \returns A 128-bit vector of [2 x double] containing the loaded and
    +///    duplicated values.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_load1_pd(double const *__dp)
    +{
    +  struct __mm_load1_pd_struct {
    +    double __u;
    +  } __attribute__((__packed__, __may_alias__));
    +  double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
    +  return __extension__ (__m128d){ __u, __u };
    +}
    +
    +#define        _mm_load_pd1(dp)        _mm_load1_pd(dp)
    +
    +/// Loads two double-precision values, in reverse order, from an aligned
    +///    memory location into a 128-bit vector of [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> 
instruction +
    +/// needed shuffling instructions. In AVX mode, the shuffling may be 
combined
    +/// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
    +///
    +/// \param __dp
    +///    A 16-byte aligned pointer to an array of double-precision values to 
be
    +///    loaded in reverse order.
    +/// \returns A 128-bit vector of [2 x double] containing the reversed 
loaded
    +///    values.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_loadr_pd(double const *__dp)
    +{
    +#ifdef __GNUC__
    +  __m128d __tmp = _mm_load_pd (__dp);
    +  return __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1));
    +#else
    +  __m128d __u = *(__m128d*)__dp;
    +  return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
    +#endif
    +}
    +
    +/// Loads a 128-bit floating-point vector of [2 x double] from an
    +///    unaligned memory location.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> 
instruction.
    +///
    +/// \param __dp
    +///    A pointer to a 128-bit memory location. The address of the memory
    +///    location does not have to be aligned.
    +/// \returns A 128-bit vector of [2 x double] containing the loaded values.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_loadu_pd(double const *__dp)
    +{
    +  struct __loadu_pd {
    +    __m128d __v;
    +  } __attribute__((__packed__, __may_alias__));
    +  return ((struct __loadu_pd*)__dp)->__v;
    +}
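
    For illustration (not part of the patch), the unaligned load combines
    with the scalar store/extract helpers defined further down in this
    header:

        #include <emmintrin.h>

        /* Sum two doubles from memory of unknown alignment. */
        static double sum2(const double *p)
        {
                __m128d v = _mm_loadu_pd(p);   /* no alignment requirement */
                double hi;

                _mm_storeh_pd(&hi, v);         /* element 1 */
                return _mm_cvtsd_f64(v) + hi;  /* element 0 + element 1 */
        }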
    +
    +/// Loads a 64-bit integer value to the low element of a 128-bit integer
    +///    vector and clears the upper element.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
    +///
    +/// \param __a
    +///    A pointer to a 64-bit memory location. The address of the memory
    +///    location does not have to be aligned.
    +/// \returns A 128-bit vector of [2 x i64] containing the loaded value.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_loadu_si64(void const *__a)
    +{
    +  struct __loadu_si64 {
    +    long long __v;
    +  } __attribute__((__packed__, __may_alias__));
    +  long long __u = ((struct __loadu_si64*)__a)->__v;
    +  return __extension__ (__m128i)(__v2di){__u, 0L};
    +}
    +
    +/// Loads a 64-bit double-precision value to the low element of a
    +///    128-bit vector of [2 x double] and clears the upper element.

    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
    +///
    +/// \param __dp
    +///    A pointer to a memory location containing a double-precision value.
    +///    The address of the memory location does not have to be aligned.
    +/// \returns A 128-bit vector of [2 x double] containing the loaded value.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_load_sd(double const *__dp)
    +{
    +  struct __mm_load_sd_struct {
    +    double __u;
    +  } __attribute__((__packed__, __may_alias__));
    +  double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
    +  return __extension__ (__m128d){ __u, 0 };
    +}
    +
    +/// Loads a double-precision value into the high-order bits of a 128-bit
    +///    vector of [2 x double]. The low-order bits are copied from the 
low-order
    +///    bits of the first operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. \n
    +///    Bits [63:0] are written to bits [63:0] of the result.
    +/// \param __dp
    +///    A pointer to a 64-bit memory location containing a double-precision
    +///    floating-point value that is loaded. The loaded value is written to 
bits
    +///    [127:64] of the result. The address of the memory location does not 
have
    +///    to be aligned.
    +/// \returns A 128-bit vector of [2 x double] containing the moved values.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_loadh_pd(__m128d __a, double const *__dp)
    +{
    +  struct __mm_loadh_pd_struct {
    +    double __u;
    +  } __attribute__((__packed__, __may_alias__));
    +  double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
    +  return __extension__ (__m128d){ __a[0], __u };
    +}
    +
    +/// Loads a double-precision value into the low-order bits of a 128-bit
    +///    vector of [2 x double]. The high-order bits are copied from the
    +///    high-order bits of the first operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. \n
    +///    Bits [127:64] are written to bits [127:64] of the result.
    +/// \param __dp
    +///    A pointer to a 64-bit memory location containing a double-precision
    +///    floating-point value that is loaded. The loaded value is written to 
bits
    +///    [63:0] of the result. The address of the memory location does not 
have to
    +///    be aligned.
    +/// \returns A 128-bit vector of [2 x double] containing the moved values.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_loadl_pd(__m128d __a, double const *__dp)
    +{
    +  struct __mm_loadl_pd_struct {
    +    double __u;
    +  } __attribute__((__packed__, __may_alias__));
    +  double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
    +  return __extension__ (__m128d){ __u, __a[1] };
    +}
    +
    +/// Constructs a 128-bit floating-point vector of [2 x double] with
    +///    unspecified content. This could be used as an argument to another
    +///    intrinsic function where the argument is required but the value is 
not
    +///    actually used.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \returns A 128-bit floating-point vector of [2 x double] with 
unspecified
    +///    content.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_undefined_pd(void)
    +{
    +#ifdef __GNUC__
    +  __m128d __X = __X;
    +  return __X;
    +#else
    +  return (__m128d)__builtin_ia32_undef128();
    +#endif
    +}
    +
    +/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
    +///    64 bits of the vector are initialized with the specified 
double-precision
    +///    floating-point value. The upper 64 bits are set to zero.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
    +///
    +/// \param __w
    +///    A double-precision floating-point value used to initialize the 
lower 64
    +///    bits of the result.
    +/// \returns An initialized 128-bit floating-point vector of [2 x double]. 
The
    +///    lower 64 bits contain the value of the parameter. The upper 64 bits 
are
    +///    set to zero.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_set_sd(double __w)
    +{
    +  return __extension__ (__m128d){ __w, 0 };
    +}
    +
    +/// Constructs a 128-bit floating-point vector of [2 x double], with each
    +///    of the two double-precision floating-point vector elements set to 
the
    +///    specified double-precision floating-point value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> 
instruction.
    +///
    +/// \param __w
    +///    A double-precision floating-point value used to initialize each 
vector
    +///    element of the result.
    +/// \returns An initialized 128-bit floating-point vector of [2 x double].
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_set1_pd(double __w)
    +{
    +  return __extension__ (__m128d){ __w, __w };
    +}
    +
    +/// Constructs a 128-bit floating-point vector of [2 x double], with each
    +///    of the two double-precision floating-point vector elements set to 
the
    +///    specified double-precision floating-point value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> 
instruction.
    +///
    +/// \param __w
    +///    A double-precision floating-point value used to initialize each 
vector
    +///    element of the result.
    +/// \returns An initialized 128-bit floating-point vector of [2 x double].
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_set_pd1(double __w)
    +{
    +  return _mm_set1_pd(__w);
    +}
    +
    +/// Constructs a 128-bit floating-point vector of [2 x double]
    +///    initialized with the specified double-precision floating-point 
values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> 
instruction.
    +///
    +/// \param __w
    +///    A double-precision floating-point value used to initialize the 
upper 64
    +///    bits of the result.
    +/// \param __x
    +///    A double-precision floating-point value used to initialize the 
lower 64
    +///    bits of the result.
    +/// \returns An initialized 128-bit floating-point vector of [2 x double].
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_set_pd(double __w, double __x)
    +{
    +  return __extension__ (__m128d){ __x, __w };
    +}
    +
    +/// Constructs a 128-bit floating-point vector of [2 x double],
    +///    initialized in reverse order with the specified double-precision
    +///    floating-point values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> 
instruction.
    +///
    +/// \param __w
    +///    A double-precision floating-point value used to initialize the 
lower 64
    +///    bits of the result.
    +/// \param __x
    +///    A double-precision floating-point value used to initialize the 
upper 64
    +///    bits of the result.
    +/// \returns An initialized 128-bit floating-point vector of [2 x double].
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_setr_pd(double __w, double __x)
    +{
    +  return __extension__ (__m128d){ __w, __x };
    +}
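
    A hedged note on argument order, sketch only: _mm_set_pd takes
    (high, low) while _mm_setr_pd takes (low, high), so both calls below
    build the same vector.

        #include <emmintrin.h>

        static void same_vector(__m128d *out)
        {
                /* Both vectors have element 0 == 1.0 and element 1 == 2.0. */
                out[0] = _mm_set_pd(2.0, 1.0);   /* (high, low) */
                out[1] = _mm_setr_pd(1.0, 2.0);  /* (low, high) */
        }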
    +
    +/// Constructs a 128-bit floating-point vector of [2 x double]
    +///    initialized to zero.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
    +///
    +/// \returns An initialized 128-bit floating-point vector of [2 x double] 
with
    +///    all elements set to zero.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_setzero_pd(void)
    +{
    +  return __extension__ (__m128d){ 0, 0 };
    +}
    +
    +/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
    +///    64 bits are set to the lower 64 bits of the second parameter. The 
upper
    +///    64 bits are set to the upper 64 bits of the first parameter.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The upper 64 bits are written to 
the
    +///    upper 64 bits of the result.
    +/// \param __b
    +///    A 128-bit vector of [2 x double]. The lower 64 bits are written to 
the
    +///    lower 64 bits of the result.
    +/// \returns A 128-bit vector of [2 x double] containing the moved values.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_move_sd(__m128d __a, __m128d __b)
    +{
    +  __a[0] = __b[0];
    +  return __a;
    +}
    +
    +/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
    +///    memory location.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
    +///
    +/// \param __dp
    +///    A pointer to a 64-bit memory location.
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing the value to be stored.
    +static __inline__ void __DEFAULT_FN_ATTRS
    +_mm_store_sd(double *__dp, __m128d __a)
    +{
    +  struct __mm_store_sd_struct {
    +    double __u;
    +  } __attribute__((__packed__, __may_alias__));
    +  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
    +}
    +
    +/// Moves packed double-precision values from a 128-bit vector of
    +///    [2 x double] to a memory location.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
    +///
    +/// \param __dp
    +///    A pointer to an aligned memory location that can store two
    +///    double-precision values.
    +/// \param __a
    +///    A packed 128-bit vector of [2 x double] containing the values to be
    +///    moved.
    +static __inline__ void __DEFAULT_FN_ATTRS
    +_mm_store_pd(double *__dp, __m128d __a)
    +{
    +  *(__m128d*)__dp = __a;
    +}
    +
    +/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
    +///    the upper and lower 64 bits of a memory location.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the
    +///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
    +///
    +/// \param __dp
    +///    A pointer to a memory location that can store two double-precision
    +///    values.
    +/// \param __a
    +///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to 
each
    +///    of the values in \a __dp.
    +static __inline__ void __DEFAULT_FN_ATTRS
    +_mm_store1_pd(double *__dp, __m128d __a)
    +{
    +#ifdef __GNUC__
    +  _mm_store_pd (__dp, __builtin_ia32_shufpd (__a, __a, _MM_SHUFFLE2 
(0,0)));
    +#else
    +  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
    +  _mm_store_pd(__dp, __a);
    +#endif
    +}
    +
    +/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
    +///    the upper and lower 64 bits of a memory location.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the
    +///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
    +///
    +/// \param __dp
    +///    A pointer to a memory location that can store two double-precision
    +///    values.
    +/// \param __a
    +///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to 
each
    +///    of the values in \a __dp.
    +static __inline__ void __DEFAULT_FN_ATTRS
    +_mm_store_pd1(double *__dp, __m128d __a)
    +{
    +  _mm_store1_pd(__dp, __a);
    +}
    +
    +/// Stores a 128-bit vector of [2 x double] into an unaligned memory
    +///    location.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> 
instruction.
    +///
    +/// \param __dp
    +///    A pointer to a 128-bit memory location. The address of the memory
    +///    location does not have to be aligned.
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing the values to be stored.
    +static __inline__ void __DEFAULT_FN_ATTRS
    +_mm_storeu_pd(double *__dp, __m128d __a)
    +{
    +  struct __storeu_pd {
    +    __m128d __v;
    +  } __attribute__((__packed__, __may_alias__));
    +  ((struct __storeu_pd*)__dp)->__v = __a;
    +}
    +
    +/// Stores two double-precision values, in reverse order, from a 128-bit
    +///    vector of [2 x double] to a 16-byte aligned memory location.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to a shuffling instruction followed by a
    +/// <c> VMOVAPD / MOVAPD </c> instruction.
    +///
    +/// \param __dp
    +///    A pointer to a 16-byte aligned memory location that can store two
    +///    double-precision values.
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing the values to be 
reversed and
    +///    stored.
    +static __inline__ void __DEFAULT_FN_ATTRS
    +_mm_storer_pd(double *__dp, __m128d __a)
    +{
    +#ifdef __GNUC__
    +  _mm_store_pd (__dp, __builtin_ia32_shufpd (__a, __a, _MM_SHUFFLE2 
(0,1)));
    +#else
    +  __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
    +  *(__m128d *)__dp = __a;
    +#endif
    +}
    +
    +/// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
    +///    memory location.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> 
instruction.
    +///
    +/// \param __dp
    +///    A pointer to a 64-bit memory location.
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing the value to be stored.
    +static __inline__ void __DEFAULT_FN_ATTRS
    +_mm_storeh_pd(double *__dp, __m128d __a)
    +{
    +  struct __mm_storeh_pd_struct {
    +    double __u;
    +  } __attribute__((__packed__, __may_alias__));
    +  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
    +}
    +
    +/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
    +///    memory location.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> 
instruction.
    +///
    +/// \param __dp
    +///    A pointer to a 64-bit memory location.
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing the value to be stored.
    +static __inline__ void __DEFAULT_FN_ATTRS
    +_mm_storel_pd(double *__dp, __m128d __a)
    +{
    +  struct __mm_storeh_pd_struct {
    +    double __u;
    +  } __attribute__((__packed__, __may_alias__));
    +  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
    +}
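
    Illustrative only: the two scalar stores above split a vector into its
    halves.

        #include <emmintrin.h>

        static void split_pd(__m128d v, double *lo, double *hi)
        {
                _mm_storel_pd(lo, v);   /* bits [63:0]   */
                _mm_storeh_pd(hi, v);   /* bits [127:64] */
        }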
    +
    +/// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
    +///    saving the lower 8 bits of each sum in the corresponding element of 
a
    +///    128-bit result vector of [16 x i8].
    +///
    +///    The integer elements of both parameters can be either signed or 
unsigned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [16 x i8].
    +/// \param __b
    +///    A 128-bit vector of [16 x i8].
    +/// \returns A 128-bit vector of [16 x i8] containing the sums of both
    +///    parameters.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_add_epi8(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)((__v16qu)__a + (__v16qu)__b);
    +}
    +
    +/// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
    +///    saving the lower 16 bits of each sum in the corresponding element 
of a
    +///    128-bit result vector of [8 x i16].
    +///
    +///    The integer elements of both parameters can be either signed or 
unsigned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [8 x i16].
    +/// \param __b
    +///    A 128-bit vector of [8 x i16].
    +/// \returns A 128-bit vector of [8 x i16] containing the sums of both
    +///    parameters.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_add_epi16(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)((__v8hu)__a + (__v8hu)__b);
    +}
    +
    +/// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
    +///    saving the lower 32 bits of each sum in the corresponding element 
of a
    +///    128-bit result vector of [4 x i32].
    +///
    +///    The integer elements of both parameters can be either signed or 
unsigned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x i32].
    +/// \param __b
    +///    A 128-bit vector of [4 x i32].
    +/// \returns A 128-bit vector of [4 x i32] containing the sums of both
    +///    parameters.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_add_epi32(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)((__v4su)__a + (__v4su)__b);
    +}
    +
    +/// Adds two signed or unsigned 64-bit integer values, returning the
    +///    lower 64 bits of the sum.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PADDQ </c> instruction.
    +///
    +/// \param __a
    +///    A 64-bit integer.
    +/// \param __b
    +///    A 64-bit integer.
    +/// \returns A 64-bit integer containing the sum of both parameters.
    +#ifndef __GNUC__
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_add_si64(__m64 __a, __m64 __b)
    +{
    +  return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
    +}
    +#endif
    +
    +/// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
    +///    saving the lower 64 bits of each sum in the corresponding element 
of a
    +///    128-bit result vector of [2 x i64].
    +///
    +///    The integer elements of both parameters can be either signed or 
unsigned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x i64].
    +/// \param __b
    +///    A 128-bit vector of [2 x i64].
    +/// \returns A 128-bit vector of [2 x i64] containing the sums of both
    +///    parameters.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_add_epi64(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)((__v2du)__a + (__v2du)__b);
    +}
    +
    +/// Adds, with saturation, the corresponding elements of two 128-bit
    +///    signed [16 x i8] vectors, saving each sum in the corresponding 
element of
    +///    a 128-bit result vector of [16 x i8]. Positive sums greater than 
0x7F are
    +///    saturated to 0x7F. Negative sums less than 0x80 are saturated to 
0x80.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit signed [16 x i8] vector.
    +/// \param __b
    +///    A 128-bit signed [16 x i8] vector.
    +/// \returns A 128-bit signed [16 x i8] vector containing the saturated 
sums of
    +///    both parameters.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_adds_epi8(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
    +}
    +
    +/// Adds, with saturation, the corresponding elements of two 128-bit
    +///    signed [8 x i16] vectors, saving each sum in the corresponding 
element of
    +///    a 128-bit result vector of [8 x i16]. Positive sums greater than 
0x7FFF
    +///    are saturated to 0x7FFF. Negative sums less than 0x8000 are 
saturated to
    +///    0x8000.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit signed [8 x i16] vector.
    +/// \param __b
    +///    A 128-bit signed [8 x i16] vector.
    +/// \returns A 128-bit signed [8 x i16] vector containing the saturated 
sums of
    +///    both parameters.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_adds_epi16(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
    +}
    +
    +/// Adds, with saturation, the corresponding elements of two 128-bit
    +///    unsigned [16 x i8] vectors, saving each sum in the corresponding 
element
    +///    of a 128-bit result vector of [16 x i8]. Positive sums greater than 
0xFF
    +///    are saturated to 0xFF. Negative sums are saturated to 0x00.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit unsigned [16 x i8] vector.
    +/// \param __b
    +///    A 128-bit unsigned [16 x i8] vector.
    +/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated 
sums
    +///    of both parameters.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_adds_epu8(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
    +}
    +
    +/// Adds, with saturation, the corresponding elements of two 128-bit
    +///    unsigned [8 x i16] vectors, saving each sum in the corresponding 
element
    +///    of a 128-bit result vector of [8 x i16]. Positive sums greater than
    +///    0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 
0x0000.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit unsigned [8 x i16] vector.
    +/// \param __b
    +///    A 128-bit unsigned [8 x i16] vector.
    +/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated 
sums
    +///    of both parameters.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_adds_epu16(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
    +}
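
    A minimal sketch (not part of the patch) contrasting wrapping and
    saturating byte addition; _mm_set1_epi8 is defined elsewhere in this
    header.

        #include <emmintrin.h>

        static void add_vs_adds(__m128i *wrapped, __m128i *saturated)
        {
                __m128i max = _mm_set1_epi8(0x7F);
                __m128i one = _mm_set1_epi8(1);

                *wrapped   = _mm_add_epi8(max, one);  /* every lane 0x80 */
                *saturated = _mm_adds_epi8(max, one); /* every lane 0x7F */
        }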
    +
    +/// Computes the rounded averages of corresponding elements of two
    +///    128-bit unsigned [16 x i8] vectors, saving each result in the
    +///    corresponding element of a 128-bit result vector of [16 x i8].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit unsigned [16 x i8] vector.
    +/// \param __b
    +///    A 128-bit unsigned [16 x i8] vector.
    +/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
    +///    averages of both parameters.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_avg_epu8(__m128i __a, __m128i __b)
    +{
    +  typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
    +  return (__m128i)__builtin_convertvector(
    +               ((__builtin_convertvector((__v16qu)__a, __v16hu) +
    +                 __builtin_convertvector((__v16qu)__b, __v16hu)) + 1)
    +                 >> 1, __v16qu);
    +}
    +
    +/// Computes the rounded averages of corresponding elements of two
    +///    128-bit unsigned [8 x i16] vectors, saving each result in the
    +///    corresponding element of a 128-bit result vector of [8 x i16].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit unsigned [8 x i16] vector.
    +/// \param __b
    +///    A 128-bit unsigned [8 x i16] vector.
    +/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
    +///    averages of both parameters.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_avg_epu16(__m128i __a, __m128i __b)
    +{
    +  typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
    +  return (__m128i)__builtin_convertvector(
    +               ((__builtin_convertvector((__v8hu)__a, __v8su) +
    +                 __builtin_convertvector((__v8hu)__b, __v8su)) + 1)
    +                 >> 1, __v8hu);
    +}
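
    Sketch only: the rounded average is (a + b + 1) >> 1 per lane;
    _mm_set1_epi16 also comes from this header.

        #include <emmintrin.h>

        static __m128i avg_example(void)
        {
                /* (1 + 2 + 1) >> 1 == 2 in every 16-bit lane. */
                return _mm_avg_epu16(_mm_set1_epi16(1), _mm_set1_epi16(2));
        }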
    +
    +/// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
    +///    vectors, producing eight intermediate 32-bit signed integer 
products, and
    +///    adds the consecutive pairs of 32-bit products to form a 128-bit 
signed
    +///    [4 x i32] vector.
    +///
    +///    For example, bits [15:0] of both parameters are multiplied 
producing a
    +///    32-bit product, bits [31:16] of both parameters are multiplied 
producing
    +///    a 32-bit product, and the sum of those two products becomes bits 
[31:0]
    +///    of the result.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit signed [8 x i16] vector.
    +/// \param __b
    +///    A 128-bit signed [8 x i16] vector.
    +/// \returns A 128-bit signed [4 x i32] vector containing the sums of 
products
    +///    of both parameters.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_madd_epi16(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
    +}
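
    As an illustrative sketch, _mm_madd_epi16 is the usual building block
    for 16-bit integer dot products; _mm_setr_epi16 is defined elsewhere in
    this header.

        #include <emmintrin.h>

        static __m128i madd_example(void)
        {
                __m128i a = _mm_setr_epi16(1, 2, 3, 4, 1, 2, 3, 4);
                __m128i b = _mm_setr_epi16(5, 6, 7, 8, 5, 6, 7, 8);

                /* 32-bit lanes: 1*5 + 2*6 = 17, 3*7 + 4*8 = 53, 17, 53. */
                return _mm_madd_epi16(a, b);
        }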
    +
    +/// Compares corresponding elements of two 128-bit signed [8 x i16]
    +///    vectors, saving the greater value from each comparison in the
    +///    corresponding element of a 128-bit result vector of [8 x i16].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit signed [8 x i16] vector.
    +/// \param __b
    +///    A 128-bit signed [8 x i16] vector.
    +/// \returns A 128-bit signed [8 x i16] vector containing the greater 
value of
    +///    each comparison.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_max_epi16(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
    +}
    +
    +/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
    +///    vectors, saving the greater value from each comparison in the
    +///    corresponding element of a 128-bit result vector of [16 x i8].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit unsigned [16 x i8] vector.
    +/// \param __b
    +///    A 128-bit unsigned [16 x i8] vector.
    +/// \returns A 128-bit unsigned [16 x i8] vector containing the greater 
value of
    +///    each comparison.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_max_epu8(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
    +}
    +
    +/// Compares corresponding elements of two 128-bit signed [8 x i16]
    +///    vectors, saving the smaller value from each comparison in the
    +///    corresponding element of a 128-bit result vector of [8 x i16].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit signed [8 x i16] vector.
    +/// \param __b
    +///    A 128-bit signed [8 x i16] vector.
    +/// \returns A 128-bit signed [8 x i16] vector containing the smaller 
value of
    +///    each comparison.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_min_epi16(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
    +}
    +
    +/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
    +///    vectors, saving the smaller value from each comparison in the
    +///    corresponding element of a 128-bit result vector of [16 x i8].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit unsigned [16 x i8] vector.
    +/// \param __b
    +///    A 128-bit unsigned [16 x i8] vector.
    +/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller 
value of
    +///    each comparison.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_min_epu8(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
    +}
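
    A common use of the min/max pair above is clamping; illustrative sketch
    only, with _mm_set1_epi16 taken from elsewhere in the header.

        #include <emmintrin.h>

        /* Clamp each signed 16-bit lane of v to the range [lo, hi]. */
        static __m128i clamp_epi16(__m128i v, short lo, short hi)
        {
                return _mm_min_epi16(_mm_max_epi16(v, _mm_set1_epi16(lo)),
                                     _mm_set1_epi16(hi));
        }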
    +
    +/// Multiplies the corresponding elements of two signed [8 x i16]
    +///    vectors, saving the upper 16 bits of each 32-bit product in the
    +///    corresponding element of a 128-bit signed [8 x i16] result vector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit signed [8 x i16] vector.
    +/// \param __b
    +///    A 128-bit signed [8 x i16] vector.
    +/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 
bits of
    +///    each of the eight 32-bit products.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_mulhi_epi16(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
    +}
    +
    +/// Multiplies the corresponding elements of two unsigned [8 x i16]
    +///    vectors, saving the upper 16 bits of each 32-bit product in the
    +///    corresponding element of a 128-bit unsigned [8 x i16] result vector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit unsigned [8 x i16] vector.
    +/// \param __b
    +///    A 128-bit unsigned [8 x i16] vector.
    +/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 
bits
    +///    of each of the eight 32-bit products.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_mulhi_epu16(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
    +}
    +
    +/// Multiplies the corresponding elements of two signed [8 x i16]
    +///    vectors, saving the lower 16 bits of each 32-bit product in the
    +///    corresponding element of a 128-bit signed [8 x i16] result vector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit signed [8 x i16] vector.
    +/// \param __b
    +///    A 128-bit signed [8 x i16] vector.
    +/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
    +///    each of the eight 32-bit products.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_mullo_epi16(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)((__v8hu)__a * (__v8hu)__b);
    +}
    +
    +/// Multiplies 32-bit unsigned integer values contained in the lower bits
    +///    of the two 64-bit integer vectors and returns the 64-bit unsigned
    +///    product.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
    +///
    +/// \param __a
    +///    A 64-bit integer containing one of the source operands.
    +/// \param __b
    +///    A 64-bit integer containing one of the source operands.
    +/// \returns A 64-bit integer vector containing the product of both operands.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_mul_su32(__m64 __a, __m64 __b)
    +{
    +#ifdef __GNUC__
    +  return (__m64)__builtin_ia32_pmuludq ((__v2si)__a, (__v2si)__b);
    +#else
    +  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
    +#endif
    +}
    +
    +/// Multiplies 32-bit unsigned integer values contained in the lower
    +///    bits of the corresponding elements of two [2 x i64] vectors, and returns
    +///    the 64-bit products in the corresponding elements of a [2 x i64] vector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
    +///
    +/// \param __a
    +///    A [2 x i64] vector containing one of the source operands.
    +/// \param __b
    +///    A [2 x i64] vector containing one of the source operands.
    +/// \returns A [2 x i64] vector containing the product of both operands.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_mul_epu32(__m128i __a, __m128i __b)
    +{
    +  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
    +}
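    +
    +/// A minimal usage sketch (illustrative values only): only the even-indexed
    +/// 32-bit elements (bits [31:0] and [95:64]) take part in the multiplication.
    +/// \code
    +///   __m128i a    = _mm_set_epi32(9, 3, 9, 2);
    +///   __m128i b    = _mm_set_epi32(9, 5, 9, 4);
    +///   __m128i prod = _mm_mul_epu32(a, b); /* [2 x i64] = { 8, 15 } */
    +/// \endcode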
    +
    +/// Computes the absolute differences of corresponding 8-bit integer
    +///    values in two 128-bit vectors. Sums the first 8 absolute differences, and
    +///    separately sums the second 8 absolute differences. Packs these two
    +///    unsigned 16-bit integer sums into the upper and lower elements of a
    +///    [2 x i64] vector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing one of the source operands.
    +/// \param __b
    +///    A 128-bit integer vector containing one of the source operands.
    +/// \returns A [2 x i64] vector containing the sums of the sets of absolute
    +///    differences between both operands.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_sad_epu8(__m128i __a, __m128i __b)
    +{
    +  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
    +}
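    +
    +/// A minimal usage sketch (illustrative values only): with every byte of one
    +/// operand equal to 3 and every byte of the other equal to 1, each group of
    +/// eight absolute differences sums to 8 * 2 = 16.
    +/// \code
    +///   __m128i a   = _mm_set1_epi8(3);
    +///   __m128i b   = _mm_set1_epi8(1);
    +///   __m128i sad = _mm_sad_epu8(a, b); /* [2 x i64] = { 16, 16 } */
    +/// \endcode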
    +
    +/// Subtracts the corresponding 8-bit integer values in the operands.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the minuends.
    +/// \param __b
    +///    A 128-bit integer vector containing the subtrahends.
    +/// \returns A 128-bit integer vector containing the differences of the values
    +///    in the operands.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_sub_epi8(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)((__v16qu)__a - (__v16qu)__b);
    +}
    +
    +/// Subtracts the corresponding 16-bit integer values in the operands.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the minuends.
    +/// \param __b
    +///    A 128-bit integer vector containing the subtrahends.
    +/// \returns A 128-bit integer vector containing the differences of the values
    +///    in the operands.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_sub_epi16(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)((__v8hu)__a - (__v8hu)__b);
    +}
    +
    +/// Subtracts the corresponding 32-bit integer values in the operands.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the minuends.
    +/// \param __b
    +///    A 128-bit integer vector containing the subtrahends.
    +/// \returns A 128-bit integer vector containing the differences of the values
    +///    in the operands.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_sub_epi32(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)((__v4su)__a - (__v4su)__b);
    +}
    +
    +/// Subtracts signed or unsigned 64-bit integer values and writes the
    +///    difference to the corresponding bits in the destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
    +///
    +/// \param __a
    +///    A 64-bit integer vector containing the minuend.
    +/// \param __b
    +///    A 64-bit integer vector containing the subtrahend.
    +/// \returns A 64-bit integer vector containing the difference of the values in
    +///    the operands.
    +
    +#ifndef __GNUC__
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_sub_si64(__m64 __a, __m64 __b)
    +{
    +  return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
    +}
    +#endif
    +
    +/// Subtracts the corresponding elements of two [2 x i64] vectors.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the minuends.
    +/// \param __b
    +///    A 128-bit integer vector containing the subtrahends.
    +/// \returns A 128-bit integer vector containing the differences of the values
    +///    in the operands.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_sub_epi64(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)((__v2du)__a - (__v2du)__b);
    +}
    +
    +/// Subtracts corresponding 8-bit signed integer values in the input and
    +///    returns the differences in the corresponding bytes in the destination.
    +///    Differences greater than 0x7F are saturated to 0x7F, and differences less
    +///    than 0x80 are saturated to 0x80.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the minuends.
    +/// \param __b
    +///    A 128-bit integer vector containing the subtrahends.
    +/// \returns A 128-bit integer vector containing the differences of the values
    +///    in the operands.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_subs_epi8(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
    +}
    +
    +/// Subtracts corresponding 16-bit signed integer values in the input and
    +///    returns the differences in the corresponding bytes in the destination.
    +///    Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
    +///    than 0x8000 are saturated to 0x8000.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the minuends.
    +/// \param __b
    +///    A 128-bit integer vector containing the subtrahends.
    +/// \returns A 128-bit integer vector containing the differences of the values
    +///    in the operands.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_subs_epi16(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
    +}
    +
    +/// Subtracts corresponding 8-bit unsigned integer values in the input
    +///    and returns the differences in the corresponding bytes in the
    +///    destination. Differences less than 0x00 are saturated to 0x00.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the minuends.
    +/// \param __b
    +///    A 128-bit integer vector containing the subtrahends.
    +/// \returns A 128-bit integer vector containing the unsigned integer
    +///    differences of the values in the operands.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_subs_epu8(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
    +}
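    +
    +/// A minimal usage sketch (illustrative values only): unsigned saturation
    +/// clamps 1 - 2 to 0 instead of wrapping around to 0xFF.
    +/// \code
    +///   __m128i a    = _mm_set1_epi8(1);
    +///   __m128i b    = _mm_set1_epi8(2);
    +///   __m128i diff = _mm_subs_epu8(a, b); /* every byte is 0x00 */
    +/// \endcode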
    +
    +/// Subtracts corresponding 16-bit unsigned integer values in the input
    +///    and returns the differences in the corresponding bytes in the
    +///    destination. Differences less than 0x0000 are saturated to 0x0000.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the minuends.
    +/// \param __b
    +///    A 128-bit integer vector containing the subtrahends.
    +/// \returns A 128-bit integer vector containing the unsigned integer
    +///    differences of the values in the operands.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_subs_epu16(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
    +}
    +
    +/// Performs a bitwise AND of two 128-bit integer vectors.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing one of the source operands.
    +/// \param __b
    +///    A 128-bit integer vector containing one of the source operands.
    +/// \returns A 128-bit integer vector containing the bitwise AND of the values
    +///    in both operands.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_and_si128(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)((__v2du)__a & (__v2du)__b);
    +}
    +
    +/// Performs a bitwise AND of two 128-bit integer vectors, using the
    +///    one's complement of the values contained in the first source operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector containing the left source operand. The one's complement
    +///    of this value is used in the bitwise AND.
    +/// \param __b
    +///    A 128-bit vector containing the right source operand.
    +/// \returns A 128-bit integer vector containing the bitwise AND of the one's
    +///    complement of the first operand and the values in the second operand.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_andnot_si128(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)(~(__v2du)__a & (__v2du)__b);
    +}
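    +
    +/// A minimal usage sketch (illustrative values only): a common use is to
    +/// clear the bits of one vector that are set in a mask.
    +/// \code
    +///   __m128i mask    = _mm_set1_epi32(0x000000FF);
    +///   __m128i value   = _mm_set1_epi32(0x12345678);
    +///   __m128i cleared = _mm_andnot_si128(mask, value); /* 0x12345600 per lane */
    +/// \endcode
    +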
    +/// Performs a bitwise OR of two 128-bit integer vectors.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing one of the source operands.
    +/// \param __b
    +///    A 128-bit integer vector containing one of the source operands.
    +/// \returns A 128-bit integer vector containing the bitwise OR of the values
    +///    in both operands.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_or_si128(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)((__v2du)__a | (__v2du)__b);
    +}
    +
    +/// Performs a bitwise exclusive OR of two 128-bit integer vectors.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing one of the source operands.
    +/// \param __b
    +///    A 128-bit integer vector containing one of the source operands.
    +/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
    +///    values in both operands.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_xor_si128(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)((__v2du)__a ^ (__v2du)__b);
    +}
    +
    +/// Left-shifts the 128-bit integer vector operand by the specified
    +///    number of bytes. Low-order bits are cleared.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128i _mm_slli_si128(__m128i a, const int imm);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
    +///
    +/// \param a
    +///    A 128-bit integer vector containing the source operand.
    +/// \param imm
    +///    An immediate value specifying the number of bytes to left-shift operand
    +///    \a a.
    +/// \returns A 128-bit integer vector containing the left-shifted value.
    +#ifdef __GNUC__
    +#define _mm_slli_si128(a, n) \
    +    ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(a), (int)(n) * 8))
    +#define _mm_bslli_si128(a, n) \
    +    ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(a), (int)(n) * 8))
    +#else
    +#define _mm_slli_si128(a, imm) \
    +  (__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
    +#define _mm_bslli_si128(a, imm) \
    +  (__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
    +#endif
    +
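    +/// A minimal usage sketch (illustrative values only): shifting left by 4
    +/// bytes moves each 32-bit element up one position and zero-fills the bottom.
    +/// \code
    +///   __m128i v = _mm_set_epi32(4, 3, 2, 1);
    +///   __m128i s = _mm_slli_si128(v, 4);     /* lanes become { 0, 1, 2, 3 } */
    +/// \endcode
    +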
    +/// Left-shifts each 16-bit value in the 128-bit integer vector operand
    +///    by the specified number of bits. Low-order bits are cleared.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the source operand.
    +/// \param __count
    +///    An integer value specifying the number of bits to left-shift each value
    +///    in operand \a __a.
    +/// \returns A 128-bit integer vector containing the left-shifted values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_slli_epi16(__m128i __a, int __count)
    +{
    +  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
    +}
    +
    +/// Left-shifts each 16-bit value in the 128-bit integer vector operand
    +///    by the specified number of bits. Low-order bits are cleared.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the source operand.
    +/// \param __count
    +///    A 128-bit integer vector in which bits [63:0] specify the number of bits
    +///    to left-shift each value in operand \a __a.
    +/// \returns A 128-bit integer vector containing the left-shifted values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_sll_epi16(__m128i __a, __m128i __count)
    +{
    +  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
    +}
    +
    +/// Left-shifts each 32-bit value in the 128-bit integer vector operand
    +///    by the specified number of bits. Low-order bits are cleared.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the source operand.
    +/// \param __count
    +///    An integer value specifying the number of bits to left-shift each value
    +///    in operand \a __a.
    +/// \returns A 128-bit integer vector containing the left-shifted values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_slli_epi32(__m128i __a, int __count)
    +{
    +  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
    +}
    +
    +/// Left-shifts each 32-bit value in the 128-bit integer vector operand
    +///    by the specified number of bits. Low-order bits are cleared.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the source operand.
    +/// \param __count
    +///    A 128-bit integer vector in which bits [63:0] specify the number of bits
    +///    to left-shift each value in operand \a __a.
    +/// \returns A 128-bit integer vector containing the left-shifted values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_sll_epi32(__m128i __a, __m128i __count)
    +{
    +  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
    +}
    +
    +/// Left-shifts each 64-bit value in the 128-bit integer vector operand
    +///    by the specified number of bits. Low-order bits are cleared.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the source operand.
    +/// \param __count
    +///    An integer value specifying the number of bits to left-shift each value
    +///    in operand \a __a.
    +/// \returns A 128-bit integer vector containing the left-shifted values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_slli_epi64(__m128i __a, int __count)
    +{
    +  return __builtin_ia32_psllqi128((__v2di)__a, __count);
    +}
    +
    +/// Left-shifts each 64-bit value in the 128-bit integer vector operand
    +///    by the specified number of bits. Low-order bits are cleared.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the source operand.
    +/// \param __count
    +///    A 128-bit integer vector in which bits [63:0] specify the number of bits
    +///    to left-shift each value in operand \a __a.
    +/// \returns A 128-bit integer vector containing the left-shifted values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_sll_epi64(__m128i __a, __m128i __count)
    +{
    +  return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
    +}
    +
    +/// Right-shifts each 16-bit value in the 128-bit integer vector operand
    +///    by the specified number of bits. High-order bits are filled with the sign
    +///    bit of the initial value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the source operand.
    +/// \param __count
    +///    An integer value specifying the number of bits to right-shift each value
    +///    in operand \a __a.
    +/// \returns A 128-bit integer vector containing the right-shifted values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_srai_epi16(__m128i __a, int __count)
    +{
    +  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
    +}
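    +
    +/// A minimal usage sketch (illustrative values only): the arithmetic shift
    +/// preserves the sign, so -8 >> 1 yields -4 in every lane.
    +/// \code
    +///   __m128i v = _mm_set1_epi16(-8);
    +///   __m128i r = _mm_srai_epi16(v, 1);   /* every lane holds -4 */
    +/// \endcode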
    +
    +/// Right-shifts each 16-bit value in the 128-bit integer vector operand
    +///    by the specified number of bits. High-order bits are filled with the sign
    +///    bit of the initial value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the source operand.
    +/// \param __count
    +///    A 128-bit integer vector in which bits [63:0] specify the number of bits
    +///    to right-shift each value in operand \a __a.
    +/// \returns A 128-bit integer vector containing the right-shifted values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_sra_epi16(__m128i __a, __m128i __count)
    +{
    +  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
    +}
    +
    +/// Right-shifts each 32-bit value in the 128-bit integer vector operand
    +///    by the specified number of bits. High-order bits are filled with the sign
    +///    bit of the initial value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the source operand.
    +/// \param __count
    +///    An integer value specifying the number of bits to right-shift each value
    +///    in operand \a __a.
    +/// \returns A 128-bit integer vector containing the right-shifted values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_srai_epi32(__m128i __a, int __count)
    +{
    +  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
    +}
    +
    +/// Right-shifts each 32-bit value in the 128-bit integer vector operand
    +///    by the specified number of bits. High-order bits are filled with the sign
    +///    bit of the initial value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the source operand.
    +/// \param __count
    +///    A 128-bit integer vector in which bits [63:0] specify the number of bits
    +///    to right-shift each value in operand \a __a.
    +/// \returns A 128-bit integer vector containing the right-shifted values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_sra_epi32(__m128i __a, __m128i __count)
    +{
    +  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
    +}
    +
    +/// Right-shifts the 128-bit integer vector operand by the specified
    +///    number of bytes. High-order bits are cleared.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128i _mm_srli_si128(__m128i a, const int imm);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
    +///
    +/// \param a
    +///    A 128-bit integer vector containing the source operand.
    +/// \param imm
    +///    An immediate value specifying the number of bytes to right-shift operand
    +///    \a a.
    +/// \returns A 128-bit integer vector containing the right-shifted value.
    +#ifdef __GNUC__
    +#define _mm_bsrli_si128(a, n) \
    +    ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(a), (int)(n) * 8))
    +#define _mm_srli_si128(a, n) \
    +    ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(a), (int)(n) * 8))
    +#else
    +#define _mm_srli_si128(a, imm) \
    +  (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
    +#define _mm_bsrli_si128(a, imm) \
    +  (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))
    +#endif
    +
    +
    +/// Right-shifts each of 16-bit values in the 128-bit integer vector
    +///    operand by the specified number of bits. High-order bits are cleared.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the source operand.
    +/// \param __count
    +///    An integer value specifying the number of bits to right-shift each value
    +///    in operand \a __a.
    +/// \returns A 128-bit integer vector containing the right-shifted values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_srli_epi16(__m128i __a, int __count)
    +{
    +  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
    +}
    +
    +/// Right-shifts each of 16-bit values in the 128-bit integer vector
    +///    operand by the specified number of bits. High-order bits are cleared.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the source operand.
    +/// \param __count
    +///    A 128-bit integer vector in which bits [63:0] specify the number of bits
    +///    to right-shift each value in operand \a __a.
    +/// \returns A 128-bit integer vector containing the right-shifted values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_srl_epi16(__m128i __a, __m128i __count)
    +{
    +  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
    +}
    +
    +/// Right-shifts each of 32-bit values in the 128-bit integer vector
    +///    operand by the specified number of bits. High-order bits are cleared.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the source operand.
    +/// \param __count
    +///    An integer value specifying the number of bits to right-shift each value
    +///    in operand \a __a.
    +/// \returns A 128-bit integer vector containing the right-shifted values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_srli_epi32(__m128i __a, int __count)
    +{
    +  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
    +}
    +
    +/// Right-shifts each of 32-bit values in the 128-bit integer vector
    +///    operand by the specified number of bits. High-order bits are cleared.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the source operand.
    +/// \param __count
    +///    A 128-bit integer vector in which bits [63:0] specify the number of bits
    +///    to right-shift each value in operand \a __a.
    +/// \returns A 128-bit integer vector containing the right-shifted values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_srl_epi32(__m128i __a, __m128i __count)
    +{
    +  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
    +}
    +
    +/// Right-shifts each of 64-bit values in the 128-bit integer vector
    +///    operand by the specified number of bits. High-order bits are cleared.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the source operand.
    +/// \param __count
    +///    An integer value specifying the number of bits to right-shift each value
    +///    in operand \a __a.
    +/// \returns A 128-bit integer vector containing the right-shifted values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_srli_epi64(__m128i __a, int __count)
    +{
    +  return __builtin_ia32_psrlqi128((__v2di)__a, __count);
    +}
    +
    +/// Right-shifts each of 64-bit values in the 128-bit integer vector
    +///    operand by the specified number of bits. High-order bits are cleared.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the source operand.
    +/// \param __count
    +///    A 128-bit integer vector in which bits [63:0] specify the number of bits
    +///    to right-shift each value in operand \a __a.
    +/// \returns A 128-bit integer vector containing the right-shifted values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_srl_epi64(__m128i __a, __m128i __count)
    +{
    +  return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
    +}
    +
    +/// Compares each of the corresponding 8-bit values of the 128-bit
    +///    integer vectors for equality. Each comparison yields 0x0 for false, 0xFF
    +///    for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector.
    +/// \param __b
    +///    A 128-bit integer vector.
    +/// \returns A 128-bit integer vector containing the comparison results.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cmpeq_epi8(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)((__v16qi)__a == (__v16qi)__b);
    +}
    +
    +/// Compares each of the corresponding 16-bit values of the 128-bit
    +///    integer vectors for equality. Each comparison yields 0x0 for false,
    +///    0xFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector.
    +/// \param __b
    +///    A 128-bit integer vector.
    +/// \returns A 128-bit integer vector containing the comparison results.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cmpeq_epi16(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)((__v8hi)__a == (__v8hi)__b);
    +}
    +
    +/// Compares each of the corresponding 32-bit values of the 128-bit
    +///    integer vectors for equality. Each comparison yields 0x0 for false,
    +///    0xFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector.
    +/// \param __b
    +///    A 128-bit integer vector.
    +/// \returns A 128-bit integer vector containing the comparison results.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cmpeq_epi32(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)((__v4si)__a == (__v4si)__b);
    +}
    +
    +/// Compares each of the corresponding signed 8-bit values of the 128-bit
    +///    integer vectors to determine if the values in the first operand are
    +///    greater than those in the second operand. Each comparison yields 0x0 for
    +///    false, 0xFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector.
    +/// \param __b
    +///    A 128-bit integer vector.
    +/// \returns A 128-bit integer vector containing the comparison results.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cmpgt_epi8(__m128i __a, __m128i __b)
    +{
    +  /* This function always performs a signed comparison, but __v16qi is a char
    +     which may be signed or unsigned, so use __v16qs. */
    +  return (__m128i)((__v16qs)__a > (__v16qs)__b);
    +}
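    +
    +/// A minimal usage sketch (illustrative values only): the result is a byte
    +/// mask of 0xFF/0x00 that can be fed to _mm_and_si128 or similar.
    +/// \code
    +///   __m128i a    = _mm_set1_epi8(5);
    +///   __m128i b    = _mm_set1_epi8(3);
    +///   __m128i mask = _mm_cmpgt_epi8(a, b); /* every byte is 0xFF */
    +/// \endcode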
    +
    +/// Compares each of the corresponding signed 16-bit values of the
    +///    128-bit integer vectors to determine if the values in the first operand
    +///    are greater than those in the second operand.
    +///
    +///    Each comparison yields 0x0 for false, 0xFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector.
    +/// \param __b
    +///    A 128-bit integer vector.
    +/// \returns A 128-bit integer vector containing the comparison results.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cmpgt_epi16(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)((__v8hi)__a > (__v8hi)__b);
    +}
    +
    +/// Compares each of the corresponding signed 32-bit values of the
    +///    128-bit integer vectors to determine if the values in the first operand
    +///    are greater than those in the second operand.
    +///
    +///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector.
    +/// \param __b
    +///    A 128-bit integer vector.
    +/// \returns A 128-bit integer vector containing the comparison results.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cmpgt_epi32(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)((__v4si)__a > (__v4si)__b);
    +}
    +
    +/// Compares each of the corresponding signed 8-bit values of the 128-bit
    +///    integer vectors to determine if the values in the first operand are less
    +///    than those in the second operand.
    +///
    +///    Each comparison yields 0x0 for false, 0xFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector.
    +/// \param __b
    +///    A 128-bit integer vector.
    +/// \returns A 128-bit integer vector containing the comparison results.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cmplt_epi8(__m128i __a, __m128i __b)
    +{
    +  return _mm_cmpgt_epi8(__b, __a);
    +}
    +
    +/// Compares each of the corresponding signed 16-bit values of the
    +///    128-bit integer vectors to determine if the values in the first operand
    +///    are less than those in the second operand.
    +///
    +///    Each comparison yields 0x0 for false, 0xFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector.
    +/// \param __b
    +///    A 128-bit integer vector.
    +/// \returns A 128-bit integer vector containing the comparison results.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cmplt_epi16(__m128i __a, __m128i __b)
    +{
    +  return _mm_cmpgt_epi16(__b, __a);
    +}
    +
    +/// Compares each of the corresponding signed 32-bit values of the
    +///    128-bit integer vectors to determine if the values in the first operand
    +///    are less than those in the second operand.
    +///
    +///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector.
    +/// \param __b
    +///    A 128-bit integer vector.
    +/// \returns A 128-bit integer vector containing the comparison results.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cmplt_epi32(__m128i __a, __m128i __b)
    +{
    +  return _mm_cmpgt_epi32(__b, __a);
    +}
    +
    +#ifdef __x86_64__
    +/// Converts a 64-bit signed integer value from the second operand into a
    +///    double-precision value and returns it in the lower element of a [2 x
    +///    double] vector; the upper element of the returned vector is copied from
    +///    the upper element of the first operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
    +///    copied to the upper 64 bits of the destination.
    +/// \param __b
    +///    A 64-bit signed integer operand containing the value to be converted.
    +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
    +///    converted value of the second operand. The upper 64 bits are copied from
    +///    the upper 64 bits of the first operand.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_cvtsi64_sd(__m128d __a, long long __b)
    +{
    +  __a[0] = __b;
    +  return __a;
    +}
    +
    +/// Converts the first (lower) element of a vector of [2 x double] into a
    +///    64-bit signed integer value, according to the current rounding mode.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
    +///    conversion.
    +/// \returns A 64-bit signed integer containing the converted value.
    +static __inline__ long long __DEFAULT_FN_ATTRS
    +_mm_cvtsd_si64(__m128d __a)
    +{
    +  return __builtin_ia32_cvtsd2si64((__v2df)__a);
    +}
    +
    +/// Converts the first (lower) element of a vector of [2 x double] into a
    +///    64-bit signed integer value, truncating the result when it is inexact.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
    +///    conversion.
    +/// \returns A 64-bit signed integer containing the converted value.
    +static __inline__ long long __DEFAULT_FN_ATTRS
    +_mm_cvttsd_si64(__m128d __a)
    +{
    +  return __builtin_ia32_cvttsd2si64((__v2df)__a);
    +}
    +#endif
    +
    +/// Converts a vector of [4 x i32] into a vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector.
    +/// \returns A 128-bit vector of [4 x float] containing the converted values.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cvtepi32_ps(__m128i __a)
    +{
    +  return (__m128)__builtin_convertvector((__v4si)__a, __v4sf);
    +}
    +
    +/// Converts a vector of [4 x float] into a vector of [4 x i32].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \returns A 128-bit integer vector of [4 x i32] containing the converted
    +///    values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cvtps_epi32(__m128 __a)
    +{
    +  return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
    +}
    +
    +/// Converts a vector of [4 x float] into a vector of [4 x i32],
    +///    truncating the result when it is inexact.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \returns A 128-bit vector of [4 x i32] containing the converted values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cvttps_epi32(__m128 __a)
    +{
    +  return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
    +}
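    +
    +/// A minimal usage sketch (illustrative values only): truncation always
    +/// rounds toward zero, while _mm_cvtps_epi32 uses the current rounding mode.
    +/// \code
    +///   __m128  f = _mm_set1_ps(1.7f);
    +///   __m128i t = _mm_cvttps_epi32(f);   /* every lane holds 1 */
    +/// \endcode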
    +
    +/// Returns a vector of [4 x i32] where the lowest element is the input
    +///    operand and the remaining elements are zero.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
    +///
    +/// \param __a
    +///    A 32-bit signed integer operand.
    +/// \returns A 128-bit vector of [4 x i32].
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cvtsi32_si128(int __a)
    +{
    +  return __extension__ (__m128i)(__v4si){ __a, 0, 0, 0 };
    +}
    +
    +#ifdef __x86_64__
    +/// Returns a vector of [2 x i64] where the lower element is the input
    +///    operand and the upper element is zero.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
    +///
    +/// \param __a
    +///    A 64-bit signed integer operand containing the value to be converted.
    +/// \returns A 128-bit vector of [2 x i64] containing the converted value.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cvtsi64_si128(long long __a)
    +{
    +  return __extension__ (__m128i)(__v2di){ __a, 0 };
    +}
    +#endif
    +
    +/// Moves the least significant 32 bits of a vector of [4 x i32] to a
    +///    32-bit signed integer value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
    +///
    +/// \param __a
    +///    A vector of [4 x i32]. The least significant 32 bits are moved to the
    +///    destination.
    +/// \returns A 32-bit signed integer containing the moved value.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_cvtsi128_si32(__m128i __a)
    +{
    +  __v4si __b = (__v4si)__a;
    +  return __b[0];
    +}
    +
    +#ifdef __x86_64__
    +/// Moves the least significant 64 bits of a vector of [2 x i64] to a
    +///    64-bit signed integer value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
    +///
    +/// \param __a
    +///    A vector of [2 x i64]. The least significant 64 bits are moved to the
    +///    destination.
    +/// \returns A 64-bit signed integer containing the moved value.
    +static __inline__ long long __DEFAULT_FN_ATTRS
    +_mm_cvtsi128_si64(__m128i __a)
    +{
    +  return __a[0];
    +}
    +#endif
    +
    +/// Moves packed integer values from an aligned 128-bit memory location
    +///    to elements in a 128-bit integer vector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
    +///
    +/// \param __p
    +///    An aligned pointer to a memory location containing integer values.
    +/// \returns A 128-bit integer vector containing the moved values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_load_si128(__m128i const *__p)
    +{
    +  return *__p;
    +}
    +
    +/// Moves packed integer values from an unaligned 128-bit memory location
    +///    to elements in a 128-bit integer vector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a memory location containing integer values.
    +/// \returns A 128-bit integer vector containing the moved values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_loadu_si128(__m128i const *__p)
    +{
    +  struct __loadu_si128 {
    +    __m128i __v;
    +  } __attribute__((__packed__, __may_alias__));
    +  return ((struct __loadu_si128*)__p)->__v;
    +}
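    +
    +/// A minimal usage sketch (illustrative buffer, any alignment and contents):
    +/// unlike _mm_load_si128, the pointer does not have to be 16-byte aligned.
    +/// \code
    +///   unsigned char buf[32];
    +///   __m128i v = _mm_loadu_si128((__m128i const *)(buf + 1));
    +/// \endcode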
    +
    +/// Returns a vector of [2 x i64] where the lower element is taken from
    +///    the lower element of the operand, and the upper element is zero.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
    +///
    +/// \param __p
    +///    A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
    +///    the destination.
    +/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
    +///    moved value. The higher order bits are cleared.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_loadl_epi64(__m128i const *__p)
    +{
    +  struct __mm_loadl_epi64_struct {
    +    long long __u;
    +  } __attribute__((__packed__, __may_alias__));
    +  return __extension__ (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
    +}
    +
    +/// Generates a 128-bit vector of [4 x i32] with unspecified content.
    +///    This could be used as an argument to another intrinsic function where the
    +///    argument is required but the value is not actually used.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \returns A 128-bit vector of [4 x i32] with unspecified content.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_undefined_si128(void)
    +{
    +#ifdef __GNUC__
    +  __m128i __X = __X;
    +  return __X;
    +#else
    +  return (__m128i)__builtin_ia32_undef128();
    +#endif
    +}
    +
    +/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
    +///    the specified 64-bit integer values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a specific
    +///    instruction.
    +///
    +/// \param __q1
    +///    A 64-bit integer value used to initialize the upper 64 bits of the
    +///    destination vector of [2 x i64].
    +/// \param __q0
    +///    A 64-bit integer value used to initialize the lower 64 bits of the
    +///    destination vector of [2 x i64].
    +/// \returns An initialized 128-bit vector of [2 x i64] containing the values
    +///    provided in the operands.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_set_epi64x(long long __q1, long long __q0)
    +{
    +  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
    +}
    +
    +/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
    +///    the specified 64-bit integer values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a specific
    +///    instruction.
    +///
    +/// \param __q1
    +///    A 64-bit integer value used to initialize the upper 64 bits of the
    +///    destination vector of [2 x i64].
    +/// \param __q0
    +///    A 64-bit integer value used to initialize the lower 64 bits of the
    +///    destination vector of [2 x i64].
    +/// \returns An initialized 128-bit vector of [2 x i64] containing the values
    +///    provided in the operands.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_set_epi64(__m64 __q1, __m64 __q0)
    +{
    +  return _mm_set_epi64x((long long)__q1, (long long)__q0);
    +}
    +
    +/// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
    +///    the specified 32-bit integer values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a specific
    +///    instruction.
    +///
    +/// \param __i3
    +///    A 32-bit integer value used to initialize bits [127:96] of the
    +///    destination vector.
    +/// \param __i2
    +///    A 32-bit integer value used to initialize bits [95:64] of the destination
    +///    vector.
    +/// \param __i1
    +///    A 32-bit integer value used to initialize bits [63:32] of the destination
    +///    vector.
    +/// \param __i0
    +///    A 32-bit integer value used to initialize bits [31:0] of the destination
    +///    vector.
    +/// \returns An initialized 128-bit vector of [4 x i32] containing the values
    +///    provided in the operands.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
    +{
    +  return __extension__ (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
    +}
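    +
    +/// A minimal usage sketch (illustrative values only): note that the first
    +/// argument initializes the highest element, so bits [31:0] receive 1 here.
    +/// \code
    +///   __m128i v = _mm_set_epi32(4, 3, 2, 1);  /* lanes, low to high: 1, 2, 3, 4 */
    +/// \endcode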
    +
    +/// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
    +///    the specified 16-bit integer values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a specific
    +///    instruction.
    +///
    +/// \param __w7
    +///    A 16-bit integer value used to initialize bits [127:112] of the
    +///    destination vector.
    +/// \param __w6
    +///    A 16-bit integer value used to initialize bits [111:96] of the
    +///    destination vector.
    +/// \param __w5
    +///    A 16-bit integer value used to initialize bits [95:80] of the destination
    +///    vector.
    +/// \param __w4
    +///    A 16-bit integer value used to initialize bits [79:64] of the destination
    +///    vector.
    +/// \param __w3
    +///    A 16-bit integer value used to initialize bits [63:48] of the destination
    +///    vector.
    +/// \param __w2
    +///    A 16-bit integer value used to initialize bits [47:32] of the destination
    +///    vector.
    +/// \param __w1
    +///    A 16-bit integer value used to initialize bits [31:16] of the destination
    +///    vector.
    +/// \param __w0
    +///    A 16-bit integer value used to initialize bits [15:0] of the destination
    +///    vector.
    +/// \returns An initialized 128-bit vector of [8 x i16] containing the values
    +///    provided in the operands.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
    +{
    +  return __extension__ (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
    +}
    +
    +/// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
    +///    the specified 8-bit integer values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a specific
    +///    instruction.
    +///
    +/// \param __b15
    +///    Initializes bits [127:120] of the destination vector.
    +/// \param __b14
    +///    Initializes bits [119:112] of the destination vector.
    +/// \param __b13
    +///    Initializes bits [111:104] of the destination vector.
    +/// \param __b12
    +///    Initializes bits [103:96] of the destination vector.
    +/// \param __b11
    +///    Initializes bits [95:88] of the destination vector.
    +/// \param __b10
    +///    Initializes bits [87:80] of the destination vector.
    +/// \param __b9
    +///    Initializes bits [79:72] of the destination vector.
    +/// \param __b8
    +///    Initializes bits [71:64] of the destination vector.
    +/// \param __b7
    +///    Initializes bits [63:56] of the destination vector.
    +/// \param __b6
    +///    Initializes bits [55:48] of the destination vector.
    +/// \param __b5
    +///    Initializes bits [47:40] of the destination vector.
    +/// \param __b4
    +///    Initializes bits [39:32] of the destination vector.
    +/// \param __b3
    +///    Initializes bits [31:24] of the destination vector.
    +/// \param __b2
    +///    Initializes bits [23:16] of the destination vector.
    +/// \param __b1
    +///    Initializes bits [15:8] of the destination vector.
    +/// \param __b0
    +///    Initializes bits [7:0] of the destination vector.
    +/// \returns An initialized 128-bit vector of [16 x i8] containing the values
    +///    provided in the operands.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
    +{
    +  return __extension__ (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
    +}
    +
    +/// Initializes both values in a 128-bit integer vector with the
    +///    specified 64-bit integer value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a specific
    +///    instruction.
    +///
    +/// \param __q
    +///    Integer value used to initialize the elements of the destination integer
    +///    vector.
    +/// \returns An initialized 128-bit integer vector of [2 x i64] with both
    +///    elements containing the value provided in the operand.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_set1_epi64x(long long __q)
    +{
    +  return _mm_set_epi64x(__q, __q);
    +}
    +
    +/// Initializes both values in a 128-bit vector of [2 x i64] with the
    +///    specified 64-bit value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a specific
    +///    instruction.
    +///
    +/// \param __q
    +///    A 64-bit value used to initialize the elements of the destination integer
    +///    vector.
    +/// \returns An initialized 128-bit vector of [2 x i64] with all elements
    +///    containing the value provided in the operand.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_set1_epi64(__m64 __q)
    +{
    +  return _mm_set_epi64(__q, __q);
    +}
    +
    +/// Initializes all values in a 128-bit vector of [4 x i32] with the
    +///    specified 32-bit value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a specific
    +///    instruction.
    +///
    +/// \param __i
    +///    A 32-bit value used to initialize the elements of the destination integer
    +///    vector.
    +/// \returns An initialized 128-bit vector of [4 x i32] with all elements
    +///    containing the value provided in the operand.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_set1_epi32(int __i)
    +{
    +  return _mm_set_epi32(__i, __i, __i, __i);
    +}
    +
    +/// Initializes all values in a 128-bit vector of [8 x i16] with the
    +///    specified 16-bit value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a specific
    +///    instruction.
    +///
    +/// \param __w
    +///    A 16-bit value used to initialize the elements of the destination integer
    +///    vector.
    +/// \returns An initialized 128-bit vector of [8 x i16] with all elements
    +///    containing the value provided in the operand.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_set1_epi16(short __w)
    +{
    +  return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
    +}
    +
    +/// Initializes all values in a 128-bit vector of [16 x i8] with the
    +///    specified 8-bit value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a specific
    +///    instruction.
    +///
    +/// \param __b
    +///    An 8-bit value used to initialize the elements of the destination integer
    +///    vector.
    +/// \returns An initialized 128-bit vector of [16 x i8] with all elements
    +///    containing the value provided in the operand.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_set1_epi8(char __b)
    +{
    +  return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b);
    +}
    +
    +/// Constructs a 128-bit integer vector, initialized in reverse order
    +///     with the specified 64-bit integral values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic does not correspond to a specific instruction.
    +///
    +/// \param __q0
    +///    A 64-bit integral value used to initialize the lower 64 bits of the
    +///    result.
    +/// \param __q1
    +///    A 64-bit integral value used to initialize the upper 64 bits of the
    +///    result.
    +/// \returns An initialized 128-bit integer vector.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_setr_epi64(__m64 __q0, __m64 __q1)
    +{
    +  return _mm_set_epi64(__q1, __q0);
    +}
    +
    +/// Constructs a 128-bit integer vector, initialized in reverse order
    +///     with the specified 32-bit integral values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a specific
    +///    instruction.
    +///
    +/// \param __i0
    +///    A 32-bit integral value used to initialize bits [31:0] of the result.
    +/// \param __i1
    +///    A 32-bit integral value used to initialize bits [63:32] of the result.
    +/// \param __i2
    +///    A 32-bit integral value used to initialize bits [95:64] of the result.
    +/// \param __i3
    +///    A 32-bit integral value used to initialize bits [127:96] of the result.
    +/// \returns An initialized 128-bit integer vector.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
    +{
    +  return _mm_set_epi32(__i3, __i2, __i1, __i0);
    +}
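    +
    +/// A minimal usage sketch (illustrative values only): the "r" variant takes
    +/// its arguments in memory order, lowest element first.
    +/// \code
    +///   __m128i v = _mm_setr_epi32(1, 2, 3, 4); /* lanes, low to high: 1, 2, 3, 4 */
    +/// \endcode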
    +
    +/// Constructs a 128-bit integer vector, initialized in reverse order
    +///     with the specified 16-bit integral values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a 
specific
    +///    instruction.
    +///
    +/// \param __w0
    +///    A 16-bit integral value used to initialize bits [15:0] of the 
result.
    +/// \param __w1
    +///    A 16-bit integral value used to initialize bits [31:16] of the 
result.
    +/// \param __w2
    +///    A 16-bit integral value used to initialize bits [47:32] of the 
result.
    +/// \param __w3
    +///    A 16-bit integral value used to initialize bits [63:48] of the 
result.
    +/// \param __w4
    +///    A 16-bit integral value used to initialize bits [79:64] of the 
result.
    +/// \param __w5
    +///    A 16-bit integral value used to initialize bits [95:80] of the 
result.
    +/// \param __w6
    +///    A 16-bit integral value used to initialize bits [111:96] of the 
result.
    +/// \param __w7
    +///    A 16-bit integral value used to initialize bits [127:112] of the 
result.
    +/// \returns An initialized 128-bit integer vector.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
    +               short __w5, short __w6, short __w7)
    +{
    +  return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
    +}
    +
    +/// Constructs a 128-bit integer vector, initialized in reverse order
    +///     with the specified 8-bit integral values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a 
specific
    +///    instruction.
    +///
    +/// \param __b0
    +///    An 8-bit integral value used to initialize bits [7:0] of the result.
    +/// \param __b1
    +///    An 8-bit integral value used to initialize bits [15:8] of the 
result.
    +/// \param __b2
    +///    An 8-bit integral value used to initialize bits [23:16] of the 
result.
    +/// \param __b3
    +///    An 8-bit integral value used to initialize bits [31:24] of the 
result.
    +/// \param __b4
    +///    An 8-bit integral value used to initialize bits [39:32] of the 
result.
    +/// \param __b5
    +///    An 8-bit integral value used to initialize bits [47:40] of the 
result.
    +/// \param __b6
    +///    An 8-bit integral value used to initialize bits [55:48] of the 
result.
    +/// \param __b7
    +///    An 8-bit integral value used to initialize bits [63:56] of the 
result.
    +/// \param __b8
    +///    An 8-bit integral value used to initialize bits [71:64] of the 
result.
    +/// \param __b9
    +///    An 8-bit integral value used to initialize bits [79:72] of the 
result.
    +/// \param __b10
    +///    An 8-bit integral value used to initialize bits [87:80] of the 
result.
    +/// \param __b11
    +///    An 8-bit integral value used to initialize bits [95:88] of the 
result.
    +/// \param __b12
    +///    An 8-bit integral value used to initialize bits [103:96] of the 
result.
    +/// \param __b13
    +///    An 8-bit integral value used to initialize bits [111:104] of the 
result.
    +/// \param __b14
    +///    An 8-bit integral value used to initialize bits [119:112] of the 
result.
    +/// \param __b15
    +///    An 8-bit integral value used to initialize bits [127:120] of the 
result.
    +/// \returns An initialized 128-bit integer vector.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4,
    +              char __b5, char __b6, char __b7, char __b8, char __b9,
    +              char __b10, char __b11, char __b12, char __b13, char __b14,
    +              char __b15)
    +{
    +  return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9,
    +                      __b8, __b7, __b6, __b5, __b4, __b3, __b2, __b1,
    +                      __b0);
    +}
    +
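The set/setr argument order is easy to mix up; a small illustrative check
(hypothetical helper name, assumes _mm_cmpeq_epi32 from elsewhere in this header):

  #include <emmintrin.h>

  /* _mm_setr_epi32 takes elements lowest-first, so these two vectors
   * carry the same bit pattern and compare equal in every lane. */
  static inline int setr_matches_set(void)
  {
    __m128i a  = _mm_setr_epi32(1, 2, 3, 4);
    __m128i b  = _mm_set_epi32(4, 3, 2, 1);
    __m128i eq = _mm_cmpeq_epi32(a, b);

    return _mm_movemask_epi8(eq) == 0xFFFF;
  }
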
    +/// Creates a 128-bit integer vector initialized to zero.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
    +///
    +/// \returns An initialized 128-bit integer vector with all elements set to
    +///    zero.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_setzero_si128(void)
    +{
    +  return __extension__ (__m128i)(__v2di){ 0LL, 0LL };
    +}
    +
    +/// Stores a 128-bit integer vector to a memory location aligned on a
    +///    128-bit boundary.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> 
instruction.
    +///
    +/// \param __p
    +///    A pointer to an aligned memory location that will receive the 
integer
    +///    values.
    +/// \param __b
    +///    A 128-bit integer vector containing the values to be moved.
    +static __inline__ void __DEFAULT_FN_ATTRS
    +_mm_store_si128(__m128i *__p, __m128i __b)
    +{
    +  *__p = __b;
    +}
    +
    +/// Stores a 128-bit integer vector to an unaligned memory location.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> 
instruction.
    +///
    +/// \param __p
    +///    A pointer to a memory location that will receive the integer values.
    +/// \param __b
    +///    A 128-bit integer vector containing the values to be moved.
    +static __inline__ void __DEFAULT_FN_ATTRS
    +_mm_storeu_si128(__m128i *__p, __m128i __b)
    +{
    +  struct __storeu_si128 {
    +    __m128i __v;
    +  } __attribute__((__packed__, __may_alias__));
    +  ((struct __storeu_si128*)__p)->__v = __b;
    +}
    +
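A short sketch of the aligned vs. unaligned store pair above (my example; the
caller must guarantee that dst_aligned really is 16-byte aligned):

  #include <emmintrin.h>

  /* Write the same vector through both store paths. */
  static inline void store_both(void *dst_aligned, void *dst_unaligned)
  {
    __m128i v = _mm_set1_epi32(-1);

    _mm_store_si128((__m128i *)dst_aligned, v);    /* needs 16-byte alignment */
    _mm_storeu_si128((__m128i *)dst_unaligned, v); /* no alignment requirement */
  }
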
    +/// Moves bytes selected by the mask from the first operand to the
    +///    specified unaligned memory location. When a mask bit is 1, the
    +///    corresponding byte is written, otherwise it is not written.
    +///
    +///    To minimize caching, the data is flagged as non-temporal (unlikely 
to be
    +///    used again soon). Exception and trap behavior for elements not 
selected
    +///    for storage to memory are implementation dependent.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
    +///   instruction.
    +///
    +/// \param __d
    +///    A 128-bit integer vector containing the values to be moved.
    +/// \param __n
    +///    A 128-bit integer vector containing the mask. The most significant 
bit of
    +///    each byte represents the mask bits.
    +/// \param __p
    +///    A pointer to an unaligned 128-bit memory location where the 
specified
    +///    values are moved.
    +static __inline__ void __DEFAULT_FN_ATTRS
    +_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
    +{
    +  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
    +}
    +
    +/// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
    +///    a memory location.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> 
instruction.
    +///
    +/// \param __p
    +///    A pointer to a 64-bit memory location that will receive the lower 
64 bits
    +///    of the integer vector parameter.
    +/// \param __a
    +///    A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
    +///    value to be stored.
    +static __inline__ void __DEFAULT_FN_ATTRS
    +_mm_storel_epi64(__m128i *__p, __m128i __a)
    +{
    +  struct __mm_storel_epi64_struct {
    +    long long __u;
    +  } __attribute__((__packed__, __may_alias__));
    +  ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
    +}
    +
    +/// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
    +///    aligned memory location.
    +///
    +///    To minimize caching, the data is flagged as non-temporal (unlikely 
to be
    +///    used again soon).
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> 
instruction.
    +///
    +/// \param __p
    +///    A pointer to the 128-bit aligned memory location used to store the 
value.
    +/// \param __a
    +///    A vector of [2 x double] containing the 64-bit values to be stored.
    +static __inline__ void __DEFAULT_FN_ATTRS
    +_mm_stream_pd(double *__p, __m128d __a)
    +{
    +#ifdef __GNUC__
    +  __builtin_ia32_movntpd (__p, (__v2df)__a);
    +#else
    +  __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
    +#endif
    +}
    +
    +/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
    +///
    +///    To minimize caching, the data is flagged as non-temporal (unlikely 
to be
    +///    used again soon).
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> 
instruction.
    +///
    +/// \param __p
    +///    A pointer to the 128-bit aligned memory location used to store the 
value.
    +/// \param __a
    +///    A 128-bit integer vector containing the values to be stored.
    +static __inline__ void __DEFAULT_FN_ATTRS
    +_mm_stream_si128(__m128i *__p, __m128i __a)
    +{
    +#ifdef __GNUC__
    +  __builtin_ia32_movntdq ((__v2di *)__p, (__v2di)__a);
    +#else
    +  __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
    +#endif
    +}
    +
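For the non-temporal stores, a minimal sketch of the usual stream-then-fence
pattern (my example; it assumes <stddef.h> is available and that dst is 16-byte
aligned):

  #include <stddef.h>
  #include <emmintrin.h>

  /* Fill n 16-byte blocks with non-temporal stores, then fence so the
   * data is globally visible before another observer reads the buffer. */
  static inline void stream_fill_zero(__m128i *dst, size_t n)
  {
    __m128i zero = _mm_setzero_si128();
    size_t i;

    for (i = 0; i < n; i++)
      _mm_stream_si128(&dst[i], zero);
    _mm_mfence();
  }
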
    +/// Stores a 32-bit integer value in the specified memory location.
    +///
    +///    To minimize caching, the data is flagged as non-temporal (unlikely 
to be
    +///    used again soon).
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to the 32-bit memory location used to store the value.
    +/// \param __a
    +///    A 32-bit integer containing the value to be stored.
    +static __inline__ void
    +#ifdef __GNUC__
    +__attribute__((__gnu_inline__, __always_inline__, __artificial__))
    +#else
    +__attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    +#endif
    +_mm_stream_si32(int *__p, int __a)
    +{
    +  __builtin_ia32_movnti(__p, __a);
    +}
    +
    +#ifdef __x86_64__
    +/// Stores a 64-bit integer value in the specified memory location.
    +///
    +///    To minimize caching, the data is flagged as non-temporal (unlikely 
to be
    +///    used again soon).
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to the 64-bit memory location used to store the value.
    +/// \param __a
    +///    A 64-bit integer containing the value to be stored.
    +static __inline__ void
    +#ifdef __GNUC__
    +__attribute__((__gnu_inline__, __always_inline__, __artificial__))
    +#else
    +__attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    +#endif
    +_mm_stream_si64(long long *__p, long long __a)
    +{
    +  __builtin_ia32_movnti64(__p, __a);
    +}
    +#endif
    +
    +#if defined(__cplusplus)
    +extern "C" {
    +#endif
    +
    +/// The cache line containing \a __p is flushed and invalidated from all
    +///    caches in the coherency domain.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to the memory location used to identify the cache line to 
be
    +///    flushed.
    +void _mm_clflush(void const * __p);
    +
    +/// Forces strong memory ordering (serialization) between load
    +///    instructions preceding this instruction and load instructions 
following
    +///    this instruction, ensuring the system completes all previous loads 
before
    +///    executing subsequent loads.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
    +///
    +void _mm_lfence(void);
    +
    +/// Forces strong memory ordering (serialization) between load and store
    +///    instructions preceding this instruction and load and store 
instructions
    +///    following this instruction, ensuring that the system completes all
    +///    previous memory accesses before executing subsequent memory 
accesses.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
    +///
    +void _mm_mfence(void);
    +
    +#if defined(__cplusplus)
    +} // extern "C"
    +#endif
    +
    +/// Converts 16-bit signed integers from both 128-bit integer vector
    +///    operands into 8-bit signed integers, and packs the results into the
    +///    destination. Positive values greater than 0x7F are saturated to 
0x7F.
    +///    Negative values less than 0x80 are saturated to 0x80.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> 
instruction.
    +///
    +/// \param __a
    +///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is 
treated as
    +///   a signed integer and is converted to a 8-bit signed integer with
    +///   saturation. Values greater than 0x7F are saturated to 0x7F. Values 
less
    +///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
    +///   written to the lower 64 bits of the result.
    +/// \param __b
    +///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is 
treated as
    +///   a signed integer and is converted to a 8-bit signed integer with
    +///   saturation. Values greater than 0x7F are saturated to 0x7F. Values 
less
    +///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
    +///   written to the higher 64 bits of the result.
    +/// \returns A 128-bit vector of [16 x i8] containing the converted values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_packs_epi16(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
    +}
    +
    +/// Converts 32-bit signed integers from both 128-bit integer vector
    +///    operands into 16-bit signed integers, and packs the results into the
    +///    destination. Positive values greater than 0x7FFF are saturated to 
0x7FFF.
    +///    Negative values less than 0x8000 are saturated to 0x8000.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is 
treated as
    +///    a signed integer and is converted to a 16-bit signed integer with
    +///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. 
Values
    +///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] 
values
    +///    are written to the lower 64 bits of the result.
    +/// \param __b
    +///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is 
treated as
    +///    a signed integer and is converted to a 16-bit signed integer with
    +///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. 
Values
    +///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] 
values
    +///    are written to the higher 64 bits of the result.
    +/// \returns A 128-bit vector of [8 x i16] containing the converted values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_packs_epi32(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
    +}
    +
    +/// Converts 16-bit signed integers from both 128-bit integer vector
    +///    operands into 8-bit unsigned integers, and packs the results into 
the
    +///    destination. Values greater than 0xFF are saturated to 0xFF. Values 
less
    +///    than 0x00 are saturated to 0x00.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is 
treated as
    +///    a signed integer and is converted to an 8-bit unsigned integer with
    +///    saturation. Values greater than 0xFF are saturated to 0xFF. Values 
less
    +///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
    +///    written to the lower 64 bits of the result.
    +/// \param __b
    +///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is 
treated as
    +///    a signed integer and is converted to an 8-bit unsigned integer with
    +///    saturation. Values greater than 0xFF are saturated to 0xFF. Values 
less
    +///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
    +///    written to the higher 64 bits of the result.
    +/// \returns A 128-bit vector of [16 x i8] containing the converted values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_packus_epi16(__m128i __a, __m128i __b)
    +{
    +  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
    +}
    +
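A tiny sketch showing the signed saturation behaviour of the pack intrinsics
above (my own example):

  #include <emmintrin.h>

  /* 300 saturates to 127 in the low eight byte lanes, -300 saturates to
   * -128 in the high eight byte lanes of the packed result. */
  static inline __m128i pack_saturation_demo(void)
  {
    __m128i a = _mm_set1_epi16(300);
    __m128i b = _mm_set1_epi16(-300);

    return _mm_packs_epi16(a, b);
  }
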
    +/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
    +///    the immediate-value parameter as a selector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector.
    +/// \param __imm
    +///    An immediate value. Bits [2:0] selects values from \a __a to be 
assigned
    +///    to bits[15:0] of the result. \n
    +///    000: assign values from bits [15:0] of \a __a. \n
    +///    001: assign values from bits [31:16] of \a __a. \n
    +///    010: assign values from bits [47:32] of \a __a. \n
    +///    011: assign values from bits [63:48] of \a __a. \n
    +///    100: assign values from bits [79:64] of \a __a. \n
    +///    101: assign values from bits [95:80] of \a __a. \n
    +///    110: assign values from bits [111:96] of \a __a. \n
    +///    111: assign values from bits [127:112] of \a __a.
    +/// \returns An integer, whose lower 16 bits are selected from the 128-bit
    +///    integer vector parameter and the remaining bits are assigned zeros.
    +#define _mm_extract_epi16(a, imm) \
    +  (int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
    +                                                   (int)(imm))
    +
    +/// Constructs a 128-bit integer vector by first making a copy of the
    +///    128-bit integer vector parameter, and then inserting the lower 16 
bits
    +///    of an integer parameter into an offset specified by the 
immediate-value
    +///    parameter.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
    +///    result and then one of the eight elements in the result is replaced 
by
    +///    the lower 16 bits of \a __b.
    +/// \param __b
    +///    An integer. The lower 16 bits of this parameter are written to the
    +///    result beginning at an offset specified by \a __imm.
    +/// \param __imm
    +///    An immediate value specifying the bit offset in the result at which 
the
    +///    lower 16 bits of \a __b are written.
    +/// \returns A 128-bit integer vector containing the constructed values.
    +#define _mm_insert_epi16(a, b, imm) \
    +  (__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
    +                                       (int)(imm))
    +
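Since both macros above require a compile-time constant lane index, a small
usage sketch (hypothetical helper name):

  #include <emmintrin.h>

  /* Replace lane 3 of an [8 x i16] vector and read it back. */
  static inline int insert_extract_demo(__m128i v)
  {
    __m128i w = _mm_insert_epi16(v, 0x1234, 3);

    return _mm_extract_epi16(w, 3); /* 0x1234, zero-extended to int */
  }
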
    +/// Copies the values of the most significant bits from each 8-bit
    +///    element in a 128-bit integer vector of [16 x i8] to create a 16-bit 
mask
    +///    value, zero-extends the value, and writes it to the destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the values with bits to be 
extracted.
    +/// \returns The most significant bits from each 8-bit element in \a __a,
    +///    written to bits [15:0]. The other bits are assigned zeros.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_movemask_epi8(__m128i __a)
    +{
    +  return __builtin_ia32_pmovmskb128((__v16qi)__a);
    +}
    +
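For _mm_movemask_epi8, a sketch of the classic compare-and-scan idiom (assumes
_mm_loadu_si128 and _mm_cmpeq_epi8 from elsewhere in this header; __builtin_ctz
is a compiler builtin, not part of this library):

  #include <emmintrin.h>

  /* Return the index of the first zero byte in a 16-byte block, or -1. */
  static inline int first_zero_byte(void *p)
  {
    __m128i block = _mm_loadu_si128((__m128i *)p);
    int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(block, _mm_setzero_si128()));

    return mask ? __builtin_ctz(mask) : -1;
  }
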
    +/// Constructs a 128-bit integer vector by shuffling four 32-bit
    +///    elements of a 128-bit integer vector parameter, using the 
immediate-value
    +///    parameter as a specifier.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> 
instruction.
    +///
    +/// \param a
    +///    A 128-bit integer vector containing the values to be copied.
    +/// \param imm
    +///    An immediate value containing an 8-bit value specifying which 
elements to
    +///    copy from a. The destinations within the 128-bit destination are 
assigned
    +///    values as follows: \n
    +///    Bits [1:0] are used to assign values to bits [31:0] of the result. 
\n
    +///    Bits [3:2] are used to assign values to bits [63:32] of the result. 
\n
    +///    Bits [5:4] are used to assign values to bits [95:64] of the result. 
\n
    +///    Bits [7:6] are used to assign values to bits [127:96] of the 
result. \n
    +///    Bit value assignments: \n
    +///    00: assign values from bits [31:0] of \a a. \n
    +///    01: assign values from bits [63:32] of \a a. \n
    +///    10: assign values from bits [95:64] of \a a. \n
    +///    11: assign values from bits [127:96] of \a a.
    +/// \returns A 128-bit integer vector containing the shuffled values.
    +#define _mm_shuffle_epi32(a, imm) \
    +  (__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm))
    +
    +/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
    +///    elements of a 128-bit integer vector of [8 x i16], using the 
immediate
    +///    value parameter as a specifier.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> 
instruction.
    +///
    +/// \param a
    +///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to 
bits
    +///    [127:64] of the result.
    +/// \param imm
    +///    An 8-bit immediate value specifying which elements to copy from \a 
a. \n
    +///    Bits[1:0] are used to assign values to bits [15:0] of the result. \n
    +///    Bits[3:2] are used to assign values to bits [31:16] of the result. 
\n
    +///    Bits[5:4] are used to assign values to bits [47:32] of the result. 
\n
    +///    Bits[7:6] are used to assign values to bits [63:48] of the result. 
\n
    +///    Bit value assignments: \n
    +///    00: assign values from bits [15:0] of \a a. \n
    +///    01: assign values from bits [31:16] of \a a. \n
    +///    10: assign values from bits [47:32] of \a a. \n
    +///    11: assign values from bits [63:48] of \a a. \n
    +/// \returns A 128-bit integer vector containing the shuffled values.
    +#define _mm_shufflelo_epi16(a, imm) \
    +  (__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm))
    +
    +/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
    +///    elements of a 128-bit integer vector of [8 x i16], using the 
immediate
    +///    value parameter as a specifier.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> 
instruction.
    +///
    +/// \param a
    +///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to 
bits
    +///    [63:0] of the result.
    +/// \param imm
    +///    An 8-bit immediate value specifying which elements to copy from \a 
a. \n
    +///    Bits[1:0] are used to assign values to bits [79:64] of the result. 
\n
    +///    Bits[3:2] are used to assign values to bits [95:80] of the result. 
\n
    +///    Bits[5:4] are used to assign values to bits [111:96] of the result. 
\n
    +///    Bits[7:6] are used to assign values to bits [127:112] of the 
result. \n
    +///    Bit value assignments: \n
    +///    00: assign values from bits [79:64] of \a a. \n
    +///    01: assign values from bits [95:80] of \a a. \n
    +///    10: assign values from bits [111:96] of \a a. \n
    +///    11: assign values from bits [127:112] of \a a. \n
    +/// \returns A 128-bit integer vector containing the shuffled values.
    +#define _mm_shufflehi_epi16(a, imm) \
    +  (__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm))
    +
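The shuffle immediates read as four 2-bit source selectors; a short sketch (my
example, using a raw immediate rather than the _MM_SHUFFLE helper):

  #include <emmintrin.h>

  /* 0x00 selects element 0 for every destination slot, i.e. a broadcast
   * of lane 0; 0x1b would reverse the four elements instead. */
  static inline __m128i broadcast_lane0(__m128i v)
  {
    return _mm_shuffle_epi32(v, 0x00);
  }
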
    +/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
    +///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x 
i8].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [16 x i8].
    +///    Bits [71:64] are written to bits [7:0] of the result. \n
    +///    Bits [79:72] are written to bits [23:16] of the result. \n
    +///    Bits [87:80] are written to bits [39:32] of the result. \n
    +///    Bits [95:88] are written to bits [55:48] of the result. \n
    +///    Bits [103:96] are written to bits [71:64] of the result. \n
    +///    Bits [111:104] are written to bits [87:80] of the result. \n
    +///    Bits [119:112] are written to bits [103:96] of the result. \n
    +///    Bits [127:120] are written to bits [119:112] of the result.
    +/// \param __b
    +///    A 128-bit vector of [16 x i8]. \n
    +///    Bits [71:64] are written to bits [15:8] of the result. \n
    +///    Bits [79:72] are written to bits [31:24] of the result. \n
    +///    Bits [87:80] are written to bits [47:40] of the result. \n
    +///    Bits [95:88] are written to bits [63:56] of the result. \n
    +///    Bits [103:96] are written to bits [79:72] of the result. \n
    +///    Bits [111:104] are written to bits [95:88] of the result. \n
    +///    Bits [119:112] are written to bits [111:104] of the result. \n
    +///    Bits [127:120] are written to bits [127:120] of the result.
    +/// \returns A 128-bit vector of [16 x i8] containing the interleaved 
values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_unpackhi_epi8(__m128i __a, __m128i __b)
    +{
    +#ifdef __GNUC__
    +  return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__a, (__v16qi)__b);
    +#else
    +  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b,
    +                                          8, 16+8, 9, 16+9, 10, 16+10,
    +                                          11, 16+11, 12, 16+12, 13, 16+13,
    +                                          14, 16+14, 15, 16+15);
    +#endif
    +}
    +
    +/// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
    +///    [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [8 x i16].
    +///    Bits [79:64] are written to bits [15:0] of the result. \n
    +///    Bits [95:80] are written to bits [47:32] of the result. \n
    +///    Bits [111:96] are written to bits [79:64] of the result. \n
    +///    Bits [127:112] are written to bits [111:96] of the result.
    +/// \param __b
    +///    A 128-bit vector of [8 x i16].
    +///    Bits [79:64] are written to bits [31:16] of the result. \n
    +///    Bits [95:80] are written to bits [63:48] of the result. \n
    +///    Bits [111:96] are written to bits [95:80] of the result. \n
    +///    Bits [127:112] are written to bits [127:112] of the result.
    +/// \returns A 128-bit vector of [8 x i16] containing the interleaved 
values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_unpackhi_epi16(__m128i __a, __m128i __b)
    +{
    +#ifdef __GNUC__
    +  return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__a, (__v8hi)__b);
    +#else
    +  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b,
    +                                          4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
    +#endif
    +}
    +
    +/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
    +///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x i32]. \n
    +///    Bits [95:64] are written to bits [31:0] of the destination. \n
    +///    Bits [127:96] are written to bits [95:64] of the destination.
    +/// \param __b
    +///    A 128-bit vector of [4 x i32]. \n
    +///    Bits [95:64] are written to bits [63:32] of the destination. \n
    +///    Bits [127:96] are written to bits [127:96] of the destination.
    +/// \returns A 128-bit vector of [4 x i32] containing the interleaved 
values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_unpackhi_epi32(__m128i __a, __m128i __b)
    +{
    +#ifdef __GNUC__
    +  return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__a, (__v4si)__b);
    +#else
    +  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b,
    +                                          2, 4+2, 3, 4+3);
    +#endif
    +}
    +
    +/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
    +///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x i64]. \n
    +///    Bits [127:64] are written to bits [63:0] of the destination.
    +/// \param __b
    +///    A 128-bit vector of [2 x i64]. \n
    +///    Bits [127:64] are written to bits [127:64] of the destination.
    +/// \returns A 128-bit vector of [2 x i64] containing the interleaved 
values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_unpackhi_epi64(__m128i __a, __m128i __b)
    +{
    +#ifdef __GNUC__
    +  return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__a, (__v2di)__b);
    +#else
    +  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b,
    +                                          1, 2+1);
    +#endif
    +}
    +
    +/// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
    +///    [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [16 x i8]. \n
    +///    Bits [7:0] are written to bits [7:0] of the result. \n
    +///    Bits [15:8] are written to bits [23:16] of the result. \n
    +///    Bits [23:16] are written to bits [39:32] of the result. \n
    +///    Bits [31:24] are written to bits [55:48] of the result. \n
    +///    Bits [39:32] are written to bits [71:64] of the result. \n
    +///    Bits [47:40] are written to bits [87:80] of the result. \n
    +///    Bits [55:48] are written to bits [103:96] of the result. \n
    +///    Bits [63:56] are written to bits [119:112] of the result.
    +/// \param __b
    +///    A 128-bit vector of [16 x i8].
    +///    Bits [7:0] are written to bits [15:8] of the result. \n
    +///    Bits [15:8] are written to bits [31:24] of the result. \n
    +///    Bits [23:16] are written to bits [47:40] of the result. \n
    +///    Bits [31:24] are written to bits [63:56] of the result. \n
    +///    Bits [39:32] are written to bits [79:72] of the result. \n
    +///    Bits [47:40] are written to bits [95:88] of the result. \n
    +///    Bits [55:48] are written to bits [111:104] of the result. \n
    +///    Bits [63:56] are written to bits [127:120] of the result.
    +/// \returns A 128-bit vector of [16 x i8] containing the interleaved 
values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_unpacklo_epi8(__m128i __a, __m128i __b)
    +{
    +#ifdef __GNUC__
    +  return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__a, (__v16qi)__b);
    +#else
    +  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b,
    +                                          0, 16+0, 1, 16+1, 2, 16+2,
    +                                          3, 16+3, 4, 16+4, 5, 16+5,
    +                                          6, 16+6, 7, 16+7);
    +#endif
    +}
    +
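A common use of the byte unpack above is zero-extension; a minimal sketch (my
example):

  #include <emmintrin.h>

  /* Widen the low eight unsigned bytes of v into 16-bit lanes by
   * interleaving with zero (the usual pre-SSE4.1 idiom). */
  static inline __m128i widen_lo_u8_to_u16(__m128i v)
  {
    return _mm_unpacklo_epi8(v, _mm_setzero_si128());
  }
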
    +/// Unpacks the low-order (index 0-3) values from each of the two 128-bit
    +///    vectors of [8 x i16] and interleaves them into a 128-bit vector of
    +///    [8 x i16].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [8 x i16].
    +///    Bits [15:0] are written to bits [15:0] of the result. \n
    +///    Bits [31:16] are written to bits [47:32] of the result. \n
    +///    Bits [47:32] are written to bits [79:64] of the result. \n
    +///    Bits [63:48] are written to bits [111:96] of the result.
    +/// \param __b
    +///    A 128-bit vector of [8 x i16].
    +///    Bits [15:0] are written to bits [31:16] of the result. \n
    +///    Bits [31:16] are written to bits [63:48] of the result. \n
    +///    Bits [47:32] are written to bits [95:80] of the result. \n
    +///    Bits [63:48] are written to bits [127:112] of the result.
    +/// \returns A 128-bit vector of [8 x i16] containing the interleaved 
values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_unpacklo_epi16(__m128i __a, __m128i __b)
    +{
    +#ifdef __GNUC__
    +  return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__a, (__v8hi)__b);
    +#else
    +  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b,
    +                                          0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
    +#endif
    +}
    +
    +/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
    +///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x i32]. \n
    +///    Bits [31:0] are written to bits [31:0] of the destination. \n
    +///    Bits [63:32] are written to bits [95:64] of the destination.
    +/// \param __b
    +///    A 128-bit vector of [4 x i32]. \n
    +///    Bits [31:0] are written to bits [63:32] of the destination. \n
    +///    Bits [63:32] are written to bits [127:96] of the destination.
    +/// \returns A 128-bit vector of [4 x i32] containing the interleaved 
values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_unpacklo_epi32(__m128i __a, __m128i __b)
    +{
    +#ifdef __GNUC__
    +  return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__a, (__v4si)__b);
    +#else
    +  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b,
    +                                          0, 4+0, 1, 4+1);
    +#endif
    +}
    +
    +/// Unpacks the low-order 64-bit elements from two 128-bit vectors of
    +///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
    +///   instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x i64]. \n
    +///    Bits [63:0] are written to bits [63:0] of the destination. \n
    +/// \param __b
    +///    A 128-bit vector of [2 x i64]. \n
    +///    Bits [63:0] are written to bits [127:64] of the destination. \n
    +/// \returns A 128-bit vector of [2 x i64] containing the interleaved 
values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_unpacklo_epi64(__m128i __a, __m128i __b)
    +{
    +#ifdef __GNUC__
    +  return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__a, (__v2di)__b);
    +#else
    +  return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b,
    +                                          0, 2+0);
    +#endif
    +}
    +
    +/// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
    +///    integer.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector operand. The lower 64 bits are moved to the
    +///    destination.
    +/// \returns A 64-bit integer containing the lower 64 bits of the 
parameter.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_movepi64_pi64(__m128i __a)
    +{
    +  return (__m64)__a[0];
    +}
    +
    +/// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
    +///    upper bits.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
    +///
    +/// \param __a
    +///    A 64-bit value.
    +/// \returns A 128-bit integer vector. The lower 64 bits contain the value 
from
    +///    the operand. The upper 64 bits are assigned zeros.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_movpi64_epi64(__m64 __a)
    +{
    +  return __extension__ (__m128i)(__v2di){ (long long)__a, 0 };
    +}
    +
    +/// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
    +///    integer vector, zeroing the upper bits.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector operand. The lower 64 bits are moved to the
    +///    destination.
    +/// \returns A 128-bit integer vector. The lower 64 bits contain the value 
from
    +///    the operand. The upper 64 bits are assigned zeros.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_move_epi64(__m128i __a)
    +{
    +#ifdef __GNUC__
    +  return (__m128i)__builtin_ia32_movq128 ((__v2di) __a);
    +#else
    +  return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
    +#endif
    +}
    +
    +/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
    +///    [2 x double] and interleaves them into a 128-bit vector of [2 x
    +///    double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. \n
    +///    Bits [127:64] are written to bits [63:0] of the destination.
    +/// \param __b
    +///    A 128-bit vector of [2 x double]. \n
    +///    Bits [127:64] are written to bits [127:64] of the destination.
    +/// \returns A 128-bit vector of [2 x double] containing the interleaved 
values.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_unpackhi_pd(__m128d __a, __m128d __b)
    +{
    +#ifdef __GNUC__
    +  return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__a, (__v2df)__b);
    +#else
    +  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
    +#endif
    +}
    +
    +/// Unpacks the low-order 64-bit elements from two 128-bit vectors
    +///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
    +///    double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. \n
    +///    Bits [63:0] are written to bits [63:0] of the destination.
    +/// \param __b
    +///    A 128-bit vector of [2 x double]. \n
    +///    Bits [63:0] are written to bits [127:64] of the destination.
    +/// \returns A 128-bit vector of [2 x double] containing the interleaved 
values.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_unpacklo_pd(__m128d __a, __m128d __b)
    +{
    +#ifdef __GNUC__
    +  return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__a, (__v2df)__b);
    +#else
    +  return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
    +#endif
    +}
    +
    +/// Extracts the sign bits of the double-precision values in the 128-bit
    +///    vector of [2 x double], zero-extends the value, and writes it to the
    +///    low-order bits of the destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> 
instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing the values with sign 
bits to
    +///    be extracted.
    +/// \returns The sign bits from each of the double-precision elements in 
\a __a,
    +///    written to bits [1:0]. The remaining bits are assigned values of 
zero.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_movemask_pd(__m128d __a)
    +{
    +  return __builtin_ia32_movmskpd((__v2df)__a);
    +}
    +
    +
    +/// Constructs a 128-bit floating-point vector of [2 x double] from two
    +///    128-bit vector parameters of [2 x double], using the immediate-value
    +///     parameter as a specifier.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> 
instruction.
    +///
    +/// \param a
    +///    A 128-bit vector of [2 x double].
    +/// \param b
    +///    A 128-bit vector of [2 x double].
    +/// \param i
    +///    An 8-bit immediate value. The least significant two bits specify 
which
    +///    elements to copy from \a a and \a b: \n
    +///    Bit[0] = 0: lower element of \a a copied to lower element of 
result. \n
    +///    Bit[0] = 1: upper element of \a a copied to lower element of 
result. \n
    +///    Bit[1] = 0: lower element of \a b copied to upper element of 
result. \n
    +///    Bit[1] = 1: upper element of \a b copied to upper element of 
result. \n
    +/// \returns A 128-bit vector of [2 x double] containing the shuffled 
values.
    +#define _mm_shuffle_pd(a, b, i) \
    +  (__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), \
    +                                 (__v2df)(__m128d)(b), \
    +                                 (int)(i))
    +
    +/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
    +///    floating-point vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \param __a
    +///    A 128-bit floating-point vector of [2 x double].
    +/// \returns A 128-bit floating-point vector of [4 x float] containing the 
same
    +///    bitwise pattern as the parameter.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_castpd_ps(__m128d __a)
    +{
    +  return (__m128)__a;
    +}
    +
    +/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
    +///    integer vector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \param __a
    +///    A 128-bit floating-point vector of [2 x double].
    +/// \returns A 128-bit integer vector containing the same bitwise pattern 
as the
    +///    parameter.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_castpd_si128(__m128d __a)
    +{
    +  return (__m128i)__a;
    +}
    +
    +/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
    +///    floating-point vector of [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \param __a
    +///    A 128-bit floating-point vector of [4 x float].
    +/// \returns A 128-bit floating-point vector of [2 x double] containing 
the same
    +///    bitwise pattern as the parameter.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_castps_pd(__m128 __a)
    +{
    +  return (__m128d)__a;
    +}
    +
    +/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
    +///    integer vector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \param __a
    +///    A 128-bit floating-point vector of [4 x float].
    +/// \returns A 128-bit integer vector containing the same bitwise pattern 
as the
    +///    parameter.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_castps_si128(__m128 __a)
    +{
    +  return (__m128i)__a;
    +}
    +
    +/// Casts a 128-bit integer vector into a 128-bit floating-point vector
    +///    of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector.
    +/// \returns A 128-bit floating-point vector of [4 x float] containing the 
same
    +///    bitwise pattern as the parameter.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_castsi128_ps(__m128i __a)
    +{
    +  return (__m128)__a;
    +}
    +
    +/// Casts a 128-bit integer vector into a 128-bit floating-point vector
    +///    of [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector.
    +/// \returns A 128-bit floating-point vector of [2 x double] containing 
the same
    +///    bitwise pattern as the parameter.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_castsi128_pd(__m128i __a)
    +{
    +  return (__m128d)__a;
    +}
    +
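The casts are pure bit reinterpretations, which makes integer tricks on FP data
cheap; a sketch (my example, assumes _mm_and_si128 from elsewhere in this
header):

  #include <emmintrin.h>

  /* Clear the sign bit of both doubles via the integer domain; the
   * casts themselves emit no instructions. */
  static inline __m128d fabs_pd(__m128d x)
  {
    __m128i mask = _mm_set_epi32(0x7fffffff, -1, 0x7fffffff, -1);

    return _mm_castsi128_pd(_mm_and_si128(_mm_castpd_si128(x), mask));
  }
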
    +#if defined(__cplusplus)
    +extern "C" {
    +#endif
    +
    +/// Indicates that a spin loop is being executed for the purposes of
    +///    optimizing power consumption during the loop.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
    +///
    +void _mm_pause(void);
    +
    +#if defined(__cplusplus)
    +} // extern "C"
    +#endif
    +#undef __DEFAULT_FN_ATTRS
    +#undef __DEFAULT_FN_ATTRS_MMX
    +
    +#ifndef _MM_DENORMALS_ZERO_ON
    +#define _MM_DENORMALS_ZERO_ON   (0x0040)
    +#endif
    +#ifndef _MM_DENORMALS_ZERO_OFF
    +#define _MM_DENORMALS_ZERO_OFF  (0x0000)
    +#endif
    +
    +#ifndef _MM_DENORMALS_ZERO_MASK
    +#define _MM_DENORMALS_ZERO_MASK (0x0040)
    +#endif
    +
    +#ifndef _MM_GET_DENORMALS_ZERO_MODE
    +#define _MM_GET_DENORMALS_ZERO_MODE() \
    +    (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
    +#define _MM_SET_DENORMALS_ZERO_MODE(x) \
    +    (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
    +#endif
    +
    +#endif /* __EMMINTRIN_H */
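For the denormals-are-zero helpers at the end of the header, a usage sketch
(my example; it assumes _mm_getcsr()/_mm_setcsr() are provided via xmmintrin.h
in this port):

  #include <emmintrin.h>

  /* Set the DAZ bit in MXCSR so denormal inputs are treated as zero,
   * avoiding microcode assists in tight FP loops. */
  static inline void enable_daz(void)
  {
    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
  }
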
    diff --git a/include/immintrin.h b/include/immintrin.h
    new file mode 100644
    index 0000000..546005b
    --- /dev/null
    +++ b/include/immintrin.h
    @@ -0,0 +1,75 @@
    +/*===---- immintrin.h - Intel intrinsics -----------------------------------===
    + *
    + * Permission is hereby granted, free of charge, to any person obtaining a 
copy
    + * of this software and associated documentation files (the "Software"), 
to deal
    + * in the Software without restriction, including without limitation the 
rights
    + * to use, copy, modify, merge, publish, distribute, sublicense, and/or 
sell
    + * copies of the Software, and to permit persons to whom the Software is
    + * furnished to do so, subject to the following conditions:
    + *
    + * The above copyright notice and this permission notice shall be included 
in
    + * all copies or substantial portions of the Software.
    + *
    + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 
OR
    + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
THE
    + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
    + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 
IN
    + * THE SOFTWARE.
    + *
    + *===-----------------------------------------------------------------------===
    + */
    +
    +#ifndef __IMMINTRIN_H
    +#define __IMMINTRIN_H
    +
    +#if defined(__MMX__)
    +#include <mmintrin.h>
    +#endif
    +
    +#if defined(__SSE__)
    +#include <xmmintrin.h>
    +#endif
    +
    +#if defined(__SSE2__)
    +#include <emmintrin.h>
    +#endif
    +
    +#if defined(__SSE3__)
    +#include <pmmintrin.h>
    +#endif
    +
    +#if defined(__SSSE3__)
    +#include <tmmintrin.h>
    +#endif
    +
    +#if \
    +    (defined(__SSE4_2__) || defined(__SSE4_1__))
    +#include <smmintrin.h>
    +#endif
    +
    +#if defined(__AVX__)
    +#include <avxintrin.h>
    +#endif
    +
    +#if defined(__POPCNT__)
    +#include <popcntintrin.h>
    +#endif
    +
    +
    +/* __bit_scan_forward */
    +/*
    +static __inline__ int __attribute__((__always_inline__, __nodebug__))
    +_bit_scan_forward(int __A) {
    +  return __builtin_ctz(__A);
    +}
    +*/
    +/* __bit_scan_reverse */
    +/*
    +static __inline__ int __attribute__((__always_inline__, __nodebug__))
    +_bit_scan_reverse(int __A) {
    +  return 31 - __builtin_clz(__A);
    +}
    +*/
    +#endif /* __IMMINTRIN_H */
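For context, a sketch of how an application consumes the umbrella header; which
sub-headers get pulled in depends on the compiler flags derived from the SIMD_*
options in Config.uk (illustrative only, assuming -msse2 is in effect):

  #include <immintrin.h>

  /* With __SSE2__ defined, emmintrin.h is included above and the SSE2
   * types and intrinsics become visible. */
  static inline __m128i zero128(void)
  {
    return _mm_setzero_si128();
  }
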
    diff --git a/include/mm_malloc.h b/include/mm_malloc.h
    new file mode 100644
    index 0000000..305afd3
    --- /dev/null
    +++ b/include/mm_malloc.h
    @@ -0,0 +1,75 @@
    +/*===---- mm_malloc.h - Allocating and Freeing Aligned Memory Blocks -------===
    + *
    + * Permission is hereby granted, free of charge, to any person obtaining a 
copy
    + * of this software and associated documentation files (the "Software"), 
to deal
    + * in the Software without restriction, including without limitation the 
rights
    + * to use, copy, modify, merge, publish, distribute, sublicense, and/or 
sell
    + * copies of the Software, and to permit persons to whom the Software is
    + * furnished to do so, subject to the following conditions:
    + *
    + * The above copyright notice and this permission notice shall be included 
in
    + * all copies or substantial portions of the Software.
    + *
    + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 
OR
    + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
THE
    + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
    + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 
IN
    + * THE SOFTWARE.
    + *
    + *===-----------------------------------------------------------------------===
    + */
    +
    +#ifndef __MM_MALLOC_H
    +#define __MM_MALLOC_H
    +
    +#include <stdlib.h>
    +
    +#ifdef _WIN32
    +#include <malloc.h>
    +#else
    +#ifndef __cplusplus
    +extern int posix_memalign(void **__memptr, size_t __alignment,
    +                          size_t __size);
    +#else
    +// Some systems (e.g. those with GNU libc) declare posix_memalign with an
    +// exception specifier. Via an "egregious workaround" in
    +// Sema::CheckEquivalentExceptionSpec, Clang accepts the following as a
    +// valid redeclaration of glibc's declaration.
    +extern "C" int posix_memalign(void **__memptr, size_t __alignment,
    +                              size_t __size);
    +#endif
    +#endif
    +
    +#if !(defined(_WIN32) && defined(_mm_malloc))
    +static __inline__ void *__attribute__((__always_inline__, __nodebug__,
    +                                       __malloc__))
    +_mm_malloc(size_t __size, size_t __align)
    +{
    +  if (__align == 1) {
    +    return malloc(__size);
    +  }
    +
    +  if (!(__align & (__align - 1)) && __align < sizeof(void *))
    +    __align = sizeof(void *);
    +
    +  void *__mallocedMemory;
    +#if defined(__MINGW32__)
    +  __mallocedMemory = __mingw_aligned_malloc(__size, __align);
    +#elif defined(_WIN32)
    +  __mallocedMemory = _aligned_malloc(__size, __align);
    +#else
    +  if (posix_memalign(&__mallocedMemory, __align, __size))
    +    return 0;
    +#endif
    +
    +  return __mallocedMemory;
    +}
    +
    +static __inline__ void __attribute__((__always_inline__, __nodebug__))
    +_mm_free(void *__p)
    +{
    +  free(__p);
    +}
    +#endif
    +
    +#endif /* __MM_MALLOC_H */
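A usage sketch for the aligned allocation helpers (my example; on non-Windows
builds it relies on posix_memalign being provided by the libc layer, as declared
above):

  #include <mm_malloc.h>
  #include <emmintrin.h>

  /* Allocate a 64-byte-aligned scratch buffer, use it for an aligned
   * store, and release it with the matching helper. */
  static inline int scratch_demo(void)
  {
    float *buf = (float *)_mm_malloc(256 * sizeof(float), 64);

    if (!buf)
      return -1;
    _mm_store_si128((__m128i *)buf, _mm_setzero_si128());
    _mm_free(buf);
    return 0;
  }
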
    diff --git a/include/mmintrin.h b/include/mmintrin.h
    new file mode 100644
    index 0000000..a5c2829
    --- /dev/null
    +++ b/include/mmintrin.h
    @@ -0,0 +1,1598 @@
    +/*===---- mmintrin.h - MMX intrinsics --------------------------------------===
    + *
    + * Permission is hereby granted, free of charge, to any person obtaining a 
copy
    + * of this software and associated documentation files (the "Software"), 
to deal
    + * in the Software without restriction, including without limitation the 
rights
    + * to use, copy, modify, merge, publish, distribute, sublicense, and/or 
sell
    + * copies of the Software, and to permit persons to whom the Software is
    + * furnished to do so, subject to the following conditions:
    + *
    + * The above copyright notice and this permission notice shall be included 
in
    + * all copies or substantial portions of the Software.
    + *
    + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 
OR
    + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
THE
    + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
    + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 
IN
    + * THE SOFTWARE.
    + *
    + *===-----------------------------------------------------------------------===
    + */
    +
    +#ifndef __MMINTRIN_H
    +#define __MMINTRIN_H
    +
    +#ifdef __GNUC__
    +typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
    +#else
    +typedef long long __m64 __attribute__((__vector_size__(8)));
    +#endif
    +typedef int __m64_u __attribute__ ((__vector_size__ (8), __may_alias__,
    +                                    __aligned__ (1)));
    +
    +typedef long long __v1di __attribute__((__vector_size__(8)));
    +typedef int __v2si __attribute__((__vector_size__(8)));
    +typedef short __v4hi __attribute__((__vector_size__(8)));
    +typedef char __v8qi __attribute__((__vector_size__(8)));
    +typedef float __v2sf __attribute__ ((__vector_size__ (8)));
    +/* Define the default attributes for the functions in this file. */
    +#ifdef __GNUC__
    +#define __DEFAULT_FN_ATTRS \
    +    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    +#else
    +#define __DEFAULT_FN_ATTRS \
    +    __attribute__((__always_inline__, __nodebug__, __target__("mmx"), \
    +                   __min_vector_width__(64)))
    +#endif
    +
    +/// Clears the MMX state by setting the state of the x87 stack registers
    +///    to empty.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> EMMS </c> instruction.
    +///
    +static __inline__ void
    +#ifdef __GNUC__
    +__attribute__((__gnu_inline__, __always_inline__, __artificial__))
    +#else
    +__attribute__((__always_inline__, __nodebug__, __target__("mmx")))
    +#endif
    +_mm_empty(void)
    +{
    +    __builtin_ia32_emms();
    +}
    +
    +/// Constructs a 64-bit integer vector, setting the lower 32 bits to the
    +///    value of the 32-bit integer parameter and setting the upper 32 bits 
to 0.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> MOVD </c> instruction.
    +///
    +/// \param __i
    +///    A 32-bit integer value.
    +/// \returns A 64-bit integer vector. The lower 32 bits contain the value 
of the
    +///    parameter. The upper 32 bits are set to 0.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_cvtsi32_si64(int __i)
    +{
    +    return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
    +}
    +
    +/// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
    +///    signed integer.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> MOVD </c> instruction.
    +///
    +/// \param __m
    +///    A 64-bit integer vector.
    +/// \returns A 32-bit signed integer value containing the lower 32 bits of 
the
    +///    parameter.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_cvtsi64_si32(__m64 __m)
    +{
    +    return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
    +}
    +
    +/// Casts a 64-bit signed integer value into a 64-bit integer vector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> MOVQ </c> instruction.
    +///
    +/// \param __i
    +///    A 64-bit signed integer.
    +/// \returns A 64-bit integer vector containing the same bitwise pattern 
as the
    +///    parameter.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_cvtsi64_m64(long long __i)
    +{
    +    return (__m64)__i;
    +}
    +
    +/// Casts a 64-bit integer vector into a 64-bit signed integer value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> MOVQ </c> instruction.
    +///
    +/// \param __m
    +///    A 64-bit integer vector.
    +/// \returns A 64-bit signed integer containing the same bitwise pattern 
as the
    +///    parameter.
    +static __inline__ long long __DEFAULT_FN_ATTRS
    +_mm_cvtm64_si64(__m64 __m)
    +{
    +    return (long long)__m;
    +}
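
    The four conversion/cast intrinsics above round-trip cleanly; a small
    sketch for illustration:

        int       lo   = 42;
        __m64     v    = _mm_cvtsi32_si64(lo);   /* lower 32 bits = 42, upper = 0 */
        int       back = _mm_cvtsi64_si32(v);    /* back == 42                    */
        long long bits = _mm_cvtm64_si64(v);     /* raw 64-bit pattern            */
        __m64     w    = _mm_cvtsi64_m64(bits);  /* same bit pattern as v         */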
    +
    +/// Converts 16-bit signed integers from both 64-bit integer vector
    +///    parameters of [4 x i16] into 8-bit signed integer values, and 
constructs
    +///    a 64-bit integer vector of [8 x i8] as the result. Positive values
    +///    greater than 0x7F are saturated to 0x7F. Negative values less than 
0x80
    +///    are saturated to 0x80.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PACKSSWB </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is 
treated as a
    +///    16-bit signed integer and is converted to an 8-bit signed integer 
with
    +///    saturation. Positive values greater than 0x7F are saturated to 0x7F.
    +///    Negative values less than 0x80 are saturated to 0x80. The converted
    +///    [4 x i8] values are written to the lower 32 bits of the result.
    +/// \param __m2
    +///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is 
treated as a
    +///    16-bit signed integer and is converted to an 8-bit signed integer 
with
    +///    saturation. Positive values greater than 0x7F are saturated to 0x7F.
    +///    Negative values less than 0x80 are saturated to 0x80. The converted
    +///    [4 x i8] values are written to the upper 32 bits of the result.
    +/// \returns A 64-bit integer vector of [8 x i8] containing the converted
    +///    values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_packs_pi16(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
    +}
    +
    +/// Converts 32-bit signed integers from both 64-bit integer vector
    +///    parameters of [2 x i32] into 16-bit signed integer values, and 
constructs
    +///    a 64-bit integer vector of [4 x i16] as the result. Positive values
    +///    greater than 0x7FFF are saturated to 0x7FFF. Negative values less 
than
    +///    0x8000 are saturated to 0x8000.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PACKSSDW </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [2 x i32]. Each 32-bit element is 
treated as a
    +///    32-bit signed integer and is converted to a 16-bit signed integer 
with
    +///    saturation. Positive values greater than 0x7FFF are saturated to 
0x7FFF.
    +///    Negative values less than 0x8000 are saturated to 0x8000. The 
converted
    +///    [2 x i16] values are written to the lower 32 bits of the result.
    +/// \param __m2
    +///    A 64-bit integer vector of [2 x i32]. Each 32-bit element is 
treated as a
    +///    32-bit signed integer and is converted to a 16-bit signed integer 
with
    +///    saturation. Positive values greater than 0x7FFF are saturated to 
0x7FFF.
    +///    Negative values less than 0x8000 are saturated to 0x8000. The 
converted
    +///    [2 x i16] values are written to the upper 32 bits of the result.
    +/// \returns A 64-bit integer vector of [4 x i16] containing the converted
    +///    values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_packs_pi32(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
    +}
    +
    +/// Converts 16-bit signed integers from both 64-bit integer vector
    +///    parameters of [4 x i16] into 8-bit unsigned integer values, and
    +///    constructs a 64-bit integer vector of [8 x i8] as the result. Values
    +///    greater than 0xFF are saturated to 0xFF. Values less than 0 are 
saturated
    +///    to 0.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PACKUSWB </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is 
treated as a
    +///    16-bit signed integer and is converted to an 8-bit unsigned integer 
with
    +///    saturation. Values greater than 0xFF are saturated to 0xFF. Values 
less
    +///    than 0 are saturated to 0. The converted [4 x i8] values are 
written to
    +///    the lower 32 bits of the result.
    +/// \param __m2
    +///    A 64-bit integer vector of [4 x i16]. Each 16-bit element is 
treated as a
    +///    16-bit signed integer and is converted to an 8-bit unsigned integer 
with
    +///    saturation. Values greater than 0xFF are saturated to 0xFF. Values 
less
    +///    than 0 are saturated to 0. The converted [4 x i8] values are 
written to
    +///    the upper 32 bits of the result.
    +/// \returns A 64-bit integer vector of [8 x i8] containing the converted
    +///    values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_packs_pu16(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
    +}
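
    To illustrate the difference between the signed and unsigned pack variants
    above (a sketch; _mm_set_pi16 is declared further down in this file):

        __m64 a = _mm_set_pi16(300, -200, 127, -1);
        __m64 s = _mm_packs_pi16(a, a);  /* signed:   300 -> 127, -200 -> -128 */
        __m64 u = _mm_packs_pu16(a, a);  /* unsigned: 300 -> 255, -200 -> 0    */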
    +
    +/// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
    +///    and interleaves them into a 64-bit integer vector of [8 x i8].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PUNPCKHBW </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [8 x i8]. \n
    +///    Bits [39:32] are written to bits [7:0] of the result. \n
    +///    Bits [47:40] are written to bits [23:16] of the result. \n
    +///    Bits [55:48] are written to bits [39:32] of the result. \n
    +///    Bits [63:56] are written to bits [55:48] of the result.
    +/// \param __m2
    +///    A 64-bit integer vector of [8 x i8].
    +///    Bits [39:32] are written to bits [15:8] of the result. \n
    +///    Bits [47:40] are written to bits [31:24] of the result. \n
    +///    Bits [55:48] are written to bits [47:40] of the result. \n
    +///    Bits [63:56] are written to bits [63:56] of the result.
    +/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
    +///    values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
    +}
    +
    +/// Unpacks the upper 32 bits from two 64-bit integer vectors of
    +///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x 
i16].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PUNPCKHWD </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [4 x i16].
    +///    Bits [47:32] are written to bits [15:0] of the result. \n
    +///    Bits [63:48] are written to bits [47:32] of the result.
    +/// \param __m2
    +///    A 64-bit integer vector of [4 x i16].
    +///    Bits [47:32] are written to bits [31:16] of the result. \n
    +///    Bits [63:48] are written to bits [63:48] of the result.
    +/// \returns A 64-bit integer vector of [4 x i16] containing the 
interleaved
    +///    values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
    +}
    +
    +/// Unpacks the upper 32 bits from two 64-bit integer vectors of
    +///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x 
i32].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PUNPCKHDQ </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [2 x i32]. The upper 32 bits are written 
to
    +///    the lower 32 bits of the result.
    +/// \param __m2
    +///    A 64-bit integer vector of [2 x i32]. The upper 32 bits are written 
to
    +///    the upper 32 bits of the result.
    +/// \returns A 64-bit integer vector of [2 x i32] containing the 
interleaved
    +///    values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
    +}
    +
    +/// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
    +///    and interleaves them into a 64-bit integer vector of [8 x i8].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PUNPCKLBW </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [8 x i8].
    +///    Bits [7:0] are written to bits [7:0] of the result. \n
    +///    Bits [15:8] are written to bits [23:16] of the result. \n
    +///    Bits [23:16] are written to bits [39:32] of the result. \n
    +///    Bits [31:24] are written to bits [55:48] of the result.
    +/// \param __m2
    +///    A 64-bit integer vector of [8 x i8].
    +///    Bits [7:0] are written to bits [15:8] of the result. \n
    +///    Bits [15:8] are written to bits [31:24] of the result. \n
    +///    Bits [23:16] are written to bits [47:40] of the result. \n
    +///    Bits [31:24] are written to bits [63:56] of the result.
    +/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
    +///    values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
    +}
    +
    +/// Unpacks the lower 32 bits from two 64-bit integer vectors of
    +///    [4 x i16] and interleaves them into a 64-bit integer vector of [4 x 
i16].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PUNPCKLWD </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [4 x i16].
    +///    Bits [15:0] are written to bits [15:0] of the result. \n
    +///    Bits [31:16] are written to bits [47:32] of the result.
    +/// \param __m2
    +///    A 64-bit integer vector of [4 x i16].
    +///    Bits [15:0] are written to bits [31:16] of the result. \n
    +///    Bits [31:16] are written to bits [63:48] of the result.
    +/// \returns A 64-bit integer vector of [4 x i16] containing the 
interleaved
    +///    values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
    +}
    +
    +/// Unpacks the lower 32 bits from two 64-bit integer vectors of
    +///    [2 x i32] and interleaves them into a 64-bit integer vector of [2 x 
i32].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PUNPCKLDQ </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [2 x i32]. The lower 32 bits are written 
to
    +///    the lower 32 bits of the result.
    +/// \param __m2
    +///    A 64-bit integer vector of [2 x i32]. The lower 32 bits are written 
to
    +///    the upper 32 bits of the result.
    +/// \returns A 64-bit integer vector of [2 x i32] containing the 
interleaved
    +///    values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
    +}
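
    A small interleave example for the unpack intrinsics, with result elements
    listed from low byte to high byte (_mm_set_pi8 is declared further down in
    this file):

        __m64 lo = _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);         /* bytes 0..7   */
        __m64 hi = _mm_set_pi8(17, 16, 15, 14, 13, 12, 11, 10); /* bytes 10..17 */
        __m64 a  = _mm_unpacklo_pi8(lo, hi);  /* 0,10,1,11,2,12,3,13 */
        __m64 b  = _mm_unpackhi_pi8(lo, hi);  /* 4,14,5,15,6,16,7,17 */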
    +
    +/// Adds each 8-bit integer element of the first 64-bit integer vector
    +///    of [8 x i8] to the corresponding 8-bit integer element of the second
    +///    64-bit integer vector of [8 x i8]. The lower 8 bits of the results 
are
    +///    packed into a 64-bit integer vector of [8 x i8].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PADDB </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [8 x i8].
    +/// \param __m2
    +///    A 64-bit integer vector of [8 x i8].
    +/// \returns A 64-bit integer vector of [8 x i8] containing the sums of 
both
    +///    parameters.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_add_pi8(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
    +}
    +
    +/// Adds each 16-bit integer element of the first 64-bit integer vector
    +///    of [4 x i16] to the corresponding 16-bit integer element of the 
second
    +///    64-bit integer vector of [4 x i16]. The lower 16 bits of the 
results are
    +///    packed into a 64-bit integer vector of [4 x i16].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PADDW </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [4 x i16].
    +/// \param __m2
    +///    A 64-bit integer vector of [4 x i16].
    +/// \returns A 64-bit integer vector of [4 x i16] containing the sums of 
both
    +///    parameters.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_add_pi16(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
    +}
    +
    +/// Adds each 32-bit integer element of the first 64-bit integer vector
    +///    of [2 x i32] to the corresponding 32-bit integer element of the 
second
    +///    64-bit integer vector of [2 x i32]. The lower 32 bits of the 
results are
    +///    packed into a 64-bit integer vector of [2 x i32].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PADDD </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [2 x i32].
    +/// \param __m2
    +///    A 64-bit integer vector of [2 x i32].
    +/// \returns A 64-bit integer vector of [2 x i32] containing the sums of 
both
    +///    parameters.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_add_pi32(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
    +}
    +
    +/// Adds each 8-bit signed integer element of the first 64-bit integer
    +///    vector of [8 x i8] to the corresponding 8-bit signed integer 
element of
    +///    the second 64-bit integer vector of [8 x i8]. Positive sums greater 
than
    +///    0x7F are saturated to 0x7F. Negative sums less than 0x80 are 
saturated to
    +///    0x80. The results are packed into a 64-bit integer vector of [8 x 
i8].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PADDSB </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [8 x i8].
    +/// \param __m2
    +///    A 64-bit integer vector of [8 x i8].
    +/// \returns A 64-bit integer vector of [8 x i8] containing the saturated 
sums
    +///    of both parameters.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_adds_pi8(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
    +}
    +
    +/// Adds each 16-bit signed integer element of the first 64-bit integer
    +///    vector of [4 x i16] to the corresponding 16-bit signed integer 
element of
    +///    the second 64-bit integer vector of [4 x i16]. Positive sums 
greater than
    +///    0x7FFF are saturated to 0x7FFF. Negative sums less than 0x8000 are
    +///    saturated to 0x8000. The results are packed into a 64-bit integer 
vector
    +///    of [4 x i16].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PADDSW </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [4 x i16].
    +/// \param __m2
    +///    A 64-bit integer vector of [4 x i16].
    +/// \returns A 64-bit integer vector of [4 x i16] containing the saturated 
sums
    +///    of both parameters.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_adds_pi16(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
    +}
    +
    +/// Adds each 8-bit unsigned integer element of the first 64-bit integer
    +///    vector of [8 x i8] to the corresponding 8-bit unsigned integer 
element of
    +///    the second 64-bit integer vector of [8 x i8]. Sums greater than 
0xFF are
    +///    saturated to 0xFF. The results are packed into a 64-bit integer 
vector of
    +///    [8 x i8].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PADDUSB </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [8 x i8].
    +/// \param __m2
    +///    A 64-bit integer vector of [8 x i8].
    +/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
    +///    unsigned sums of both parameters.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_adds_pu8(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
    +}
    +
    +/// Adds each 16-bit unsigned integer element of the first 64-bit integer
    +///    vector of [4 x i16] to the corresponding 16-bit unsigned integer 
element
    +///    of the second 64-bit integer vector of [4 x i16]. Sums greater than
    +///    0xFFFF are saturated to 0xFFFF. The results are packed into a 64-bit
    +///    integer vector of [4 x i16].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PADDUSW </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [4 x i16].
    +/// \param __m2
    +///    A 64-bit integer vector of [4 x i16].
    +/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
    +///    unsigned sums of both parameters.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_adds_pu16(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
    +}
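
    The difference between the wrapping and saturating adds above, sketched
    with 8-bit lanes:

        __m64 a    = _mm_set1_pi8(100);
        __m64 b    = _mm_set1_pi8(100);
        __m64 wrap = _mm_add_pi8(a, b);   /* 200 wraps to -56 (0xC8) per lane  */
        __m64 sat  = _mm_adds_pi8(a, b);  /* signed saturation clamps at 127   */
        __m64 usat = _mm_adds_pu8(a, b);  /* unsigned: 200 fits, no saturation */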
    +
    +/// Subtracts each 8-bit integer element of the second 64-bit integer
    +///    vector of [8 x i8] from the corresponding 8-bit integer element of 
the
    +///    first 64-bit integer vector of [8 x i8]. The lower 8 bits of the 
results
    +///    are packed into a 64-bit integer vector of [8 x i8].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PSUBB </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [8 x i8] containing the minuends.
    +/// \param __m2
    +///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
    +/// \returns A 64-bit integer vector of [8 x i8] containing the 
differences of
    +///    both parameters.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_sub_pi8(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
    +}
    +
    +/// Subtracts each 16-bit integer element of the second 64-bit integer
    +///    vector of [4 x i16] from the corresponding 16-bit integer element 
of the
    +///    first 64-bit integer vector of [4 x i16]. The lower 16 bits of the
    +///    results are packed into a 64-bit integer vector of [4 x i16].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PSUBW </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [4 x i16] containing the minuends.
    +/// \param __m2
    +///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
    +/// \returns A 64-bit integer vector of [4 x i16] containing the 
differences of
    +///    both parameters.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_sub_pi16(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
    +}
    +
    +/// Subtracts each 32-bit integer element of the second 64-bit integer
    +///    vector of [2 x i32] from the corresponding 32-bit integer element 
of the
    +///    first 64-bit integer vector of [2 x i32]. The lower 32 bits of the
    +///    results are packed into a 64-bit integer vector of [2 x i32].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PSUBD </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [2 x i32] containing the minuends.
    +/// \param __m2
    +///    A 64-bit integer vector of [2 x i32] containing the subtrahends.
    +/// \returns A 64-bit integer vector of [2 x i32] containing the 
differences of
    +///    both parameters.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_sub_pi32(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
    +}
    +
    +/// Subtracts each 8-bit signed integer element of the second 64-bit
    +///    integer vector of [8 x i8] from the corresponding 8-bit signed 
integer
    +///    element of the first 64-bit integer vector of [8 x i8]. Positive 
results
    +///    greater than 0x7F are saturated to 0x7F. Negative results less than 
0x80
    +///    are saturated to 0x80. The results are packed into a 64-bit integer
    +///    vector of [8 x i8].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PSUBSB </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [8 x i8] containing the minuends.
    +/// \param __m2
    +///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
    +/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
    +///    differences of both parameters.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_subs_pi8(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
    +}
    +
    +/// Subtracts each 16-bit signed integer element of the second 64-bit
    +///    integer vector of [4 x i16] from the corresponding 16-bit signed 
integer
    +///    element of the first 64-bit integer vector of [4 x i16]. Positive 
results
    +///    greater than 0x7FFF are saturated to 0x7FFF. Negative results less 
than
    +///    0x8000 are saturated to 0x8000. The results are packed into a 64-bit
    +///    integer vector of [4 x i16].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PSUBSW </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [4 x i16] containing the minuends.
    +/// \param __m2
    +///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
    +/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
    +///    differences of both parameters.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_subs_pi16(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
    +}
    +
    +/// Subtracts each 8-bit unsigned integer element of the second 64-bit
    +///    integer vector of [8 x i8] from the corresponding 8-bit unsigned 
integer
    +///    element of the first 64-bit integer vector of [8 x i8].
    +///
    +///    If an element of the first vector is less than the corresponding 
element
    +///    of the second vector, the result is saturated to 0. The results are
    +///    packed into a 64-bit integer vector of [8 x i8].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PSUBUSB </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [8 x i8] containing the minuends.
    +/// \param __m2
    +///    A 64-bit integer vector of [8 x i8] containing the subtrahends.
    +/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
    +///    differences of both parameters.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_subs_pu8(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
    +}
    +
    +/// Subtracts each 16-bit unsigned integer element of the second 64-bit
    +///    integer vector of [4 x i16] from the corresponding 16-bit unsigned
    +///    integer element of the first 64-bit integer vector of [4 x i16].
    +///
    +///    If an element of the first vector is less than the corresponding 
element
    +///    of the second vector, the result is saturated to 0. The results are
    +///    packed into a 64-bit integer vector of [4 x i16].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PSUBUSW </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [4 x i16] containing the minuends.
    +/// \param __m2
    +///    A 64-bit integer vector of [4 x i16] containing the subtrahends.
    +/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
    +///    differences of both parameters.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_subs_pu16(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
    +}
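
    Unsigned saturating subtraction clamps at zero, which gives the usual
    absolute-difference idiom; a sketch (the _mm_or_si64 used here is declared
    further down in this file):

        __m64 a  = _mm_set1_pi16(10);
        __m64 b  = _mm_set1_pi16(30);
        __m64 d1 = _mm_subs_pu16(a, b);  /* 10 - 30 saturates to 0 */
        __m64 d2 = _mm_subs_pu16(b, a);  /* 30 - 10 = 20           */
        __m64 ad = _mm_or_si64(d1, d2);  /* |a - b| per lane       */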
    +
    +/// Multiplies each 16-bit signed integer element of the first 64-bit
    +///    integer vector of [4 x i16] by the corresponding 16-bit signed 
integer
    +///    element of the second 64-bit integer vector of [4 x i16], producing four
    +///    32-bit products. Adjacent pairs of products are added to give two 32-bit sums.
    +///    The lower 32 bits of these two sums are packed into a 64-bit integer
    +///    vector of [2 x i32].
    +///
    +///    For example, bits [15:0] of both parameters are multiplied, bits 
[31:16]
    +///    of both parameters are multiplied, and the sum of both results is 
written
    +///    to bits [31:0] of the result.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PMADDWD </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [4 x i16].
    +/// \param __m2
    +///    A 64-bit integer vector of [4 x i16].
    +/// \returns A 64-bit integer vector of [2 x i32] containing the sums of
    +///    products of both parameters.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_madd_pi16(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
    +}
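
    _mm_madd_pi16 is the usual building block for short dot products; an
    illustrative sketch (the _mm_srli_si64 used to extract the upper lane is
    declared further down in this file):

        __m64 x   = _mm_set_pi16(1, 2, 3, 4);
        __m64 y   = _mm_set_pi16(5, 6, 7, 8);
        __m64 md  = _mm_madd_pi16(x, y);  /* lanes: 4*8 + 3*7 = 53 and 2*6 + 1*5 = 17 */
        int   dot = _mm_cvtsi64_si32(md) +
                    _mm_cvtsi64_si32(_mm_srli_si64(md, 32));  /* 53 + 17 = 70 */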
    +
    +/// Multiplies each 16-bit signed integer element of the first 64-bit
    +///    integer vector of [4 x i16] by the corresponding 16-bit signed 
integer
    +///    element of the second 64-bit integer vector of [4 x i16]. Packs the 
upper
    +///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x 
i16].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PMULHW </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [4 x i16].
    +/// \param __m2
    +///    A 64-bit integer vector of [4 x i16].
    +/// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 
bits
    +///    of the products of both parameters.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
    +}
    +
    +/// Multiplies each 16-bit signed integer element of the first 64-bit
    +///    integer vector of [4 x i16] by the corresponding 16-bit signed 
integer
    +///    element of the second 64-bit integer vector of [4 x i16]. Packs the 
lower
    +///    16 bits of the 32-bit products into a 64-bit integer vector of [4 x 
i16].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PMULLW </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [4 x i16].
    +/// \param __m2
    +///    A 64-bit integer vector of [4 x i16].
    +/// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 
bits
    +///    of the products of both parameters.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_mullo_pi16(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
    +}
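
    Combining _mm_mullo_pi16 and _mm_mulhi_pi16 with the unpack intrinsics
    above recovers full 32-bit products; a sketch:

        __m64 a   = _mm_set1_pi16(1000);
        __m64 b   = _mm_set1_pi16(2000);
        __m64 lo  = _mm_mullo_pi16(a, b);       /* low 16 bits of 2000000 (0x8480) */
        __m64 hi  = _mm_mulhi_pi16(a, b);       /* high 16 bits (0x001E)           */
        __m64 p01 = _mm_unpacklo_pi16(lo, hi);  /* two full 32-bit products        */
        __m64 p23 = _mm_unpackhi_pi16(lo, hi);  /* the remaining two               */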
    +
    +/// Left-shifts each 16-bit signed integer element of the first
    +///    parameter, which is a 64-bit integer vector of [4 x i16], by the 
number
    +///    of bits specified by the second parameter, which is a 64-bit 
integer. The
    +///    lower 16 bits of the results are packed into a 64-bit integer 
vector of
    +///    [4 x i16].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
    +///
    +/// \param __m
    +///    A 64-bit integer vector of [4 x i16].
    +/// \param __count
    +///    A 64-bit integer vector interpreted as a single 64-bit integer.
    +/// \returns A 64-bit integer vector of [4 x i16] containing the 
left-shifted
    +///    values. If \a __count is greater than or equal to 16, the result is set
    +///    to all 0.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_sll_pi16(__m64 __m, __m64 __count)
    +{
    +#ifdef __GNUC__
    +    return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (__v4hi)__count);
    +#else
    +    return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
    +#endif
    +}
    +
    +/// Left-shifts each 16-bit signed integer element of a 64-bit integer
    +///    vector of [4 x i16] by the number of bits specified by a 32-bit 
integer.
    +///    The lower 16 bits of the results are packed into a 64-bit integer 
vector
    +///    of [4 x i16].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
    +///
    +/// \param __m
    +///    A 64-bit integer vector of [4 x i16].
    +/// \param __count
    +///    A 32-bit integer value.
    +/// \returns A 64-bit integer vector of [4 x i16] containing the 
left-shifted
    +///    values. If \a __count is greater than or equal to 16, the result is set
    +///    to all 0.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_slli_pi16(__m64 __m, int __count)
    +{
    +    return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
    +}
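
    Shifting by an immediate is the cheap way to scale by powers of two; a
    sketch (the _mm_srli_pi16 used here is declared further down in this file):

        __m64 v  = _mm_set1_pi16(5);
        __m64 x8 = _mm_slli_pi16(v, 3);   /* 5 << 3 = 40 in every lane    */
        __m64 q  = _mm_srli_pi16(x8, 2);  /* 40 >> 2 = 10 (logical shift) */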
    +
    +/// Left-shifts each 32-bit signed integer element of the first
    +///    parameter, which is a 64-bit integer vector of [2 x i32], by the 
number
    +///    of bits specified by the second parameter, which is a 64-bit 
integer. The
    +///    lower 32 bits of the results are packed into a 64-bit integer 
vector of
    +///    [2 x i32].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
    +///
    +/// \param __m
    +///    A 64-bit integer vector of [2 x i32].
    +/// \param __count
    +///    A 64-bit integer vector interpreted as a single 64-bit integer.
    +/// \returns A 64-bit integer vector of [2 x i32] containing the 
left-shifted
    +///    values. If \a __count is greater than or equal to 32, the result is set
    +///    to all 0.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_sll_pi32(__m64 __m, __m64 __count)
    +{
    +    return (__m64)__builtin_ia32_pslld((__v2si)__m, (__v2si)__count);
    +}
    +
    +/// Left-shifts each 32-bit signed integer element of a 64-bit integer
    +///    vector of [2 x i32] by the number of bits specified by a 32-bit 
integer.
    +///    The lower 32 bits of the results are packed into a 64-bit integer 
vector
    +///    of [2 x i32].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
    +///
    +/// \param __m
    +///    A 64-bit integer vector of [2 x i32].
    +/// \param __count
    +///    A 32-bit integer value.
    +/// \returns A 64-bit integer vector of [2 x i32] containing the 
left-shifted
    +///    values. If \a __count is greater than or equal to 32, the result is set
    +///    to all 0.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_slli_pi32(__m64 __m, int __count)
    +{
    +    return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
    +}
    +
    +/// Left-shifts the first 64-bit integer parameter by the number of bits
    +///    specified by the second 64-bit integer parameter. The lower 64 bits of
    +///    the result are returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
    +///
    +/// \param __m
    +///    A 64-bit integer vector interpreted as a single 64-bit integer.
    +/// \param __count
    +///    A 64-bit integer vector interpreted as a single 64-bit integer.
    +/// \returns A 64-bit integer vector containing the left-shifted value. If
    +///     \a __count is greater than or equal to 64, the result is set to 0.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_sll_si64(__m64 __m, __m64 __count)
    +{
    +    return (__m64)__builtin_ia32_psllq((__v1di)__m, (__v1di)__count);
    +}
    +
    +/// Left-shifts the first parameter, which is a 64-bit integer, by the
    +///    number of bits specified by the second parameter, which is a 32-bit
    +///    integer. The lower 64 bits of the result are returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
    +///
    +/// \param __m
    +///    A 64-bit integer vector interpreted as a single 64-bit integer.
    +/// \param __count
    +///    A 32-bit integer value.
    +/// \returns A 64-bit integer vector containing the left-shifted value. If
    +///     \a __count is greater than or equal to 64, the result is set to 0.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_slli_si64(__m64 __m, int __count)
    +{
    +    return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count);
    +}
    +
    +/// Right-shifts each 16-bit integer element of the first parameter,
    +///    which is a 64-bit integer vector of [4 x i16], by the number of bits
    +///    specified by the second parameter, which is a 64-bit integer.
    +///
    +///    High-order bits are filled with the sign bit of the initial value 
of each
    +///    16-bit element. The 16-bit results are packed into a 64-bit integer
    +///    vector of [4 x i16].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
    +///
    +/// \param __m
    +///    A 64-bit integer vector of [4 x i16].
    +/// \param __count
    +///    A 64-bit integer vector interpreted as a single 64-bit integer.
    +/// \returns A 64-bit integer vector of [4 x i16] containing the 
right-shifted
    +///    values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_sra_pi16(__m64 __m, __m64 __count)
    +{
    +    return (__m64)__builtin_ia32_psraw((__v4hi)__m, (__v4hi)__count);
    +}
    +
    +/// Right-shifts each 16-bit integer element of a 64-bit integer vector
    +///    of [4 x i16] by the number of bits specified by a 32-bit integer.
    +///
    +///    High-order bits are filled with the sign bit of the initial value 
of each
    +///    16-bit element. The 16-bit results are packed into a 64-bit integer
    +///    vector of [4 x i16].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
    +///
    +/// \param __m
    +///    A 64-bit integer vector of [4 x i16].
    +/// \param __count
    +///    A 32-bit integer value.
    +/// \returns A 64-bit integer vector of [4 x i16] containing the 
right-shifted
    +///    values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_srai_pi16(__m64 __m, int __count)
    +{
    +    return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
    +}
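
    The arithmetic shifts above fill with the sign bit, unlike the logical
    shifts declared further down in this file; a sketch of the difference:

        __m64 v = _mm_set1_pi16(-32);   /* 0xFFE0 in each lane     */
        __m64 a = _mm_srai_pi16(v, 2);  /* arithmetic: -8 (0xFFF8) */
        __m64 l = _mm_srli_pi16(v, 2);  /* logical: 0x3FF8 = 16376 */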
    +
    +/// Right-shifts each 32-bit integer element of the first parameter,
    +///    which is a 64-bit integer vector of [2 x i32], by the number of bits
    +///    specified by the second parameter, which is a 64-bit integer.
    +///
    +///    High-order bits are filled with the sign bit of the initial value 
of each
    +///    32-bit element. The 32-bit results are packed into a 64-bit integer
    +///    vector of [2 x i32].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
    +///
    +/// \param __m
    +///    A 64-bit integer vector of [2 x i32].
    +/// \param __count
    +///    A 64-bit integer vector interpreted as a single 64-bit integer.
    +/// \returns A 64-bit integer vector of [2 x i32] containing the 
right-shifted
    +///    values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_sra_pi32(__m64 __m, __m64 __count)
    +{
    +    return (__m64)__builtin_ia32_psrad((__v2si)__m, (__v2si)__count);
    +}
    +
    +/// Right-shifts each 32-bit integer element of a 64-bit integer vector
    +///    of [2 x i32] by the number of bits specified by a 32-bit integer.
    +///
    +///    High-order bits are filled with the sign bit of the initial value 
of each
    +///    32-bit element. The 32-bit results are packed into a 64-bit integer
    +///    vector of [2 x i32].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
    +///
    +/// \param __m
    +///    A 64-bit integer vector of [2 x i32].
    +/// \param __count
    +///    A 32-bit integer value.
    +/// \returns A 64-bit integer vector of [2 x i32] containing the 
right-shifted
    +///    values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_srai_pi32(__m64 __m, int __count)
    +{
    +    return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
    +}
    +
    +/// Right-shifts each 16-bit integer element of the first parameter,
    +///    which is a 64-bit integer vector of [4 x i16], by the number of bits
    +///    specified by the second parameter, which is a 64-bit integer.
    +///
    +///    High-order bits are cleared. The 16-bit results are packed into a 
64-bit
    +///    integer vector of [4 x i16].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
    +///
    +/// \param __m
    +///    A 64-bit integer vector of [4 x i16].
    +/// \param __count
    +///    A 64-bit integer vector interpreted as a single 64-bit integer.
    +/// \returns A 64-bit integer vector of [4 x i16] containing the 
right-shifted
    +///    values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_srl_pi16(__m64 __m, __m64 __count)
    +{
    +    return (__m64)__builtin_ia32_psrlw((__v4hi)__m, (__v4hi)__count);
    +}
    +
    +/// Right-shifts each 16-bit integer element of a 64-bit integer vector
    +///    of [4 x i16] by the number of bits specified by a 32-bit integer.
    +///
    +///    High-order bits are cleared. The 16-bit results are packed into a 
64-bit
    +///    integer vector of [4 x i16].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
    +///
    +/// \param __m
    +///    A 64-bit integer vector of [4 x i16].
    +/// \param __count
    +///    A 32-bit integer value.
    +/// \returns A 64-bit integer vector of [4 x i16] containing the 
right-shifted
    +///    values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_srli_pi16(__m64 __m, int __count)
    +{
    +    return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
    +}
    +
    +/// Right-shifts each 32-bit integer element of the first parameter,
    +///    which is a 64-bit integer vector of [2 x i32], by the number of bits
    +///    specified by the second parameter, which is a 64-bit integer.
    +///
    +///    High-order bits are cleared. The 32-bit results are packed into a 
64-bit
    +///    integer vector of [2 x i32].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
    +///
    +/// \param __m
    +///    A 64-bit integer vector of [2 x i32].
    +/// \param __count
    +///    A 64-bit integer vector interpreted as a single 64-bit integer.
    +/// \returns A 64-bit integer vector of [2 x i32] containing the 
right-shifted
    +///    values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_srl_pi32(__m64 __m, __m64 __count)
    +{
    +    return (__m64)__builtin_ia32_psrld((__v2si)__m, (__v2si)__count);
    +}
    +
    +/// Right-shifts each 32-bit integer element of a 64-bit integer vector
    +///    of [2 x i32] by the number of bits specified by a 32-bit integer.
    +///
    +///    High-order bits are cleared. The 32-bit results are packed into a 
64-bit
    +///    integer vector of [2 x i32].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
    +///
    +/// \param __m
    +///    A 64-bit integer vector of [2 x i32].
    +/// \param __count
    +///    A 32-bit integer value.
    +/// \returns A 64-bit integer vector of [2 x i32] containing the 
right-shifted
    +///    values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_srli_pi32(__m64 __m, int __count)
    +{
    +    return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
    +}
    +
    +/// Right-shifts the first 64-bit integer parameter by the number of bits
    +///    specified by the second 64-bit integer parameter.
    +///
    +///    High-order bits are cleared.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
    +///
    +/// \param __m
    +///    A 64-bit integer vector interpreted as a single 64-bit integer.
    +/// \param __count
    +///    A 64-bit integer vector interpreted as a single 64-bit integer.
    +/// \returns A 64-bit integer vector containing the right-shifted value.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_srl_si64(__m64 __m, __m64 __count)
    +{
    +    return (__m64)__builtin_ia32_psrlq((__v1di)__m, (__v1di)__count);
    +}
    +
    +/// Right-shifts the first parameter, which is a 64-bit integer, by the
    +///    number of bits specified by the second parameter, which is a 32-bit
    +///    integer.
    +///
    +///    High-order bits are cleared.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
    +///
    +/// \param __m
    +///    A 64-bit integer vector interpreted as a single 64-bit integer.
    +/// \param __count
    +///    A 32-bit integer value.
    +/// \returns A 64-bit integer vector containing the right-shifted value.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_srli_si64(__m64 __m, int __count)
    +{
    +    return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
    +}
    +
    +/// Performs a bitwise AND of two 64-bit integer vectors.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PAND </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector.
    +/// \param __m2
    +///    A 64-bit integer vector.
    +/// \returns A 64-bit integer vector containing the bitwise AND of both
    +///    parameters.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_and_si64(__m64 __m1, __m64 __m2)
    +{
    +    return __builtin_ia32_pand(__m1, __m2);
    +}
    +
    +/// Performs a bitwise NOT of the first 64-bit integer vector, and then
    +///    performs a bitwise AND of the intermediate result and the second 
64-bit
    +///    integer vector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PANDN </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector. The one's complement of this parameter is 
used
    +///    in the bitwise AND.
    +/// \param __m2
    +///    A 64-bit integer vector.
    +/// \returns A 64-bit integer vector containing the bitwise AND of the 
second
    +///    parameter and the one's complement of the first parameter.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_andnot_si64(__m64 __m1, __m64 __m2)
    +{
    +#ifdef __GNUC__
    +    return __builtin_ia32_pandn (__m1, __m2);
    +#else
    +    return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2);
    +#endif
    +}
    +
    +/// Performs a bitwise OR of two 64-bit integer vectors.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> POR </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector.
    +/// \param __m2
    +///    A 64-bit integer vector.
    +/// \returns A 64-bit integer vector containing the bitwise OR of both
    +///    parameters.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_or_si64(__m64 __m1, __m64 __m2)
    +{
    +#ifdef __GNUC__
    +    return __builtin_ia32_por(__m1, __m2);
    +#else
    +    return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2);
    +#endif
    +}
    +
    +/// Performs a bitwise exclusive OR of two 64-bit integer vectors.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PXOR </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector.
    +/// \param __m2
    +///    A 64-bit integer vector.
    +/// \returns A 64-bit integer vector containing the bitwise exclusive OR 
of both
    +///    parameters.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_xor_si64(__m64 __m1, __m64 __m2)
    +{
    +    return __builtin_ia32_pxor (__m1, __m2);
    +}
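
    A short sketch of the four bitwise intrinsics used for masking:

        __m64 v    = _mm_set1_pi8(0x5A);
        __m64 mask = _mm_set1_pi8(0x0F);
        __m64 low  = _mm_and_si64(v, mask);     /* keep low nibbles:  0x0A  */
        __m64 high = _mm_andnot_si64(mask, v);  /* keep high nibbles: 0x50  */
        __m64 flip = _mm_xor_si64(v, mask);     /* toggle low nibbles: 0x55 */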
    +
    +/// Compares the 8-bit integer elements of two 64-bit integer vectors of
    +///    [8 x i8] to determine if the element of the first vector is equal 
to the
    +///    corresponding element of the second vector.
    +///
    +///    The comparison yields 0 for false, 0xFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PCMPEQB </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [8 x i8].
    +/// \param __m2
    +///    A 64-bit integer vector of [8 x i8].
    +/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
    +///    results.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
    +}
    +
    +/// Compares the 16-bit integer elements of two 64-bit integer vectors of
    +///    [4 x i16] to determine if the element of the first vector is equal 
to the
    +///    corresponding element of the second vector.
    +///
    +///    The comparison yields 0 for false, 0xFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PCMPEQW </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [4 x i16].
    +/// \param __m2
    +///    A 64-bit integer vector of [4 x i16].
    +/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
    +///    results.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
    +}
    +
    +/// Compares the 32-bit integer elements of two 64-bit integer vectors of
    +///    [2 x i32] to determine if the element of the first vector is equal 
to the
    +///    corresponding element of the second vector.
    +///
    +///    The comparison yields 0 for false, 0xFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PCMPEQD </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [2 x i32].
    +/// \param __m2
    +///    A 64-bit integer vector of [2 x i32].
    +/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
    +///    results.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
    +}
    +
    +/// Compares the 8-bit integer elements of two 64-bit integer vectors of
    +///    [8 x i8] to determine if the element of the first vector is greater 
than
    +///    the corresponding element of the second vector.
    +///
    +///    The comparison yields 0 for false, 0xFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PCMPGTB </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [8 x i8].
    +/// \param __m2
    +///    A 64-bit integer vector of [8 x i8].
    +/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
    +///    results.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
    +}
    +
    +/// Compares the 16-bit integer elements of two 64-bit integer vectors of
    +///    [4 x i16] to determine if the element of the first vector is 
greater than
    +///    the corresponding element of the second vector.
    +///
    +///    The comparison yields 0 for false, 0xFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PCMPGTW </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [4 x i16].
    +/// \param __m2
    +///    A 64-bit integer vector of [4 x i16].
    +/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
    +///    results.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
    +}
    +
    +/// Compares the 32-bit integer elements of two 64-bit integer vectors of
    +///    [2 x i32] to determine if the element of the first vector is 
greater than
    +///    the corresponding element of the second vector.
    +///
    +///    The comparison yields 0 for false, 0xFFFFFFFF for true.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PCMPGTD </c> instruction.
    +///
    +/// \param __m1
    +///    A 64-bit integer vector of [2 x i32].
    +/// \param __m2
    +///    A 64-bit integer vector of [2 x i32].
    +/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
    +///    results.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
    +{
    +    return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
    +}
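
    Because the comparisons produce all-ones/all-zeros masks, they combine
    with the bitwise intrinsics above into branch-free selects; for example, a
    per-lane signed maximum:

        __m64 a   = _mm_set_pi16(1, 5, -3, 9);
        __m64 b   = _mm_set_pi16(2, 4, -7, 8);
        __m64 m   = _mm_cmpgt_pi16(a, b);               /* 0xFFFF where a > b */
        __m64 max = _mm_or_si64(_mm_and_si64(m, a),     /* keep a where a > b */
                                _mm_andnot_si64(m, b)); /* keep b elsewhere   */
        /* max == _mm_set_pi16(2, 5, -3, 9) */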
    +
    +/// Constructs a 64-bit integer vector initialized to zero.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PXOR </c> instruction.
    +///
    +/// \returns An initialized 64-bit integer vector with all elements set to 
zero.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_setzero_si64(void)
    +{
    +    return __extension__ (__m64){ 0LL };
    +}
    +
    +/// Constructs a 64-bit integer vector initialized with the specified
    +///    32-bit integer values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a 
specific
    +///    instruction.
    +///
    +/// \param __i1
    +///    A 32-bit integer value used to initialize the upper 32 bits of the
    +///    result.
    +/// \param __i0
    +///    A 32-bit integer value used to initialize the lower 32 bits of the
    +///    result.
    +/// \returns An initialized 64-bit integer vector.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_set_pi32(int __i1, int __i0)
    +{
    +    return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
    +}
    +
    +/// Constructs a 64-bit integer vector initialized with the specified
    +///    16-bit integer values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a 
specific
    +///    instruction.
    +///
    +/// \param __s3
    +///    A 16-bit integer value used to initialize bits [63:48] of the 
result.
    +/// \param __s2
    +///    A 16-bit integer value used to initialize bits [47:32] of the 
result.
    +/// \param __s1
    +///    A 16-bit integer value used to initialize bits [31:16] of the 
result.
    +/// \param __s0
    +///    A 16-bit integer value used to initialize bits [15:0] of the result.
    +/// \returns An initialized 64-bit integer vector.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
    +{
    +    return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
    +}
    +
    +/// Constructs a 64-bit integer vector initialized with the specified
    +///    8-bit integer values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a 
specific
    +///    instruction.
    +///
    +/// \param __b7
    +///    An 8-bit integer value used to initialize bits [63:56] of the 
result.
    +/// \param __b6
    +///    An 8-bit integer value used to initialize bits [55:48] of the 
result.
    +/// \param __b5
    +///    An 8-bit integer value used to initialize bits [47:40] of the 
result.
    +/// \param __b4
    +///    An 8-bit integer value used to initialize bits [39:32] of the 
result.
    +/// \param __b3
    +///    An 8-bit integer value used to initialize bits [31:24] of the 
result.
    +/// \param __b2
    +///    An 8-bit integer value used to initialize bits [23:16] of the 
result.
    +/// \param __b1
    +///    An 8-bit integer value used to initialize bits [15:8] of the result.
    +/// \param __b0
    +///    An 8-bit integer value used to initialize bits [7:0] of the result.
    +/// \returns An initialized 64-bit integer vector.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
    +            char __b1, char __b0)
    +{
    +    return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3,
    +                                               __b4, __b5, __b6, __b7);
    +}
    +
    +/// Constructs a 64-bit integer vector of [2 x i32], with each of the
    +///    32-bit integer vector elements set to the specified 32-bit integer
    +///    value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a 
specific
    +///    instruction.
    +///
    +/// \param __i
    +///    A 32-bit integer value used to initialize each vector element of the
    +///    result.
    +/// \returns An initialized 64-bit integer vector of [2 x i32].
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_set1_pi32(int __i)
    +{
    +    return _mm_set_pi32(__i, __i);
    +}
    +
    +/// Constructs a 64-bit integer vector of [4 x i16], with each of the
    +///    16-bit integer vector elements set to the specified 16-bit integer
    +///    value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a 
specific
    +///    instruction.
    +///
    +/// \param __w
    +///    A 16-bit integer value used to initialize each vector element of the
    +///    result.
    +/// \returns An initialized 64-bit integer vector of [4 x i16].
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_set1_pi16(short __w)
    +{
    +    return _mm_set_pi16(__w, __w, __w, __w);
    +}
    +
    +/// Constructs a 64-bit integer vector of [8 x i8], with each of the
    +///    8-bit integer vector elements set to the specified 8-bit integer value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a specific
    +///    instruction.
    +///
    +/// \param __b
    +///    An 8-bit integer value used to initialize each vector element of the
    +///    result.
    +/// \returns An initialized 64-bit integer vector of [8 x i8].
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_set1_pi8(char __b)
    +{
    +    return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
    +}
    +
    +/// Constructs a 64-bit integer vector, initialized in reverse order with
    +///    the specified 32-bit integer values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a specific
    +///    instruction.
    +///
    +/// \param __i0
    +///    A 32-bit integer value used to initialize the lower 32 bits of the
    +///    result.
    +/// \param __i1
    +///    A 32-bit integer value used to initialize the upper 32 bits of the
    +///    result.
    +/// \returns An initialized 64-bit integer vector.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_setr_pi32(int __i0, int __i1)
    +{
    +    return _mm_set_pi32(__i1, __i0);
    +}
    +
    +/// Constructs a 64-bit integer vector, initialized in reverse order with
    +///    the specified 16-bit integer values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a specific
    +///    instruction.
    +///
    +/// \param __w0
    +///    A 16-bit integer value used to initialize bits [15:0] of the result.
    +/// \param __w1
    +///    A 16-bit integer value used to initialize bits [31:16] of the result.
    +/// \param __w2
    +///    A 16-bit integer value used to initialize bits [47:32] of the result.
    +/// \param __w3
    +///    A 16-bit integer value used to initialize bits [63:48] of the result.
    +/// \returns An initialized 64-bit integer vector.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
    +{
    +    return _mm_set_pi16(__w3, __w2, __w1, __w0);
    +}
    +
    +/// Constructs a 64-bit integer vector, initialized in reverse order with
    +///    the specified 8-bit integer values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a specific
    +///    instruction.
    +///
    +/// \param __b0
    +///    An 8-bit integer value used to initialize bits [7:0] of the result.
    +/// \param __b1
    +///    An 8-bit integer value used to initialize bits [15:8] of the result.
    +/// \param __b2
    +///    An 8-bit integer value used to initialize bits [23:16] of the result.
    +/// \param __b3
    +///    An 8-bit integer value used to initialize bits [31:24] of the result.
    +/// \param __b4
    +///    An 8-bit integer value used to initialize bits [39:32] of the result.
    +/// \param __b5
    +///    An 8-bit integer value used to initialize bits [47:40] of the result.
    +/// \param __b6
    +///    An 8-bit integer value used to initialize bits [55:48] of the result.
    +/// \param __b7
    +///    An 8-bit integer value used to initialize bits [63:56] of the result.
    +/// \returns An initialized 64-bit integer vector.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS
    +_mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
    +             char __b6, char __b7)
    +{
    +    return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
    +}
    +
    +#undef __DEFAULT_FN_ATTRS
    +
    +/* Aliases for compatibility. */
    +#define _m_empty _mm_empty
    +#define _m_from_int _mm_cvtsi32_si64
    +#define _m_from_int64 _mm_cvtsi64_m64
    +#define _m_to_int _mm_cvtsi64_si32
    +#define _m_to_int64 _mm_cvtm64_si64
    +#define _m_packsswb _mm_packs_pi16
    +#define _m_packssdw _mm_packs_pi32
    +#define _m_packuswb _mm_packs_pu16
    +#define _m_punpckhbw _mm_unpackhi_pi8
    +#define _m_punpckhwd _mm_unpackhi_pi16
    +#define _m_punpckhdq _mm_unpackhi_pi32
    +#define _m_punpcklbw _mm_unpacklo_pi8
    +#define _m_punpcklwd _mm_unpacklo_pi16
    +#define _m_punpckldq _mm_unpacklo_pi32
    +#define _m_paddb _mm_add_pi8
    +#define _m_paddw _mm_add_pi16
    +#define _m_paddd _mm_add_pi32
    +#define _m_paddsb _mm_adds_pi8
    +#define _m_paddsw _mm_adds_pi16
    +#define _m_paddusb _mm_adds_pu8
    +#define _m_paddusw _mm_adds_pu16
    +#define _m_psubb _mm_sub_pi8
    +#define _m_psubw _mm_sub_pi16
    +#define _m_psubd _mm_sub_pi32
    +#define _m_psubsb _mm_subs_pi8
    +#define _m_psubsw _mm_subs_pi16
    +#define _m_psubusb _mm_subs_pu8
    +#define _m_psubusw _mm_subs_pu16
    +#define _m_pmaddwd _mm_madd_pi16
    +#define _m_pmulhw _mm_mulhi_pi16
    +#define _m_pmullw _mm_mullo_pi16
    +#define _m_psllw _mm_sll_pi16
    +#define _m_psllwi _mm_slli_pi16
    +#define _m_pslld _mm_sll_pi32
    +#define _m_pslldi _mm_slli_pi32
    +#define _m_psllq _mm_sll_si64
    +#define _m_psllqi _mm_slli_si64
    +#define _m_psraw _mm_sra_pi16
    +#define _m_psrawi _mm_srai_pi16
    +#define _m_psrad _mm_sra_pi32
    +#define _m_psradi _mm_srai_pi32
    +#define _m_psrlw _mm_srl_pi16
    +#define _m_psrlwi _mm_srli_pi16
    +#define _m_psrld _mm_srl_pi32
    +#define _m_psrldi _mm_srli_pi32
    +#define _m_psrlq _mm_srl_si64
    +#define _m_psrlqi _mm_srli_si64
    +#define _m_pand _mm_and_si64
    +#define _m_pandn _mm_andnot_si64
    +#define _m_por _mm_or_si64
    +#define _m_pxor _mm_xor_si64
    +#define _m_pcmpeqb _mm_cmpeq_pi8
    +#define _m_pcmpeqw _mm_cmpeq_pi16
    +#define _m_pcmpeqd _mm_cmpeq_pi32
    +#define _m_pcmpgtb _mm_cmpgt_pi8
    +#define _m_pcmpgtw _mm_cmpgt_pi16
    +#define _m_pcmpgtd _mm_cmpgt_pi32
    +
    +#endif /* __MMINTRIN_H */
    +
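    A minimal usage sketch for the MMX set/setr helpers above (illustrative only,
    not part of the patch; it assumes an x86 build with MMX available):
    
        #include <mmintrin.h>
        #include <stdio.h>
        #include <string.h>
    
        int main(void)
        {
            /* _mm_set_pi16 takes elements from the highest to the lowest lane,
             * _mm_setr_pi16 takes them in memory order, so 'a' and 'b' are equal. */
            __m64 a = _mm_set_pi16(4, 3, 2, 1);
            __m64 b = _mm_setr_pi16(1, 2, 3, 4);
            __m64 c = _mm_add_pi16(a, b);      /* {2, 4, 6, 8} */
            short out[4];
            memcpy(out, &c, sizeof(out));
            _mm_empty();                       /* leave MMX state before x87 use */
            printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
            return 0;
        }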
    diff --git a/include/nmmintrin.h b/include/nmmintrin.h
    new file mode 100644
    index 0000000..348fb8c
    --- /dev/null
    +++ b/include/nmmintrin.h
    @@ -0,0 +1,30 @@
    +/*===---- nmmintrin.h - SSE4 intrinsics ------------------------------------===
    + *
    + * Permission is hereby granted, free of charge, to any person obtaining a copy
    + * of this software and associated documentation files (the "Software"), to deal
    + * in the Software without restriction, including without limitation the rights
    + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    + * copies of the Software, and to permit persons to whom the Software is
    + * furnished to do so, subject to the following conditions:
    + *
    + * The above copyright notice and this permission notice shall be included in
    + * all copies or substantial portions of the Software.
    + *
    + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    + * THE SOFTWARE.
    + *
    + *===-----------------------------------------------------------------------===
    + */
    +
    +#ifndef __NMMINTRIN_H
    +#define __NMMINTRIN_H
    +
    +/* To match expectations of gcc we put the sse4.2 definitions into smmintrin.h,
    +   just include it now then.  */
    +#include <smmintrin.h>
    +#endif /* __NMMINTRIN_H */
    diff --git a/include/pmmintrin.h b/include/pmmintrin.h
    new file mode 100644
    index 0000000..24b7d68
    --- /dev/null
    +++ b/include/pmmintrin.h
    @@ -0,0 +1,321 @@
    +/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
    + *
    + * Permission is hereby granted, free of charge, to any person obtaining a copy
    + * of this software and associated documentation files (the "Software"), to deal
    + * in the Software without restriction, including without limitation the rights
    + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    + * copies of the Software, and to permit persons to whom the Software is
    + * furnished to do so, subject to the following conditions:
    + *
    + * The above copyright notice and this permission notice shall be included in
    + * all copies or substantial portions of the Software.
    + *
    + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    + * THE SOFTWARE.
    + *
    + *===-----------------------------------------------------------------------===
    + */
    +
    +#ifndef __PMMINTRIN_H
    +#define __PMMINTRIN_H
    +
    +#include <emmintrin.h>
    +
    +/* Define the default attributes for the functions in this file. */
    +#ifdef __GNUC__
    +#define __DEFAULT_FN_ATTRS \
    +  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    +#else
    +#define __DEFAULT_FN_ATTRS \
    +  __attribute__((__always_inline__, __nodebug__, __target__("sse3"), __min_vector_width__(128)))
    +#endif
    +
    +/// Loads data from an unaligned memory location to elements in a 128-bit
    +///    vector.
    +///
    +///    If the address of the data is not 16-byte aligned, the instruction may
    +///    read two adjacent aligned blocks of memory to retrieve the requested
    +///    data.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a 128-bit integer vector containing integer values.
    +/// \returns A 128-bit vector containing the moved values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_lddqu_si128(__m128i const *__p)
    +{
    +  return (__m128i)__builtin_ia32_lddqu((char const *)__p);
    +}
    +
    +/// Adds the even-indexed values and subtracts the odd-indexed values of
    +///    two 128-bit vectors of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing the left source operand.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing the right source operand.
    +/// \returns A 128-bit vector of [4 x float] containing the alternating sums and
    +///    differences of both operands.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_addsub_ps(__m128 __a, __m128 __b)
    +{
    +  return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Horizontally adds the adjacent pairs of values contained in two
    +///    128-bit vectors of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing one of the source operands.
    +///    The horizontal sums of the values are stored in the lower bits of the
    +///    destination.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing one of the source operands.
    +///    The horizontal sums of the values are stored in the upper bits of the
    +///    destination.
    +/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
    +///    both operands.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_hadd_ps(__m128 __a, __m128 __b)
    +{
    +  return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Horizontally subtracts the adjacent pairs of values contained in two
    +///    128-bit vectors of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing one of the source operands.
    +///    The horizontal differences between the values are stored in the lower
    +///    bits of the destination.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing one of the source operands.
    +///    The horizontal differences between the values are stored in the upper
    +///    bits of the destination.
    +/// \returns A 128-bit vector of [4 x float] containing the horizontal
    +///    differences of both operands.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_hsub_ps(__m128 __a, __m128 __b)
    +{
    +  return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Moves and duplicates odd-indexed values from a 128-bit vector
    +///    of [4 x float] to float values stored in a 128-bit vector of
    +///    [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. \n
    +///    Bits [127:96] of the source are written to bits [127:96] and [95:64] of
    +///    the destination. \n
    +///    Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
    +///    destination.
    +/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
    +///    values.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_movehdup_ps(__m128 __a)
    +{
    +#ifdef __GNUC__
    +  return (__m128) __builtin_ia32_movshdup ((__v4sf)__a);
    +#else
    +  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
    +#endif
    +}
    +
    +/// Duplicates even-indexed values from a 128-bit vector of
    +///    [4 x float] to float values stored in a 128-bit vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] \n
    +///    Bits [95:64] of the source are written to bits [127:96] and [95:64] of
    +///    the destination. \n
    +///    Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
    +///    destination.
    +/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
    +///    values.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_moveldup_ps(__m128 __a)
    +{
    +#ifdef __GNUC__
    +  return (__m128) __builtin_ia32_movsldup ((__v4sf)__a);
    +#else
    +  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
    +#endif
    +}
    +
    +/// Adds the even-indexed values and subtracts the odd-indexed values of
    +///    two 128-bit vectors of [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing the left source operand.
    +/// \param __b
    +///    A 128-bit vector of [2 x double] containing the right source operand.
    +/// \returns A 128-bit vector of [2 x double] containing the alternating sums
    +///    and differences of both operands.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_addsub_pd(__m128d __a, __m128d __b)
    +{
    +  return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Horizontally adds the pairs of values contained in two 128-bit
    +///    vectors of [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing one of the source operands.
    +///    The horizontal sum of the values is stored in the lower bits of the
    +///    destination.
    +/// \param __b
    +///    A 128-bit vector of [2 x double] containing one of the source operands.
    +///    The horizontal sum of the values is stored in the upper bits of the
    +///    destination.
    +/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
    +///    both operands.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_hadd_pd(__m128d __a, __m128d __b)
    +{
    +  return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Horizontally subtracts the pairs of values contained in two 128-bit
    +///    vectors of [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double] containing one of the source operands.
    +///    The horizontal difference of the values is stored in the lower bits of
    +///    the destination.
    +/// \param __b
    +///    A 128-bit vector of [2 x double] containing one of the source operands.
    +///    The horizontal difference of the values is stored in the upper bits of
    +///    the destination.
    +/// \returns A 128-bit vector of [2 x double] containing the horizontal
    +///    differences of both operands.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_hsub_pd(__m128d __a, __m128d __b)
    +{
    +  return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
    +}
    +
    +/// Moves and duplicates one double-precision value to double-precision
    +///    values stored in a 128-bit vector of [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128d _mm_loaddup_pd(double const *dp);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
    +///
    +/// \param dp
    +///    A pointer to a double-precision value to be moved and duplicated.
    +/// \returns A 128-bit vector of [2 x double] containing the moved and
    +///    duplicated values.
    +#define        _mm_loaddup_pd(dp)        _mm_load1_pd(dp)
    +
    +/// Moves and duplicates the double-precision value in the lower bits of
    +///    a 128-bit vector of [2 x double] to double-precision values stored in a
    +///    128-bit vector of [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
    +///    [127:64] and [63:0] of the destination.
    +/// \returns A 128-bit vector of [2 x double] containing the moved and
    +///    duplicated values.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_movedup_pd(__m128d __a)
    +{
    +#ifdef __GNUC__
    +  return _mm_shuffle_pd (__a, __a, _MM_SHUFFLE2 (0,0));
    +#else
    +  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
    +#endif
    +}
    +
    +/// Establishes a linear address memory range to be monitored and puts
    +///    the processor in the monitor event pending state. Data stored in the
    +///    monitored address range causes the processor to exit the pending state.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> MONITOR </c> instruction.
    +///
    +/// \param __p
    +///    The memory range to be monitored. The size of the range is determined by
    +///    CPUID function 0000_0005h.
    +/// \param __extensions
    +///    Optional extensions for the monitoring state.
    +/// \param __hints
    +///    Optional hints for the monitoring state.
    +static __inline__ void __DEFAULT_FN_ATTRS
    +_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
    +{
    +  __builtin_ia32_monitor((void *)__p, __extensions, __hints);
    +}
    +
    +/// Used with the MONITOR instruction to wait while the processor is in
    +///    the monitor event pending state. Data stored in the monitored address
    +///    range causes the processor to exit the pending state.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> MWAIT </c> instruction.
    +///
    +/// \param __extensions
    +///    Optional extensions for the monitoring state, which may vary by
    +///    processor.
    +/// \param __hints
    +///    Optional hints for the monitoring state, which may vary by processor.
    +static __inline__ void __DEFAULT_FN_ATTRS
    +_mm_mwait(unsigned __extensions, unsigned __hints)
    +{
    +  __builtin_ia32_mwait(__extensions, __hints);
    +}
    +
    +#undef __DEFAULT_FN_ATTRS
    +
    +#endif /* __PMMINTRIN_H */
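    A short usage sketch for the SSE3 horizontal-add intrinsic above (illustrative
    only, not part of the patch; assumes an SSE3-capable build, e.g. gcc -msse3):
    
        #include <pmmintrin.h>
        #include <stdio.h>
    
        int main(void)
        {
            __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  /* lanes: 1 2 3 4 */
            __m128 b = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f);  /* lanes: 5 6 7 8 */
            /* _mm_hadd_ps sums adjacent pairs: {1+2, 3+4, 5+6, 7+8} */
            float out[4];
            _mm_storeu_ps(out, _mm_hadd_ps(a, b));
            printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
            return 0;
        }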
    diff --git a/include/popcntintrin.h b/include/popcntintrin.h
    new file mode 100644
    index 0000000..bba0573
    --- /dev/null
    +++ b/include/popcntintrin.h
    @@ -0,0 +1,102 @@
    +/*===---- popcntintrin.h - POPCNT intrinsics -------------------------------===
    + *
    + * Permission is hereby granted, free of charge, to any person obtaining a copy
    + * of this software and associated documentation files (the "Software"), to deal
    + * in the Software without restriction, including without limitation the rights
    + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    + * copies of the Software, and to permit persons to whom the Software is
    + * furnished to do so, subject to the following conditions:
    + *
    + * The above copyright notice and this permission notice shall be included in
    + * all copies or substantial portions of the Software.
    + *
    + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    + * THE SOFTWARE.
    + *
    + *===-----------------------------------------------------------------------===
    + */
    +
    +#ifndef __POPCNTINTRIN_H
    +#define __POPCNTINTRIN_H
    +
    +/* Define the default attributes for the functions in this file. */
    +#ifdef __GNUC__
    +#define __DEFAULT_FN_ATTRS __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    +#else
    +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))
    +#endif
    +
    +/// Counts the number of bits in the source operand having a value of 1.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
    +///
    +/// \param __A
    +///    An unsigned 32-bit integer operand.
    +/// \returns A 32-bit integer containing the number of bits with value 1 in the
    +///    source operand.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_popcnt_u32(unsigned int __A)
    +{
    +  return __builtin_popcount(__A);
    +}
    +
    +/// Counts the number of bits in the source operand having a value of 1.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
    +///
    +/// \param __A
    +///    A signed 32-bit integer operand.
    +/// \returns A 32-bit integer containing the number of bits with value 1 in the
    +///    source operand.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_popcnt32(int __A)
    +{
    +  return __builtin_popcount(__A);
    +}
    +
    +#ifdef __x86_64__
    +/// Counts the number of bits in the source operand having a value of 1.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
    +///
    +/// \param __A
    +///    An unsigned 64-bit integer operand.
    +/// \returns A 64-bit integer containing the number of bits with value 1 in the
    +///    source operand.
    +static __inline__ long long __DEFAULT_FN_ATTRS
    +_mm_popcnt_u64(unsigned long long __A)
    +{
    +  return __builtin_popcountll(__A);
    +}
    +
    +/// Counts the number of bits in the source operand having a value of 1.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
    +///
    +/// \param __A
    +///    A signed 64-bit integer operand.
    +/// \returns A 64-bit integer containing the number of bits with value 1 in the
    +///    source operand.
    +static __inline__ long long __DEFAULT_FN_ATTRS
    +_popcnt64(long long __A)
    +{
    +  return __builtin_popcountll(__A);
    +}
    +#endif /* __x86_64__ */
    +
    +#undef __DEFAULT_FN_ATTRS
    +
    +#endif /* __POPCNTINTRIN_H */
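    A short usage sketch for the POPCNT helpers above (illustrative only, not part
    of the patch; assumes a toolchain with POPCNT enabled, e.g. gcc -mpopcnt):
    
        #include <popcntintrin.h>
        #include <stdio.h>
    
        int main(void)
        {
            unsigned int x = 0xF0F0u;              /* eight bits set */
            printf("%d\n", _mm_popcnt_u32(x));     /* prints 8 */
            return 0;
        }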
    diff --git a/include/smmintrin.h b/include/smmintrin.h
    new file mode 100644
    index 0000000..4f1d637
    --- /dev/null
    +++ b/include/smmintrin.h
    @@ -0,0 +1,2504 @@
    +/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
    + *
    + * Permission is hereby granted, free of charge, to any person obtaining a copy
    + * of this software and associated documentation files (the "Software"), to deal
    + * in the Software without restriction, including without limitation the rights
    + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    + * copies of the Software, and to permit persons to whom the Software is
    + * furnished to do so, subject to the following conditions:
    + *
    + * The above copyright notice and this permission notice shall be included in
    + * all copies or substantial portions of the Software.
    + *
    + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    + * THE SOFTWARE.
    + *
    + *===-----------------------------------------------------------------------===
    + */
    +
    +#ifndef __SMMINTRIN_H
    +#define __SMMINTRIN_H
    +
    +#include <tmmintrin.h>
    +
    +/* Define the default attributes for the functions in this file. */
    +#ifdef __GNUC__
    +#define __DEFAULT_FN_ATTRS __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    +#else
    +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), __min_vector_width__(128)))
    +#endif
    +
    +/* SSE4 Rounding macros. */
    +#define _MM_FROUND_TO_NEAREST_INT    0x00
    +#define _MM_FROUND_TO_NEG_INF        0x01
    +#define _MM_FROUND_TO_POS_INF        0x02
    +#define _MM_FROUND_TO_ZERO           0x03
    +#define _MM_FROUND_CUR_DIRECTION     0x04
    +
    +#define _MM_FROUND_RAISE_EXC         0x00
    +#define _MM_FROUND_NO_EXC            0x08
    +
    +#define _MM_FROUND_NINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
    +#define _MM_FROUND_FLOOR     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
    +#define _MM_FROUND_CEIL      (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
    +#define _MM_FROUND_TRUNC     (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
    +#define _MM_FROUND_RINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
    +#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
    +
    +/// Rounds up each element of the 128-bit vector of [4 x float] to an
    +///    integer and returns the rounded values in a 128-bit vector of
    +///    [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128 _mm_ceil_ps(__m128 X);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
    +///
    +/// \param X
    +///    A 128-bit vector of [4 x float] values to be rounded up.
    +/// \returns A 128-bit vector of [4 x float] containing the rounded values.
    +#define _mm_ceil_ps(X)       _mm_round_ps((X), _MM_FROUND_CEIL)
    +
    +/// Rounds up each element of the 128-bit vector of [2 x double] to an
    +///    integer and returns the rounded values in a 128-bit vector of
    +///    [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128d _mm_ceil_pd(__m128d X);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
    +///
    +/// \param X
    +///    A 128-bit vector of [2 x double] values to be rounded up.
    +/// \returns A 128-bit vector of [2 x double] containing the rounded values.
    +#define _mm_ceil_pd(X)       _mm_round_pd((X), _MM_FROUND_CEIL)
    +
    +/// Copies three upper elements of the first 128-bit vector operand to
    +///    the corresponding three upper elements of the 128-bit result vector of
    +///    [4 x float]. Rounds up the lowest element of the second 128-bit vector
    +///    operand to an integer and copies it to the lowest element of the 128-bit
    +///    result vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
    +///
    +/// \param X
    +///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
    +///    copied to the corresponding bits of the result.
    +/// \param Y
    +///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
    +///    rounded up to the nearest integer and copied to the corresponding bits
    +///    of the result.
    +/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
    +///    values.
    +#define _mm_ceil_ss(X, Y)    _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
    +
    +/// Copies the upper element of the first 128-bit vector operand to the
    +///    corresponding upper element of the 128-bit result vector of [2 x double].
    +///    Rounds up the lower element of the second 128-bit vector operand to an
    +///    integer and copies it to the lower element of the 128-bit result vector
    +///    of [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
    +///
    +/// \param X
    +///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
    +///    copied to the corresponding bits of the result.
    +/// \param Y
    +///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
    +///    rounded up to the nearest integer and copied to the corresponding bits
    +///    of the result.
    +/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
    +///    values.
    +#define _mm_ceil_sd(X, Y)    _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
    +
    +/// Rounds down each element of the 128-bit vector of [4 x float] to an
    +///    integer and returns the rounded values in a 128-bit vector of
    +///    [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128 _mm_floor_ps(__m128 X);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
    +///
    +/// \param X
    +///    A 128-bit vector of [4 x float] values to be rounded down.
    +/// \returns A 128-bit vector of [4 x float] containing the rounded values.
    +#define _mm_floor_ps(X)      _mm_round_ps((X), _MM_FROUND_FLOOR)
    +
    +/// Rounds down each element of the 128-bit vector of [2 x double] to an
    +///    integer and returns the rounded values in a 128-bit vector of
    +///    [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128d _mm_floor_pd(__m128d X);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
    +///
    +/// \param X
    +///    A 128-bit vector of [2 x double].
    +/// \returns A 128-bit vector of [2 x double] containing the rounded values.
    +#define _mm_floor_pd(X)      _mm_round_pd((X), _MM_FROUND_FLOOR)
    +
    +/// Copies three upper elements of the first 128-bit vector operand to
    +///    the corresponding three upper elements of the 128-bit result vector of
    +///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
    +///    operand to an integer and copies it to the lowest element of the 128-bit
    +///    result vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
    +///
    +/// \param X
    +///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
    +///    copied to the corresponding bits of the result.
    +/// \param Y
    +///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
    +///    rounded down to the nearest integer and copied to the corresponding bits
    +///    of the result.
    +/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
    +///    values.
    +#define _mm_floor_ss(X, Y)   _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
    +
    +/// Copies the upper element of the first 128-bit vector operand to the
    +///    corresponding upper element of the 128-bit result vector of [2 x double].
    +///    Rounds down the lower element of the second 128-bit vector operand to an
    +///    integer and copies it to the lower element of the 128-bit result vector
    +///    of [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
    +///
    +/// \param X
    +///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
    +///    copied to the corresponding bits of the result.
    +/// \param Y
    +///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
    +///    rounded down to the nearest integer and copied to the corresponding bits
    +///    of the result.
    +/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
    +///    values.
    +#define _mm_floor_sd(X, Y)   _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
    +
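    A short usage sketch for the ceil/floor helpers above (illustrative only, not
    part of the patch; assumes an SSE4.1-capable build, e.g. gcc -msse4.1). They
    simply expand to _mm_round_ps/_mm_round_pd with _MM_FROUND_CEIL or
    _MM_FROUND_FLOOR as the rounding control:
    
        #include <smmintrin.h>
        #include <stdio.h>
    
        int main(void)
        {
            __m128 v = _mm_set_ps(2.5f, -1.25f, 0.75f, 1.5f);
            float f[4], c[4];
            _mm_storeu_ps(f, _mm_floor_ps(v));   /* {1, 0, -2, 2} */
            _mm_storeu_ps(c, _mm_ceil_ps(v));    /* {2, 1, -1, 3} */
            printf("floor: %g %g %g %g\n", f[0], f[1], f[2], f[3]);
            printf("ceil:  %g %g %g %g\n", c[0], c[1], c[2], c[3]);
            return 0;
        }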
    +/// Rounds each element of the 128-bit vector of [4 x float] to an
    +///    integer value according to the rounding control specified by the second
    +///    argument and returns the rounded values in a 128-bit vector of
    +///    [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128 _mm_round_ps(__m128 X, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
    +///
    +/// \param X
    +///    A 128-bit vector of [4 x float].
    +/// \param M
    +///    An integer value that specifies the rounding operation. \n
    +///    Bits [7:4] are reserved. \n
    +///    Bit [3] is a precision exception value: \n
    +///      0: A normal PE exception is used \n
    +///      1: The PE field is not updated \n
    +///    Bit [2] is the rounding control source: \n
    +///      0: Use bits [1:0] of \a M \n
    +///      1: Use the current MXCSR setting \n
    +///    Bits [1:0] contain the rounding control definition: \n
    +///      00: Nearest \n
    +///      01: Downward (toward negative infinity) \n
    +///      10: Upward (toward positive infinity) \n
    +///      11: Truncated
    +/// \returns A 128-bit vector of [4 x float] containing the rounded values.
    +#define _mm_round_ps(X, M) \
    +  (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M))
    +
    +/// Copies three upper elements of the first 128-bit vector operand to
    +///    the corresponding three upper elements of the 128-bit result vector of
    +///    [4 x float]. Rounds the lowest element of the second 128-bit vector
    +///    operand to an integer value according to the rounding control specified
    +///    by the third argument and copies it to the lowest element of the 128-bit
    +///    result vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
    +///
    +/// \param X
    +///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
    +///    copied to the corresponding bits of the result.
    +/// \param Y
    +///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
    +///    rounded to the nearest integer using the specified rounding control and
    +///    copied to the corresponding bits of the result.
    +/// \param M
    +///    An integer value that specifies the rounding operation. \n
    +///    Bits [7:4] are reserved. \n
    +///    Bit [3] is a precision exception value: \n
    +///      0: A normal PE exception is used \n
    +///      1: The PE field is not updated \n
    +///    Bit [2] is the rounding control source: \n
    +///      0: Use bits [1:0] of \a M \n
    +///      1: Use the current MXCSR setting \n
    +///    Bits [1:0] contain the rounding control definition: \n
    +///      00: Nearest \n
    +///      01: Downward (toward negative infinity) \n
    +///      10: Upward (toward positive infinity) \n
    +///      11: Truncated
    +/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
    +///    values.
    +#define _mm_round_ss(X, Y, M) \
    +  (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
    +                                 (__v4sf)(__m128)(Y), (M))
    +
    +/// Rounds each element of the 128-bit vector of [2 x double] to an
    +///    integer value according to the rounding control specified by the second
    +///    argument and returns the rounded values in a 128-bit vector of
    +///    [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128d _mm_round_pd(__m128d X, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
    +///
    +/// \param X
    +///    A 128-bit vector of [2 x double].
    +/// \param M
    +///    An integer value that specifies the rounding operation. \n
    +///    Bits [7:4] are reserved. \n
    +///    Bit [3] is a precision exception value: \n
    +///      0: A normal PE exception is used \n
    +///      1: The PE field is not updated \n
    +///    Bit [2] is the rounding control source: \n
    +///      0: Use bits [1:0] of \a M \n
    +///      1: Use the current MXCSR setting \n
    +///    Bits [1:0] contain the rounding control definition: \n
    +///      00: Nearest \n
    +///      01: Downward (toward negative infinity) \n
    +///      10: Upward (toward positive infinity) \n
    +///      11: Truncated
    +/// \returns A 128-bit vector of [2 x double] containing the rounded values.
    +#define _mm_round_pd(X, M) \
    +  (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M))
    +
    +/// Copies the upper element of the first 128-bit vector operand to the
    +///    corresponding upper element of the 128-bit result vector of [2 x double].
    +///    Rounds the lower element of the second 128-bit vector operand to an
    +///    integer value according to the rounding control specified by the third
    +///    argument and copies it to the lower element of the 128-bit result vector
    +///    of [2 x double].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
    +///
    +/// \param X
    +///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
    +///    copied to the corresponding bits of the result.
    +/// \param Y
    +///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
    +///    rounded to the nearest integer using the specified rounding control and
    +///    copied to the corresponding bits of the result.
    +/// \param M
    +///    An integer value that specifies the rounding operation. \n
    +///    Bits [7:4] are reserved. \n
    +///    Bit [3] is a precision exception value: \n
    +///      0: A normal PE exception is used \n
    +///      1: The PE field is not updated \n
    +///    Bit [2] is the rounding control source: \n
    +///      0: Use bits [1:0] of \a M \n
    +///      1: Use the current MXCSR setting \n
    +///    Bits [1:0] contain the rounding control definition: \n
    +///      00: Nearest \n
    +///      01: Downward (toward negative infinity) \n
    +///      10: Upward (toward positive infinity) \n
    +///      11: Truncated
    +/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
    +///    values.
    +#define _mm_round_sd(X, Y, M) \
    +  (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
    +                                  (__v2df)(__m128d)(Y), (M))
    +
    +/* SSE4 Packed Blending Intrinsics.  */
    +/// Returns a 128-bit vector of [2 x double] where the values are
    +///    selected from either the first or second operand as specified by the
    +///    third operand, the control mask.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
    +///
    +/// \param V1
    +///    A 128-bit vector of [2 x double].
    +/// \param V2
    +///    A 128-bit vector of [2 x double].
    +/// \param M
    +///    An immediate integer operand, with mask bits [1:0] specifying how the
    +///    values are to be copied. The position of the mask bit corresponds to the
    +///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
    +///    element in operand \a V1 is copied to the same position in the result.
    +///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
    +///    is copied to the same position in the result.
    +/// \returns A 128-bit vector of [2 x double] containing the copied values.
    +#define _mm_blend_pd(V1, V2, M) \
    +  (__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
    +                                    (__v2df)(__m128d)(V2), (int)(M))
    +
    +/// Returns a 128-bit vector of [4 x float] where the values are selected
    +///    from either the first or second operand as specified by the third
    +///    operand, the control mask.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
    +///
    +/// \param V1
    +///    A 128-bit vector of [4 x float].
    +/// \param V2
    +///    A 128-bit vector of [4 x float].
    +/// \param M
    +///    An immediate integer operand, with mask bits [3:0] specifying how the
    +///    values are to be copied. The position of the mask bit corresponds to the
    +///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
    +///    element in operand \a V1 is copied to the same position in the result.
    +///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
    +///    is copied to the same position in the result.
    +/// \returns A 128-bit vector of [4 x float] containing the copied values.
    +#define _mm_blend_ps(V1, V2, M) \
    +  (__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
    +                                   (__v4sf)(__m128)(V2), (int)(M))
    +
    +/// Returns a 128-bit vector of [2 x double] where the values are
    +///    selected from either the first or second operand as specified by the
    +///    third operand, the control mask.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
    +///
    +/// \param __V1
    +///    A 128-bit vector of [2 x double].
    +/// \param __V2
    +///    A 128-bit vector of [2 x double].
    +/// \param __M
    +///    A 128-bit vector operand, with mask bits 127 and 63 specifying how the
    +///    values are to be copied. The position of the mask bit corresponds to the
    +///    most significant bit of a copied value. When a mask bit is 0, the
    +///    corresponding 64-bit element in operand \a __V1 is copied to the same
    +///    position in the result. When a mask bit is 1, the corresponding 64-bit
    +///    element in operand \a __V2 is copied to the same position in the result.
    +/// \returns A 128-bit vector of [2 x double] containing the copied values.
    +static __inline__ __m128d __DEFAULT_FN_ATTRS
    +_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
    +{
    +  return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
    +                                            (__v2df)__M);
    +}
    +
    +/// Returns a 128-bit vector of [4 x float] where the values are
    +///    selected from either the first or second operand as specified by the
    +///    third operand, the control mask.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction.
    +///
    +/// \param __V1
    +///    A 128-bit vector of [4 x float].
    +/// \param __V2
    +///    A 128-bit vector of [4 x float].
    +/// \param __M
    +///    A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
    +///    how the values are to be copied. The position of the mask bit corresponds
    +///    to the most significant bit of a copied value. When a mask bit is 0, the
    +///    corresponding 32-bit element in operand \a __V1 is copied to the same
    +///    position in the result. When a mask bit is 1, the corresponding 32-bit
    +///    element in operand \a __V2 is copied to the same position in the result.
    +/// \returns A 128-bit vector of [4 x float] containing the copied values.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
    +{
    +  return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
    +                                           (__v4sf)__M);
    +}
    +
    +/// Returns a 128-bit vector of [16 x i8] where the values are selected
    +///    from either of the first or second operand as specified by the third
    +///    operand, the control mask.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction.
    +///
    +/// \param __V1
    +///    A 128-bit vector of [16 x i8].
    +/// \param __V2
    +///    A 128-bit vector of [16 x i8].
    +/// \param __M
    +///    A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying
    +///    how the values are to be copied. The position of the mask bit corresponds
    +///    to the most significant bit of a copied value. When a mask bit is 0, the
    +///    corresponding 8-bit element in operand \a __V1 is copied to the same
    +///    position in the result. When a mask bit is 1, the corresponding 8-bit
    +///    element in operand \a __V2 is copied to the same position in the result.
    +/// \returns A 128-bit vector of [16 x i8] containing the copied values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
    +{
    +  return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
    +                                               (__v16qi)__M);
    +}
    +
    +/// Returns a 128-bit vector of [8 x i16] where the values are selected
    +///    from either of the first or second operand as specified by the third
    +///    operand, the control mask.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction.
    +///
    +/// \param V1
    +///    A 128-bit vector of [8 x i16].
    +/// \param V2
    +///    A 128-bit vector of [8 x i16].
    +/// \param M
    +///    An immediate integer operand, with mask bits [7:0] specifying how the
    +///    values are to be copied. The position of the mask bit corresponds to the
    +///    index of a copied value. When a mask bit is 0, the corresponding 16-bit
    +///    element in operand \a V1 is copied to the same position in the result.
    +///    When a mask bit is 1, the corresponding 16-bit element in operand \a V2
    +///    is copied to the same position in the result.
    +/// \returns A 128-bit vector of [8 x i16] containing the copied values.
    +#define _mm_blend_epi16(V1, V2, M) \
    +  (__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
    +                                       (__v8hi)(__m128i)(V2), (int)(M))
    +
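    A short usage sketch for the blend intrinsics above (illustrative only, not
    part of the patch; assumes an SSE4.1-capable build). With the immediate mask
    0xF0, words 0-3 are taken from the first operand and words 4-7 from the second:
    
        #include <smmintrin.h>
        #include <stdio.h>
    
        int main(void)
        {
            __m128i a = _mm_set1_epi16(1);
            __m128i b = _mm_set1_epi16(2);
            __m128i r = _mm_blend_epi16(a, b, 0xF0);   /* 1 1 1 1 2 2 2 2 */
            short out[8];
            _mm_storeu_si128((__m128i *)out, r);
            for (int i = 0; i < 8; i++)
                printf("%d ", out[i]);
            printf("\n");
            return 0;
        }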
    +/* SSE4 Dword Multiply Instructions.  */
    +/// Multiplies corresponding elements of two 128-bit vectors of [4 x i32]
    +///    and returns the lower 32 bits of each product in a 128-bit vector of
    +///    [4 x i32].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction.
    +///
    +/// \param __V1
    +///    A 128-bit integer vector.
    +/// \param __V2
    +///    A 128-bit integer vector.
    +/// \returns A 128-bit integer vector containing the products of both operands.
    +static __inline__  __m128i __DEFAULT_FN_ATTRS
    +_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
    +{
    +  return (__m128i) ((__v4su)__V1 * (__v4su)__V2);
    +}
    +
    +/// Multiplies corresponding even-indexed elements of two 128-bit
    +///    vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
    +///    containing the products.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction.
    +///
    +/// \param __V1
    +///    A 128-bit vector of [4 x i32].
    +/// \param __V2
    +///    A 128-bit vector of [4 x i32].
    +/// \returns A 128-bit vector of [2 x i64] containing the products of both
    +///    operands.
    +static __inline__  __m128i __DEFAULT_FN_ATTRS
    +_mm_mul_epi32 (__m128i __V1, __m128i __V2)
    +{
    +  return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
    +}
    +
    +/* SSE4 Floating Point Dot Product Instructions.  */
    +/// Computes the dot product of the two 128-bit vectors of [4 x float]
    +///    and returns it in the elements of the 128-bit result vector of
    +///    [4 x float].
    +///
    +///    The immediate integer operand controls which input elements
    +///    will contribute to the dot product, and where the final results are
    +///    returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
    +///
    +/// \param X
    +///    A 128-bit vector of [4 x float].
    +/// \param Y
    +///    A 128-bit vector of [4 x float].
    +/// \param M
    +///    An immediate integer operand. Mask bits [7:4] determine which elements
    +///    of the input vectors are used, with bit [4] corresponding to the lowest
    +///    element and bit [7] corresponding to the highest element of each [4 x
    +///    float] vector. If a bit is set, the corresponding elements from the two
    +///    input vectors are used as an input for dot product; otherwise that input
    +///    is treated as zero. Bits [3:0] determine which elements of the result
    +///    will receive a copy of the final dot product, with bit [0] corresponding
    +///    to the lowest element and bit [3] corresponding to the highest element of
    +///    each [4 x float] subvector. If a bit is set, the dot product is returned
    +///    in the corresponding element; otherwise that element is set to zero.
    +/// \returns A 128-bit vector of [4 x float] containing the dot product.
    +#define _mm_dp_ps(X, Y, M) \
    +  (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
    +                               (__v4sf)(__m128)(Y), (M))
    +
    +/// Computes the dot product of the two 128-bit vectors of [2 x double]
    +///    and returns it in the elements of the 128-bit result vector of
    +///    [2 x double].
    +///
    +///    The immediate integer operand controls which input
    +///    elements will contribute to the dot product, and where the final results
    +///    are returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
    +///
    +/// \param X
    +///    A 128-bit vector of [2 x double].
    +/// \param Y
    +///    A 128-bit vector of [2 x double].
    +/// \param M
    +///    An immediate integer operand. Mask bits [5:4] determine which elements
    +///    of the input vectors are used, with bit [4] corresponding to the lowest
    +///    element and bit [5] corresponding to the highest element of each of [2 x
    +///    double] vector. If a bit is set, the corresponding elements from the two
    +///    input vectors are used as an input for dot product; otherwise that input
    +///    is treated as zero. Bits [1:0] determine which elements of the result
    +///    will receive a copy of the final dot product, with bit [0] corresponding
    +///    to the lowest element and bit [1] corresponding to the highest element of
    +///    each [2 x double] vector. If a bit is set, the dot product is returned in
    +///    the corresponding element; otherwise that element is set to zero.
    +#define _mm_dp_pd(X, Y, M) \
    +  (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
    +                                (__v2df)(__m128d)(Y), (M))
    +
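    A short usage sketch for _mm_dp_ps above (illustrative only, not part of the
    patch; assumes an SSE4.1-capable build). Mask 0xF1 uses all four lanes of both
    inputs and writes the dot product only to lane 0:
    
        #include <smmintrin.h>
        #include <stdio.h>
    
        int main(void)
        {
            __m128 x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
            __m128 y = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f);
            __m128 d = _mm_dp_ps(x, y, 0xF1);
            printf("%g\n", _mm_cvtss_f32(d));   /* 1*5 + 2*6 + 3*7 + 4*8 = 70 */
            return 0;
        }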
    +/* SSE4 Streaming Load Hint Instruction.  */
    +/// Loads integer values from a 128-bit aligned memory location to a
    +///    128-bit integer vector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction.
    +///
    +/// \param __V
    +///    A pointer to a 128-bit aligned memory location that contains the integer
    +///    values.
    +/// \returns A 128-bit integer vector containing the data stored at the
    +///    specified memory location.
    +static __inline__  __m128i __DEFAULT_FN_ATTRS
    +_mm_stream_load_si128 (__m128i const *__V)
    +{
    +#ifdef __GNUC__
    +  return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __V);
    +#else
    +  return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V);
    +#endif
    +}
    +
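
    A minimal usage sketch of the streaming load (illustrative only, not part
    of the patch; it assumes a 16-byte aligned source buffer and -msse4.1):

        #include <smmintrin.h>

        /* Example helper: non-temporal load of one 16-byte block; src must be
           16-byte aligned, otherwise the underlying MOVNTDQA instruction
           faults. */
        static __m128i load_block(const void *src)
        {
            return _mm_stream_load_si128((__m128i const *)src);
        }
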
    +/* SSE4 Packed Integer Min/Max Instructions.  */
    +/// Compares the corresponding elements of two 128-bit vectors of
    +///    [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
    +///    of the two values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction.
    +///
    +/// \param __V1
    +///    A 128-bit vector of [16 x i8].
    +/// \param __V2
    +///    A 128-bit vector of [16 x i8].
    +/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
    +static __inline__  __m128i __DEFAULT_FN_ATTRS
    +_mm_min_epi8 (__m128i __V1, __m128i __V2)
    +{
    +  return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
    +}
    +
    +/// Compares the corresponding elements of two 128-bit vectors of
    +///    [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
    +///    greater value of the two.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction.
    +///
    +/// \param __V1
    +///    A 128-bit vector of [16 x i8].
    +/// \param __V2
    +///    A 128-bit vector of [16 x i8].
    +/// \returns A 128-bit vector of [16 x i8] containing the greater values.
    +static __inline__  __m128i __DEFAULT_FN_ATTRS
    +_mm_max_epi8 (__m128i __V1, __m128i __V2)
    +{
    +  return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
    +}
    +
    +/// Compares the corresponding elements of two 128-bit vectors of
    +///    [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
    +///    value of the two.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction.
    +///
    +/// \param __V1
    +///    A 128-bit vector of [8 x u16].
    +/// \param __V2
    +///    A 128-bit vector of [8 x u16].
    +/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
    +static __inline__  __m128i __DEFAULT_FN_ATTRS
    +_mm_min_epu16 (__m128i __V1, __m128i __V2)
    +{
    +  return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
    +}
    +
    +/// Compares the corresponding elements of two 128-bit vectors of
    +///    [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
    +///    greater value of the two.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction.
    +///
    +/// \param __V1
    +///    A 128-bit vector of [8 x u16].
    +/// \param __V2
    +///    A 128-bit vector of [8 x u16].
    +/// \returns A 128-bit vector of [8 x u16] containing the greater values.
    +static __inline__  __m128i __DEFAULT_FN_ATTRS
    +_mm_max_epu16 (__m128i __V1, __m128i __V2)
    +{
    +  return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
    +}
    +
    +/// Compares the corresponding elements of two 128-bit vectors of
    +///    [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
    +///    value of the two.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction.
    +///
    +/// \param __V1
    +///    A 128-bit vector of [4 x i32].
    +/// \param __V2
    +///    A 128-bit vector of [4 x i32].
    +/// \returns A 128-bit vector of [4 x i32] containing the lesser values.
    +static __inline__  __m128i __DEFAULT_FN_ATTRS
    +_mm_min_epi32 (__m128i __V1, __m128i __V2)
    +{
    +  return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
    +}
    +
    +/// Compares the corresponding elements of two 128-bit vectors of
    +///    [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
    +///    greater value of the two.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction.
    +///
    +/// \param __V1
    +///    A 128-bit vector of [4 x i32].
    +/// \param __V2
    +///    A 128-bit vector of [4 x i32].
    +/// \returns A 128-bit vector of [4 x i32] containing the greater values.
    +static __inline__  __m128i __DEFAULT_FN_ATTRS
    +_mm_max_epi32 (__m128i __V1, __m128i __V2)
    +{
    +  return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
    +}
    +
    +/// Compares the corresponding elements of two 128-bit vectors of
    +///    [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
    +///    value of the two.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction.
    +///
    +/// \param __V1
    +///    A 128-bit vector of [4 x u32].
    +/// \param __V2
    +///    A 128-bit vector of [4 x u32].
    +/// \returns A 128-bit vector of [4 x u32] containing the lesser values.
    +static __inline__  __m128i __DEFAULT_FN_ATTRS
    +_mm_min_epu32 (__m128i __V1, __m128i __V2)
    +{
    +  return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
    +}
    +
    +/// Compares the corresponding elements of two 128-bit vectors of
    +///    [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
    +///    greater value of the two.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction.
    +///
    +/// \param __V1
    +///    A 128-bit vector of [4 x u32].
    +/// \param __V2
    +///    A 128-bit vector of [4 x u32].
    +/// \returns A 128-bit vector of [4 x u32] containing the greater values.
    +static __inline__  __m128i __DEFAULT_FN_ATTRS
    +_mm_max_epu32 (__m128i __V1, __m128i __V2)
    +{
    +  return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
    +}
    +
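
    A common use of the packed min/max pairs above is per-lane clamping; a
    small sketch (clamp_epi32 is an illustrative helper, not part of the
    header; assumes -msse4.1):

        #include <smmintrin.h>

        /* Example: clamp every signed 32-bit lane of v into the range
           [lo, hi]. */
        static __m128i clamp_epi32(__m128i v, __m128i lo, __m128i hi)
        {
            return _mm_min_epi32(_mm_max_epi32(v, lo), hi);
        }
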
    +/* SSE4 Insertion and Extraction from XMM Register Instructions.  */
    +/// Takes the first argument \a X and inserts an element from the second
    +///    argument \a Y as selected by the third argument \a N. That result then
    +///    has elements zeroed out also as selected by the third argument \a N. The
    +///    resulting 128-bit vector of [4 x float] is then returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VINSERTPS </c> instruction.
    +///
    +/// \param X
    +///    A 128-bit vector source operand of [4 x float]. With the exception of
    +///    those bits in the result copied from parameter \a Y and zeroed by bits
    +///    [3:0] of \a N, all bits from this parameter are copied to the result.
    +/// \param Y
    +///    A 128-bit vector source operand of [4 x float]. One single-precision
    +///    floating-point element from this source, as determined by the immediate
    +///    parameter, is copied to the result.
    +/// \param N
    +///    Specifies which bits from operand \a Y will be copied, which bits in the
    +///    result they will be copied to, and which bits in the result will be
    +///    cleared. The following assignments are made: \n
    +///    Bits [7:6] specify the bits to copy from operand \a Y: \n
    +///      00: Selects bits [31:0] from operand \a Y. \n
    +///      01: Selects bits [63:32] from operand \a Y. \n
    +///      10: Selects bits [95:64] from operand \a Y. \n
    +///      11: Selects bits [127:96] from operand \a Y. \n
    +///    Bits [5:4] specify the bits in the result to which the selected bits
    +///    from operand \a Y are copied: \n
    +///      00: Copies the selected bits from \a Y to result bits [31:0]. \n
    +///      01: Copies the selected bits from \a Y to result bits [63:32]. \n
    +///      10: Copies the selected bits from \a Y to result bits [95:64]. \n
    +///      11: Copies the selected bits from \a Y to result bits [127:96]. \n
    +///    Bits[3:0]: If any of these bits are set, the corresponding result
    +///    element is cleared.
    +/// \returns A 128-bit vector of [4 x float] containing the copied
    +///    single-precision floating point elements from the operands.
    +#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
    +
    +/// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
    +///    returns it, using the immediate value parameter \a N as a selector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// int _mm_extract_ps(__m128 X, const int N);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
    +/// instruction.
    +///
    +/// \param X
    +///    A 128-bit vector of [4 x float].
    +/// \param N
    +///    An immediate value. Bits [1:0] determine which bits from the argument
    +///    \a X are extracted and returned: \n
    +///    00: Bits [31:0] of parameter \a X are returned. \n
    +///    01: Bits [63:32] of parameter \a X are returned. \n
    +///    10: Bits [95:64] of parameter \a X are returned. \n
    +///    11: Bits [127:96] of parameter \a X are returned.
    +/// \returns A 32-bit integer containing the extracted 32 bits of float data.
    +#define _mm_extract_ps(X, N) (__extension__                      \
    +  ({ union { int __i; float __f; } __t;  \
    +     __t.__f = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \
    +     __t.__i;}))
    +
    +/* Miscellaneous insert and extract macros.  */
    +/* Extract a single-precision float from X at index N into D.  */
    +#define _MM_EXTRACT_FLOAT(D, X, N) \
    +  { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); }
    +
    +/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
    +   an index suitable for _mm_insert_ps.  */
    +#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
    +
    +/* Extract a float from X at index N into the first index of the return.  */
    +#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X),   \
    +                                             _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
    +
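
    To make the VINSERTPS immediate encoding concrete, a small sketch
    (pick_and_place is an illustrative helper, not part of the patch;
    assumes -msse4.1):

        #include <smmintrin.h>

        /* Example: copy element 0 of y into element 2 of x, then read the
           inserted element back as a float. */
        static float pick_and_place(__m128 x, __m128 y)
        {
            /* Source element 0, destination element 2, zero mask 0. */
            __m128 r = _mm_insert_ps(x, y, _MM_MK_INSERTPS_NDX(0, 2, 0));
            float f;

            _MM_EXTRACT_FLOAT(f, r, 2);
            return f;
        }
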
    +/* Insert int into packed integer array at index.  */
    +/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
    +///    the 128-bit integer vector parameter, and then inserting the lower 8 bits
    +///    of an integer parameter \a I into an offset specified by the immediate
    +///    value parameter \a N.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> 
instruction.
    +///
    +/// \param X
    +///    A 128-bit integer vector of [16 x i8]. This vector is copied to the
    +///    result and then one of the sixteen elements in the result vector is
    +///    replaced by the lower 8 bits of \a I.
    +/// \param I
    +///    An integer. The lower 8 bits of this operand are written to the 
result
    +///    beginning at the offset specified by \a N.
    +/// \param N
    +///    An immediate value. Bits [3:0] specify the bit offset in the result 
at
    +///    which the lower 8 bits of \a I are written. \n
    +///    0000: Bits [7:0] of the result are used for insertion. \n
    +///    0001: Bits [15:8] of the result are used for insertion. \n
    +///    0010: Bits [23:16] of the result are used for insertion. \n
    +///    0011: Bits [31:24] of the result are used for insertion. \n
    +///    0100: Bits [39:32] of the result are used for insertion. \n
    +///    0101: Bits [47:40] of the result are used for insertion. \n
    +///    0110: Bits [55:48] of the result are used for insertion. \n
    +///    0111: Bits [63:56] of the result are used for insertion. \n
    +///    1000: Bits [71:64] of the result are used for insertion. \n
    +///    1001: Bits [79:72] of the result are used for insertion. \n
    +///    1010: Bits [87:80] of the result are used for insertion. \n
    +///    1011: Bits [95:88] of the result are used for insertion. \n
    +///    1100: Bits [103:96] of the result are used for insertion. \n
    +///    1101: Bits [111:104] of the result are used for insertion. \n
    +///    1110: Bits [119:112] of the result are used for insertion. \n
    +///    1111: Bits [127:120] of the result are used for insertion.
    +/// \returns A 128-bit integer vector containing the constructed values.
    +#define _mm_insert_epi8(X, I, N) \
    +  (__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
    +                                        (int)(I), (int)(N))
    +
    +/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
    +///    the 128-bit integer vector parameter, and then inserting the 32-bit
    +///    integer parameter \a I at the offset specified by the immediate 
value
    +///    parameter \a N.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> 
instruction.
    +///
    +/// \param X
    +///    A 128-bit integer vector of [4 x i32]. This vector is copied to the
    +///    result and then one of the four elements in the result vector is
    +///    replaced by \a I.
    +/// \param I
    +///    A 32-bit integer that is written to the result beginning at the 
offset
    +///    specified by \a N.
    +/// \param N
    +///    An immediate value. Bits [1:0] specify the bit offset in the result 
at
    +///    which the integer \a I is written. \n
    +///    00: Bits [31:0] of the result are used for insertion. \n
    +///    01: Bits [63:32] of the result are used for insertion. \n
    +///    10: Bits [95:64] of the result are used for insertion. \n
    +///    11: Bits [127:96] of the result are used for insertion.
    +/// \returns A 128-bit integer vector containing the constructed values.
    +#define _mm_insert_epi32(X, I, N) \
    +  (__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
    +                                       (int)(I), (int)(N))
    +
    +#ifdef __x86_64__
    +/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
    +///    the 128-bit integer vector parameter, and then inserting the 64-bit
    +///    integer parameter \a I, using the immediate value parameter \a N as 
an
    +///    insertion location selector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> 
instruction.
    +///
    +/// \param X
    +///    A 128-bit integer vector of [2 x i64]. This vector is copied to the
    +///    result and then one of the two elements in the result vector is 
replaced
    +///    by \a I.
    +/// \param I
    +///    A 64-bit integer that is written to the result beginning at the 
offset
    +///    specified by \a N.
    +/// \param N
    +///    An immediate value. Bit [0] specifies the bit offset in the result 
at
    +///    which the integer \a I is written. \n
    +///    0: Bits [63:0] of the result are used for insertion. \n
    +///    1: Bits [127:64] of the result are used for insertion. \n
    +/// \returns A 128-bit integer vector containing the constructed values.
    +#define _mm_insert_epi64(X, I, N) \
    +  (__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
    +                                       (long long)(I), (int)(N))
    +#endif /* __x86_64__ */
    +
    +/* Extract int from packed integer array at index.  This returns the 
element
    + * as a zero extended value, so it is unsigned.
    + */
    +/// Extracts an 8-bit element from the 128-bit integer vector of
    +///    [16 x i8], using the immediate value parameter \a N as a selector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// int _mm_extract_epi8(__m128i X, const int N);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> 
instruction.
    +///
    +/// \param X
    +///    A 128-bit integer vector.
    +/// \param N
    +///    An immediate value. Bits [3:0] specify which 8-bit vector element 
from
    +///    the argument \a X to extract and copy to the result. \n
    +///    0000: Bits [7:0] of parameter \a X are extracted. \n
    +///    0001: Bits [15:8] of the parameter \a X are extracted. \n
    +///    0010: Bits [23:16] of the parameter \a X are extracted. \n
    +///    0011: Bits [31:24] of the parameter \a X are extracted. \n
    +///    0100: Bits [39:32] of the parameter \a X are extracted. \n
    +///    0101: Bits [47:40] of the parameter \a X are extracted. \n
    +///    0110: Bits [55:48] of the parameter \a X are extracted. \n
    +///    0111: Bits [63:56] of the parameter \a X are extracted. \n
    +///    1000: Bits [71:64] of the parameter \a X are extracted. \n
    +///    1001: Bits [79:72] of the parameter \a X are extracted. \n
    +///    1010: Bits [87:80] of the parameter \a X are extracted. \n
    +///    1011: Bits [95:88] of the parameter \a X are extracted. \n
    +///    1100: Bits [103:96] of the parameter \a X are extracted. \n
    +///    1101: Bits [111:104] of the parameter \a X are extracted. \n
    +///    1110: Bits [119:112] of the parameter \a X are extracted. \n
    +///    1111: Bits [127:120] of the parameter \a X are extracted.
    +/// \returns  An unsigned integer, whose lower 8 bits are selected from the
    +///    128-bit integer vector parameter and the remaining bits are assigned
    +///    zeros.
    +#define _mm_extract_epi8(X, N) \
    +  (int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
    +                                                   (int)(N))
    +
    +/// Extracts a 32-bit element from the 128-bit integer vector of
    +///    [4 x i32], using the immediate value parameter \a N as a selector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// int _mm_extract_epi32(__m128i X, const int N);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> 
instruction.
    +///
    +/// \param X
    +///    A 128-bit integer vector.
    +/// \param N
    +///    An immediate value. Bits [1:0] specify which 32-bit vector element 
from
    +///    the argument \a X to extract and copy to the result. \n
    +///    00: Bits [31:0] of the parameter \a X are extracted. \n
    +///    01: Bits [63:32] of the parameter \a X are extracted. \n
    +///    10: Bits [95:64] of the parameter \a X are extracted. \n
    +///    11: Bits [127:96] of the parameter \a X are extracted.
    +/// \returns  An integer, whose lower 32 bits are selected from the 128-bit
    +///    integer vector parameter and the remaining bits are assigned zeros.
    +#define _mm_extract_epi32(X, N) \
    +  (int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N))
    +
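
    The insert/extract pairs take compile-time constant selectors; a minimal
    round-trip sketch (illustrative only, assumes -msse4.1):

        #include <smmintrin.h>

        /* Example: write a value into 32-bit lane 1 and read it back. */
        static int roundtrip_lane1(void)
        {
            __m128i v = _mm_setzero_si128();

            v = _mm_insert_epi32(v, 42, 1);   /* write 42 into lane 1 */
            return _mm_extract_epi32(v, 1);   /* reads 42 back */
        }
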
    +#ifdef __x86_64__
    +/// Extracts a 64-bit element from the 128-bit integer vector of
    +///    [2 x i64], using the immediate value parameter \a N as a selector.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// long long _mm_extract_epi64(__m128i X, const int N);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> 
instruction.
    +///
    +/// \param X
    +///    A 128-bit integer vector.
    +/// \param N
    +///    An immediate value. Bit [0] specifies which 64-bit vector element 
from
    +///    the argument \a X to return. \n
    +///    0: Bits [63:0] are returned. \n
    +///    1: Bits [127:64] are returned. \n
    +/// \returns  A 64-bit integer.
    +#define _mm_extract_epi64(X, N) \
    +  (long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N))
    +#endif /* __x86_64 */
    +
    +/* SSE4 128-bit Packed Integer Comparisons.  */
    +/// Tests whether the specified bits in a 128-bit integer vector are all
    +///    zeros.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
    +///
    +/// \param __M
    +///    A 128-bit integer vector containing the bits to be tested.
    +/// \param __V
    +///    A 128-bit integer vector selecting which bits to test in operand \a 
__M.
    +/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_testz_si128(__m128i __M, __m128i __V)
    +{
    +  return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
    +}
    +
    +/// Tests whether the specified bits in a 128-bit integer vector are all
    +///    ones.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
    +///
    +/// \param __M
    +///    A 128-bit integer vector containing the bits to be tested.
    +/// \param __V
    +///    A 128-bit integer vector selecting which bits to test in operand \a 
__M.
    +/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_testc_si128(__m128i __M, __m128i __V)
    +{
    +  return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
    +}
    +
    +/// Tests whether the specified bits in a 128-bit integer vector are
    +///    neither all zeros nor all ones.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
    +///
    +/// \param __M
    +///    A 128-bit integer vector containing the bits to be tested.
    +/// \param __V
    +///    A 128-bit integer vector selecting which bits to test in operand \a 
__M.
    +/// \returns TRUE if the specified bits are neither all zeros nor all ones;
    +///    FALSE otherwise.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_testnzc_si128(__m128i __M, __m128i __V)
    +{
    +  return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
    +}
    +
    +/// Tests whether the specified bits in a 128-bit integer vector are all
    +///    ones.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// int _mm_test_all_ones(__m128i V);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
    +///
    +/// \param V
    +///    A 128-bit integer vector containing the bits to be tested.
    +/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE
    +///    otherwise.
    +#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
    +
    +/// Tests whether the specified bits in a 128-bit integer vector are
    +///    neither all zeros nor all ones.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
    +///
    +/// \param M
    +///    A 128-bit integer vector containing the bits to be tested.
    +/// \param V
    +///    A 128-bit integer vector selecting which bits to test in operand \a 
M.
    +/// \returns TRUE if the specified bits are neither all zeros nor all ones;
    +///    FALSE otherwise.
    +#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
    +
    +/// Tests whether the specified bits in a 128-bit integer vector are all
    +///    zeros.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// int _mm_test_all_zeros(__m128i M, __m128i V);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
    +///
    +/// \param M
    +///    A 128-bit integer vector containing the bits to be tested.
    +/// \param V
    +///    A 128-bit integer vector selecting which bits to test in operand \a 
M.
    +/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
    +#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
    +
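
    The PTEST-based helpers above are typically used for quick whole-vector
    checks; a brief sketch (example helpers, not part of the header; assumes
    -msse4.1):

        #include <smmintrin.h>

        /* Example: non-zero when every bit of v is 0. */
        static int vec_is_zero(__m128i v)
        {
            return _mm_test_all_zeros(v, v);
        }

        /* Example: non-zero when every bit of v is 1. */
        static int vec_is_ones(__m128i v)
        {
            return _mm_test_all_ones(v);
        }
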
    +/* SSE4 64-bit Packed Integer Comparisons.  */
    +/// Compares each of the corresponding 64-bit values of the 128-bit
    +///    integer vectors for equality.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> 
instruction.
    +///
    +/// \param __V1
    +///    A 128-bit integer vector.
    +/// \param __V2
    +///    A 128-bit integer vector.
    +/// \returns A 128-bit integer vector containing the comparison results.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
    +{
    +  return (__m128i)((__v2di)__V1 == (__v2di)__V2);
    +}
    +
    +/* SSE4 Packed Integer Sign-Extension.  */
    +/// Sign-extends each of the lower eight 8-bit integer elements of a
    +///    128-bit vector of [16 x i8] to 16-bit values and returns them in a
    +///    128-bit vector of [8 x i16]. The upper eight elements of the input 
vector
    +///    are unused.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> 
instruction.
    +///
    +/// \param __V
    +///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are 
sign-
    +///    extended to 16-bit values.
    +/// \returns A 128-bit vector of [8 x i16] containing the sign-extended 
values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cvtepi8_epi16(__m128i __V)
    +{
    +#ifdef __GNUC__
    +  return (__m128i) __builtin_ia32_pmovsxbw128 ((__v16qi)__V);
    +#else
    +  /* This function always performs a signed extension, but __v16qi is a char
    +     which may be signed or unsigned, so use __v16qs. */
    +  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
    +#endif
    +}
    +
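
    As an example of the sign-extension group, widening eight signed bytes to
    16-bit lanes (widen_s8_to_s16 is an illustrative helper, not part of the
    patch; assumes -msse4.1):

        #include <smmintrin.h>

        /* Example: load 8 signed bytes into the low half of a register and
           sign-extend them to eight signed 16-bit lanes. */
        static __m128i widen_s8_to_s16(const signed char *p)
        {
            __m128i lo = _mm_loadl_epi64((const __m128i *)p);

            return _mm_cvtepi8_epi16(lo);
        }
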
    +/// Sign-extends each of the lower four 8-bit integer elements of a
    +///    128-bit vector of [16 x i8] to 32-bit values and returns them in a
    +///    128-bit vector of [4 x i32]. The upper twelve elements of the input
    +///    vector are unused.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> 
instruction.
    +///
    +/// \param __V
    +///    A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
    +///    sign-extended to 32-bit values.
    +/// \returns A 128-bit vector of [4 x i32] containing the sign-extended 
values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cvtepi8_epi32(__m128i __V)
    +{
    +  /* This function always performs a signed extension, but __v16qi is a char
    +     which may be signed or unsigned, so use __v16qs. */
    +#ifdef __GNUC__
    +  return (__m128i) __builtin_ia32_pmovsxbd128 ((__v16qi)__V);
    +#else
    +  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
    +#endif
    +}
    +
    +/// Sign-extends each of the lower two 8-bit integer elements of a
    +///    128-bit integer vector of [16 x i8] to 64-bit values and returns 
them in
    +///    a 128-bit vector of [2 x i64]. The upper fourteen elements of the 
input
    +///    vector are unused.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> 
instruction.
    +///
    +/// \param __V
    +///    A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
    +///    sign-extended to 64-bit values.
    +/// \returns A 128-bit vector of [2 x i64] containing the sign-extended 
values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cvtepi8_epi64(__m128i __V)
    +{
    +#ifdef __GNUC__
    +  return (__m128i) __builtin_ia32_pmovsxbq128 ((__v16qi)__V);
    +#else
    +  /* This function always performs a signed extension, but __v16qi is a char
    +     which may be signed or unsigned, so use __v16qs. */
    +  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
    +#endif
    +}
    +
    +/// Sign-extends each of the lower four 16-bit integer elements of a
    +///    128-bit integer vector of [8 x i16] to 32-bit values and returns 
them in
    +///    a 128-bit vector of [4 x i32]. The upper four elements of the input
    +///    vector are unused.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> 
instruction.
    +///
    +/// \param __V
    +///    A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
    +///    sign-extended to 32-bit values.
    +/// \returns A 128-bit vector of [4 x i32] containing the sign-extended 
values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cvtepi16_epi32(__m128i __V)
    +{
    +#ifdef __GNUC__
    +  return (__m128i) __builtin_ia32_pmovsxwd128 ((__v8hi)__V);
    +#else
    +  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
    +#endif
    +}
    +
    +/// Sign-extends each of the lower two 16-bit integer elements of a
    +///    128-bit integer vector of [8 x i16] to 64-bit values and returns 
them in
    +///    a 128-bit vector of [2 x i64]. The upper six elements of the input
    +///    vector are unused.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> 
instruction.
    +///
    +/// \param __V
    +///    A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
    +///     sign-extended to 64-bit values.
    +/// \returns A 128-bit vector of [2 x i64] containing the sign-extended 
values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cvtepi16_epi64(__m128i __V)
    +{
    +#ifdef __GNUC__
    +  return (__m128i) __builtin_ia32_pmovsxwq128 ((__v8hi)__V);
    +#else
    +  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
    +#endif
    +}
    +
    +/// Sign-extends each of the lower two 32-bit integer elements of a
    +///    128-bit integer vector of [4 x i32] to 64-bit values and returns 
them in
    +///    a 128-bit vector of [2 x i64]. The upper two elements of the input 
vector
    +///    are unused.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> 
instruction.
    +///
    +/// \param __V
    +///    A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
    +///    sign-extended to 64-bit values.
    +/// \returns A 128-bit vector of [2 x i64] containing the sign-extended 
values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cvtepi32_epi64(__m128i __V)
    +{
    +#ifdef __GNUC__
    +  return (__m128i) __builtin_ia32_pmovsxdq128 ((__v4si)__V);
    +#else
    +  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
    +#endif
    +}
    +
    +/* SSE4 Packed Integer Zero-Extension.  */
    +/// Zero-extends each of the lower eight 8-bit integer elements of a
    +///    128-bit vector of [16 x i8] to 16-bit values and returns them in a
    +///    128-bit vector of [8 x i16]. The upper eight elements of the input 
vector
    +///    are unused.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> 
instruction.
    +///
    +/// \param __V
    +///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
    +///    zero-extended to 16-bit values.
    +/// \returns A 128-bit vector of [8 x i16] containing the zero-extended 
values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cvtepu8_epi16(__m128i __V)
    +{
    +#ifdef __GNUC__
    +  return (__m128i) __builtin_ia32_pmovzxbw128 ((__v16qi)__V);
    +#else
    +  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
    +#endif
    +}
    +
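
    The zero-extension variants are the usual way to widen unsigned data such
    as 8-bit pixels before arithmetic; a sketch under the same assumptions as
    the previous examples (illustrative helper only):

        #include <smmintrin.h>

        /* Example: widen eight unsigned bytes to eight unsigned 16-bit lanes
           so that subsequent additions cannot overflow 8 bits. */
        static __m128i widen_u8_to_u16(const unsigned char *p)
        {
            return _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i *)p));
        }
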
    +/// Zero-extends each of the lower four 8-bit integer elements of a
    +///    128-bit vector of [16 x i8] to 32-bit values and returns them in a
    +///    128-bit vector of [4 x i32]. The upper twelve elements of the input
    +///    vector are unused.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> 
instruction.
    +///
    +/// \param __V
    +///    A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
    +///    zero-extended to 32-bit values.
    +/// \returns A 128-bit vector of [4 x i32] containing the zero-extended 
values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cvtepu8_epi32(__m128i __V)
    +{
    +#ifdef __GNUC__
    +  return (__m128i) __builtin_ia32_pmovzxbd128 ((__v16qi)__V);
    +#else
    +  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
    +#endif
    +}
    +
    +/// Zero-extends each of the lower two 8-bit integer elements of a
    +///    128-bit integer vector of [16 x i8] to 64-bit values and returns 
them in
    +///    a 128-bit vector of [2 x i64]. The upper fourteen elements of the 
input
    +///    vector are unused.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> 
instruction.
    +///
    +/// \param __V
    +///    A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
    +///    zero-extended to 64-bit values.
    +/// \returns A 128-bit vector of [2 x i64] containing the zero-extended 
values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cvtepu8_epi64(__m128i __V)
    +{
    +#ifdef __GNUC__
    +  return (__m128i) __builtin_ia32_pmovzxbq128 ((__v16qi)__V);
    +#else
    +  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
    +#endif
    +}
    +
    +/// Zero-extends each of the lower four 16-bit integer elements of a
    +///    128-bit integer vector of [8 x i16] to 32-bit values and returns 
them in
    +///    a 128-bit vector of [4 x i32]. The upper four elements of the input
    +///    vector are unused.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> 
instruction.
    +///
    +/// \param __V
    +///    A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
    +///    zero-extended to 32-bit values.
    +/// \returns A 128-bit vector of [4 x i32] containing the zero-extended 
values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cvtepu16_epi32(__m128i __V)
    +{
    +#ifdef __GNUC__
    +  return (__m128i) __builtin_ia32_pmovzxwd128 ((__v8hi)__V);
    +#else
    +  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
    +#endif
    +}
    +
    +/// Zero-extends each of the lower two 16-bit integer elements of a
    +///    128-bit integer vector of [8 x i16] to 64-bit values and returns 
them in
    +///    a 128-bit vector of [2 x i64]. The upper six elements of the input 
vector
    +///    are unused.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> 
instruction.
    +///
    +/// \param __V
    +///    A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
    +///    zero-extended to 64-bit values.
    +/// \returns A 128-bit vector of [2 x i64] containing the zero-extended 
values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cvtepu16_epi64(__m128i __V)
    +{
    +#ifdef __GNUC__
    +  return (__m128i) __builtin_ia32_pmovzxwq128 ((__v8hi)__V);
    +#else
    +  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
    +#endif
    +}
    +
    +/// Zero-extends each of the lower two 32-bit integer elements of a
    +///    128-bit integer vector of [4 x i32] to 64-bit values and returns 
them in
    +///    a 128-bit vector of [2 x i64]. The upper two elements of the input 
vector
    +///    are unused.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> 
instruction.
    +///
    +/// \param __V
    +///    A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
    +///    zero-extended to 64-bit values.
    +/// \returns A 128-bit vector of [2 x i64] containing the zero-extended 
values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cvtepu32_epi64(__m128i __V)
    +{
    +#ifdef __GNUC__
    +  return (__m128i) __builtin_ia32_pmovzxdq128 ((__v4si)__V);
    +#else
    +  return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
    +#endif
    +}
    +
    +/* SSE4 Pack with Unsigned Saturation.  */
    +/// Converts 32-bit signed integers from both 128-bit integer vector
    +///    operands into 16-bit unsigned integers, and returns the packed result.
    +///    Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
    +///    0x0000 are saturated to 0x0000.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
    +///
    +/// \param __V1
    +///    A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
    +///    signed integer and is converted to a 16-bit unsigned integer with
    +///    saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
    +///    less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
    +///    are written to the lower 64 bits of the result.
    +/// \param __V2
    +///    A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
    +///    signed integer and is converted to a 16-bit unsigned integer with
    +///    saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
    +///    less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
    +///    are written to the higher 64 bits of the result.
    +/// \returns A 128-bit vector of [8 x i16] containing the converted values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_packus_epi32(__m128i __V1, __m128i __V2)
    +{
    +  return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
    +}
    +
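
    Packing is the natural inverse of the widening shown earlier; a sketch
    (narrow_to_u16 is an illustrative helper, not part of the header):

        #include <smmintrin.h>

        /* Example: narrow eight signed 32-bit values (from lo and hi) back to
           eight unsigned 16-bit lanes, saturating to [0, 0xFFFF]. */
        static __m128i narrow_to_u16(__m128i lo, __m128i hi)
        {
            return _mm_packus_epi32(lo, hi);
        }
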
    +/* SSE4 Multiple Packed Sums of Absolute Difference.  */
    +/// Subtracts 8-bit unsigned integer values and computes the absolute
    +///    values of the differences to the corresponding bits in the destination.
    +///    Then sums of the absolute differences are returned according to the bit
    +///    fields in the immediate operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
    +///
    +/// \param X
    +///    A 128-bit vector of [16 x i8].
    +/// \param Y
    +///    A 128-bit vector of [16 x i8].
    +/// \param M
    +///    An 8-bit immediate operand specifying how the absolute differences are to
    +///    be calculated, according to the following algorithm:
    +///    \code
    +///    // M2 represents bit 2 of the immediate operand
    +///    // M10 represents bits [1:0] of the immediate operand
    +///    i = M2 * 4;
    +///    j = M10 * 4;
    +///    for (k = 0; k < 8; k = k + 1) {
    +///      d0 = abs(X[i + k + 0] - Y[j + 0]);
    +///      d1 = abs(X[i + k + 1] - Y[j + 1]);
    +///      d2 = abs(X[i + k + 2] - Y[j + 2]);
    +///      d3 = abs(X[i + k + 3] - Y[j + 3]);
    +///      r[k] = d0 + d1 + d2 + d3;
    +///    }
    +///    \endcode
    +/// \returns A 128-bit integer vector containing the sums of the sets of
    +///    absolute differences between both operands.
    +#define _mm_mpsadbw_epu8(X, Y, M) \
    +  (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
    +                                      (__v16qi)(__m128i)(Y), (M))
    +
    +/// Finds the minimum unsigned 16-bit element in the input 128-bit
    +///    vector of [8 x u16] and returns it along with its index.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
    +/// instruction.
    +///
    +/// \param __V
    +///    A 128-bit vector of [8 x u16].
    +/// \returns A 128-bit value where bits [15:0] contain the minimum value found
    +///    in parameter \a __V, bits [18:16] contain the index of the minimum value
    +///    and the remaining bits are set to 0.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_minpos_epu16(__m128i __V)
    +{
    +  return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
    +}
    +
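
    MPSADBW and PHMINPOSUW are commonly combined for block matching; a sketch
    of that pattern (best_match is an illustrative helper, not part of the
    header; assumes -msse4.1):

        #include <smmintrin.h>

        /* Example: compare the 4-byte block at offset 0 of ref against the
           eight 4-byte blocks of src starting at offsets 0..7, and return the
           offset whose sum of absolute differences is smallest. */
        static unsigned best_match(__m128i src, __m128i ref)
        {
            __m128i sads = _mm_mpsadbw_epu8(src, ref, 0);  /* [8 x u16] SADs */
            __m128i best = _mm_minpos_epu16(sads);
            unsigned r = (unsigned)_mm_cvtsi128_si32(best);

            return (r >> 16) & 0x7;  /* bits [18:16] hold the winning index */
        }
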
    +/* Handle the sse4.2 definitions here. */
    +
    +/* These definitions are normally in nmmintrin.h, but gcc puts them in here
    +   so we'll do the same.  */
    +
    +#undef __DEFAULT_FN_ATTRS
    +#ifdef __GNUC__
    +#define __DEFAULT_FN_ATTRS __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    +#else
    +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
    +#endif
    +
    +/* These specify the type of data that we're comparing.  */
    +#define _SIDD_UBYTE_OPS                 0x00
    +#define _SIDD_UWORD_OPS                 0x01
    +#define _SIDD_SBYTE_OPS                 0x02
    +#define _SIDD_SWORD_OPS                 0x03
    +
    +/* These specify the type of comparison operation.  */
    +#define _SIDD_CMP_EQUAL_ANY             0x00
    +#define _SIDD_CMP_RANGES                0x04
    +#define _SIDD_CMP_EQUAL_EACH            0x08
    +#define _SIDD_CMP_EQUAL_ORDERED         0x0c
    +
    +/* These macros specify the polarity of the operation.  */
    +#define _SIDD_POSITIVE_POLARITY         0x00
    +#define _SIDD_NEGATIVE_POLARITY         0x10
    +#define _SIDD_MASKED_POSITIVE_POLARITY  0x20
    +#define _SIDD_MASKED_NEGATIVE_POLARITY  0x30
    +
    +/* These macros are used in _mm_cmpXstri() to specify the return.  */
    +#define _SIDD_LEAST_SIGNIFICANT         0x00
    +#define _SIDD_MOST_SIGNIFICANT          0x40
    +
    +/* These macros are used in _mm_cmpXstri() to specify the return.  */
    +#define _SIDD_BIT_MASK                  0x00
    +#define _SIDD_UNIT_MASK                 0x40
    +
    +/* SSE4.2 Packed Comparison Intrinsics.  */
    +/// Uses the immediate operand \a M to perform a comparison of string
    +///    data with implicitly defined lengths that is contained in source 
operands
    +///    \a A and \a B. Returns a 128-bit integer vector representing the 
result
    +///    mask of the comparison.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c>
    +/// instruction.
    +///
    +/// \param A
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param B
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param M
    +///    An 8-bit immediate operand specifying whether the characters are 
bytes or
    +///    words, the type of comparison to perform, and the format of the 
return
    +///    value. \n
    +///    Bits [1:0]: Determine source data format. \n
    +///      00: 16 unsigned bytes \n
    +///      01: 8 unsigned words \n
    +///      10: 16 signed bytes \n
    +///      11: 8 signed words \n
    +///    Bits [3:2]: Determine comparison type and aggregation method. \n
    +///      00: Subset: Each character in \a B is compared for equality with 
all
    +///          the characters in \a A. \n
    +///      01: Ranges: Each character in \a B is compared to \a A. The 
comparison
    +///          basis is greater than or equal for even-indexed elements in 
\a A,
    +///          and less than or equal for odd-indexed elements in \a A. \n
    +///      10: Match: Compare each pair of corresponding characters in \a A 
and
    +///          \a B for equality. \n
    +///      11: Substring: Search \a B for substring matches of \a A. \n
    +///    Bits [5:4]: Determine whether to perform a one's complement on the 
bit
    +///                mask of the comparison results. \n
    +///      00: No effect. \n
    +///      01: Negate the bit mask. \n
    +///      10: No effect. \n
    +///      11: Negate the bit mask only for bits with an index less than or 
equal
    +///          to the size of \a A or \a B. \n
    +///    Bit [6]: Determines whether the result is zero-extended or expanded 
to 16
    +///             bytes. \n
    +///      0: The result is zero-extended to 16 bytes. \n
    +///      1: The result is expanded to 16 bytes (this expansion is 
performed by
    +///         repeating each bit 8 or 16 times).
    +/// \returns Returns a 128-bit integer vector representing the result mask 
of
    +///    the comparison.
    +#define _mm_cmpistrm(A, B, M) \
    +  (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
    +                                       (__v16qi)(__m128i)(B), (int)(M))
    +
    +/// Uses the immediate operand \a M to perform a comparison of string
    +///    data with implicitly defined lengths that is contained in source 
operands
    +///    \a A and \a B. Returns an integer representing the result index of 
the
    +///    comparison.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
    +/// instruction.
    +///
    +/// \param A
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param B
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param M
    +///    An 8-bit immediate operand specifying whether the characters are 
bytes or
    +///    words, the type of comparison to perform, and the format of the 
return
    +///    value. \n
    +///    Bits [1:0]: Determine source data format. \n
    +///      00: 16 unsigned bytes \n
    +///      01: 8 unsigned words \n
    +///      10: 16 signed bytes \n
    +///      11: 8 signed words \n
    +///    Bits [3:2]: Determine comparison type and aggregation method. \n
    +///      00: Subset: Each character in \a B is compared for equality with 
all
    +///          the characters in \a A. \n
    +///      01: Ranges: Each character in \a B is compared to \a A. The 
comparison
    +///          basis is greater than or equal for even-indexed elements in 
\a A,
    +///          and less than or equal for odd-indexed elements in \a A. \n
    +///      10: Match: Compare each pair of corresponding characters in \a A 
and
    +///          \a B for equality. \n
    +///      11: Substring: Search B for substring matches of \a A. \n
    +///    Bits [5:4]: Determine whether to perform a one's complement on the 
bit
    +///                mask of the comparison results. \n
    +///      00: No effect. \n
    +///      01: Negate the bit mask. \n
    +///      10: No effect. \n
    +///      11: Negate the bit mask only for bits with an index less than or 
equal
    +///          to the size of \a A or \a B. \n
    +///    Bit [6]: Determines whether the index of the lowest set bit or the
    +///             highest set bit is returned. \n
    +///      0: The index of the least significant set bit. \n
    +///      1: The index of the most significant set bit. \n
    +/// \returns Returns an integer representing the result index of the 
comparison.
    +#define _mm_cmpistri(A, B, M) \
    +  (int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
    +                                   (__v16qi)(__m128i)(B), (int)(M))
    +
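
    For the string-comparison intrinsics the immediate is normally built from
    the _SIDD_* macros above; a sketch (find_any is an illustrative helper,
    not part of the patch; requires SSE4.2 / -msse4.2, and both arguments must
    point to 16-byte buffers):

        #include <smmintrin.h>

        /* Example: index of the first byte of hay16 matching any byte of
           set16 (both NUL-terminated, 16-byte buffers); 16 when no match. */
        static int find_any(const char *set16, const char *hay16)
        {
            __m128i a = _mm_loadu_si128((const __m128i *)set16);
            __m128i b = _mm_loadu_si128((const __m128i *)hay16);

            return _mm_cmpistri(a, b, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
                                      _SIDD_LEAST_SIGNIFICANT);
        }
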
    +/// Uses the immediate operand \a M to perform a comparison of string
    +///    data with explicitly defined lengths that is contained in source 
operands
    +///    \a A and \a B. Returns a 128-bit integer vector representing the 
result
    +///    mask of the comparison.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int 
M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
    +/// instruction.
    +///
    +/// \param A
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param LA
    +///    An integer that specifies the length of the string in \a A.
    +/// \param B
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param LB
    +///    An integer that specifies the length of the string in \a B.
    +/// \param M
    +///    An 8-bit immediate operand specifying whether the characters are 
bytes or
    +///    words, the type of comparison to perform, and the format of the 
return
    +///    value. \n
    +///    Bits [1:0]: Determine source data format. \n
    +///      00: 16 unsigned bytes \n
    +///      01: 8 unsigned words \n
    +///      10: 16 signed bytes \n
    +///      11: 8 signed words \n
    +///    Bits [3:2]: Determine comparison type and aggregation method. \n
    +///      00: Subset: Each character in \a B is compared for equality with 
all
    +///          the characters in \a A. \n
    +///      01: Ranges: Each character in \a B is compared to \a A. The 
comparison
    +///          basis is greater than or equal for even-indexed elements in 
\a A,
    +///          and less than or equal for odd-indexed elements in \a A. \n
    +///      10: Match: Compare each pair of corresponding characters in \a A 
and
    +///          \a B for equality. \n
    +///      11: Substring: Search \a B for substring matches of \a A. \n
    +///    Bits [5:4]: Determine whether to perform a one's complement on the 
bit
    +///                mask of the comparison results. \n
    +///      00: No effect. \n
    +///      01: Negate the bit mask. \n
    +///      10: No effect. \n
    +///      11: Negate the bit mask only for bits with an index less than or 
equal
    +///          to the size of \a A or \a B. \n
    +///    Bit [6]: Determines whether the result is zero-extended or expanded 
to 16
    +///             bytes. \n
    +///      0: The result is zero-extended to 16 bytes. \n
    +///      1: The result is expanded to 16 bytes (this expansion is 
performed by
    +///         repeating each bit 8 or 16 times). \n
    +/// \returns Returns a 128-bit integer vector representing the result mask 
of
    +///    the comparison.
    +#define _mm_cmpestrm(A, LA, B, LB, M) \
    +  (__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
    +                                       (__v16qi)(__m128i)(B), (int)(LB), \
    +                                       (int)(M))
    +
    +/// Uses the immediate operand \a M to perform a comparison of string
    +///    data with explicitly defined lengths that is contained in source 
operands
    +///    \a A and \a B. Returns an integer representing the result index of 
the
    +///    comparison.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
    +/// instruction.
    +///
    +/// \param A
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param LA
    +///    An integer that specifies the length of the string in \a A.
    +/// \param B
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param LB
    +///    An integer that specifies the length of the string in \a B.
    +/// \param M
    +///    An 8-bit immediate operand specifying whether the characters are 
bytes or
    +///    words, the type of comparison to perform, and the format of the 
return
    +///    value. \n
    +///    Bits [1:0]: Determine source data format. \n
    +///      00: 16 unsigned bytes \n
    +///      01: 8 unsigned words \n
    +///      10: 16 signed bytes \n
    +///      11: 8 signed words \n
    +///    Bits [3:2]: Determine comparison type and aggregation method. \n
    +///      00: Subset: Each character in \a B is compared for equality with 
all
    +///          the characters in \a A. \n
    +///      01: Ranges: Each character in \a B is compared to \a A. The 
comparison
    +///          basis is greater than or equal for even-indexed elements in 
\a A,
    +///          and less than or equal for odd-indexed elements in \a A. \n
    +///      10: Match: Compare each pair of corresponding characters in \a A 
and
    +///          \a B for equality. \n
    +///      11: Substring: Search B for substring matches of \a A. \n
    +///    Bits [5:4]: Determine whether to perform a one's complement on the 
bit
    +///                mask of the comparison results. \n
    +///      00: No effect. \n
    +///      01: Negate the bit mask. \n
    +///      10: No effect. \n
    +///      11: Negate the bit mask only for bits with an index less than or 
equal
    +///          to the size of \a A or \a B. \n
    +///    Bit [6]: Determines whether the index of the lowest set bit or the
    +///             highest set bit is returned. \n
    +///      0: The index of the least significant set bit. \n
    +///      1: The index of the most significant set bit. \n
    +/// \returns Returns an integer representing the result index of the 
comparison.
    +#define _mm_cmpestri(A, LA, B, LB, M) \
    +  (int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
    +                                   (__v16qi)(__m128i)(B), (int)(LB), \
    +                                   (int)(M))
    +
    +/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
    +/// Uses the immediate operand \a M to perform a comparison of string
    +///    data with implicitly defined lengths that is contained in source 
operands
    +///    \a A and \a B. Returns 1 if the bit mask is zero and the length of 
the
    +///    string in \a B is the maximum, otherwise, returns 0.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
    +/// instruction.
    +///
    +/// \param A
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param B
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param M
    +///    An 8-bit immediate operand specifying whether the characters are 
bytes or
    +///    words and the type of comparison to perform. \n
    +///    Bits [1:0]: Determine source data format. \n
    +///      00: 16 unsigned bytes \n
    +///      01: 8 unsigned words \n
    +///      10: 16 signed bytes \n
    +///      11: 8 signed words \n
    +///    Bits [3:2]: Determine comparison type and aggregation method. \n
    +///      00: Subset: Each character in \a B is compared for equality with 
all
    +///          the characters in \a A. \n
    +///      01: Ranges: Each character in \a B is compared to \a A. The 
comparison
    +///          basis is greater than or equal for even-indexed elements in 
\a A,
    +///          and less than or equal for odd-indexed elements in \a A. \n
    +///      10: Match: Compare each pair of corresponding characters in \a A 
and
    +///          \a B for equality. \n
    +///      11: Substring: Search \a B for substring matches of \a A. \n
    +///    Bits [5:4]: Determine whether to perform a one's complement on the 
bit
    +///                mask of the comparison results. \n
    +///      00: No effect. \n
    +///      01: Negate the bit mask. \n
    +///      10: No effect. \n
    +///      11: Negate the bit mask only for bits with an index less than or 
equal
    +///          to the size of \a A or \a B. \n
    +/// \returns Returns 1 if the bit mask is zero and the length of the 
string in
    +///    \a B is the maximum; otherwise, returns 0.
    +#define _mm_cmpistra(A, B, M) \
    +  (int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
    +                                    (__v16qi)(__m128i)(B), (int)(M))
    +
    +/// Uses the immediate operand \a M to perform a comparison of string
    +///    data with implicitly defined lengths that is contained in source 
operands
    +///    \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, 
returns
    +///    0.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
    +/// instruction.
    +///
    +/// \param A
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param B
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param M
    +///    An 8-bit immediate operand specifying whether the characters are 
bytes or
    +///    words and the type of comparison to perform. \n
    +///    Bits [1:0]: Determine source data format. \n
    +///      00: 16 unsigned bytes \n
    +///      01: 8 unsigned words \n
    +///      10: 16 signed bytes \n
    +///      11: 8 signed words \n
    +///    Bits [3:2]: Determine comparison type and aggregation method. \n
    +///      00: Subset: Each character in \a B is compared for equality with 
all
    +///          the characters in \a A. \n
    +///      01: Ranges: Each character in \a B is compared to \a A. The 
comparison
    +///          basis is greater than or equal for even-indexed elements in 
\a A,
    +///          and less than or equal for odd-indexed elements in \a A. \n
    +///      10: Match: Compare each pair of corresponding characters in \a A 
and
    +///          \a B for equality. \n
    +///      11: Substring: Search \a B for substring matches of \a A. \n
    +///    Bits [5:4]: Determine whether to perform a one's complement on the 
bit
    +///                mask of the comparison results. \n
    +///      00: No effect. \n
    +///      01: Negate the bit mask. \n
    +///      10: No effect. \n
    +///      11: Negate the bit mask only for bits with an index less than or 
equal
    +///          to the size of \a A or \a B.
    +/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
    +#define _mm_cmpistrc(A, B, M) \
    +  (int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
    +                                    (__v16qi)(__m128i)(B), (int)(M))
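
    A sketch of the flag-reading form, again illustrative only: with the "ranges"
    aggregation, each byte of the implicit-length string in B is tested against the
    bound pairs held in A. The helper name is ours and `s` is assumed to have 16
    readable bytes.

        /* Non-zero if the first 16 bytes of the NUL-terminated string `s`
         * contain a decimal digit. */
        static int has_digit16(const char *s)
        {
            const __m128i range = _mm_setr_epi8('0', '9', 0, 0, 0, 0, 0, 0,
                                                0, 0, 0, 0, 0, 0, 0, 0);
            const __m128i data  = _mm_loadu_si128((const __m128i *)s);

            /* M = 0x04: unsigned bytes, "ranges" aggregation (even index =
             * lower bound, odd index = upper bound), no negation. */
            return _mm_cmpistrc(range, data, 0x04);
        }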
    +
    +/// Uses the immediate operand \a M to perform a comparison of string
    +///    data with implicitly defined lengths that is contained in source 
operands
    +///    \a A and \a B. Returns bit 0 of the resulting bit mask.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
    +/// instruction.
    +///
    +/// \param A
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param B
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param M
    +///    An 8-bit immediate operand specifying whether the characters are 
bytes or
    +///    words and the type of comparison to perform. \n
    +///    Bits [1:0]: Determine source data format. \n
    +///      00: 16 unsigned bytes \n
    +///      01: 8 unsigned words \n
    +///      10: 16 signed bytes \n
    +///      11: 8 signed words \n
    +///    Bits [3:2]: Determine comparison type and aggregation method. \n
    +///      00: Subset: Each character in \a B is compared for equality with 
all
    +///          the characters in \a A. \n
    +///      01: Ranges: Each character in \a B is compared to \a A. The 
comparison
    +///          basis is greater than or equal for even-indexed elements in 
\a A,
    +///          and less than or equal for odd-indexed elements in \a A. \n
    +///      10: Match: Compare each pair of corresponding characters in \a A 
and
    +///          \a B for equality. \n
    +///      11: Substring: Search \a B for substring matches of \a A. \n
    +///    Bits [5:4]: Determine whether to perform a one's complement on the 
bit
    +///                mask of the comparison results. \n
    +///      00: No effect. \n
    +///      01: Negate the bit mask. \n
    +///      10: No effect. \n
    +///      11: Negate the bit mask only for bits with an index less than or 
equal
    +///          to the size of \a A or \a B. \n
    +/// \returns Returns bit 0 of the resulting bit mask.
    +#define _mm_cmpistro(A, B, M) \
    +  (int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
    +                                    (__v16qi)(__m128i)(B), (int)(M))
    +
    +/// Uses the immediate operand \a M to perform a comparison of string
    +///    data with implicitly defined lengths that is contained in source 
operands
    +///    \a A and \a B. Returns 1 if the length of the string in \a A is 
less than
    +///    the maximum, otherwise, returns 0.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
    +/// instruction.
    +///
    +/// \param A
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param B
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param M
    +///    An 8-bit immediate operand specifying whether the characters are 
bytes or
    +///    words and the type of comparison to perform. \n
    +///    Bits [1:0]: Determine source data format. \n
    +///      00: 16 unsigned bytes \n
    +///      01: 8 unsigned words \n
    +///      10: 16 signed bytes \n
    +///      11: 8 signed words \n
    +///    Bits [3:2]: Determine comparison type and aggregation method. \n
    +///      00: Subset: Each character in \a B is compared for equality with 
all
    +///          the characters in \a A. \n
    +///      01: Ranges: Each character in \a B is compared to \a A. The 
comparison
    +///          basis is greater than or equal for even-indexed elements in 
\a A,
    +///          and less than or equal for odd-indexed elements in \a A. \n
    +///      10: Match: Compare each pair of corresponding characters in \a A 
and
    +///          \a B for equality. \n
    +///      11: Substring: Search \a B for substring matches of \a A. \n
    +///    Bits [5:4]: Determine whether to perform a one's complement on the 
bit
    +///                mask of the comparison results. \n
    +///      00: No effect. \n
    +///      01: Negate the bit mask. \n
    +///      10: No effect. \n
    +///      11: Negate the bit mask only for bits with an index less than or 
equal
    +///          to the size of \a A or \a B. \n
    +/// \returns Returns 1 if the length of the string in \a A is less than the
    +///    maximum, otherwise, returns 0.
    +#define _mm_cmpistrs(A, B, M) \
    +  (int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
    +                                    (__v16qi)(__m128i)(B), (int)(M))
    +
    +/// Uses the immediate operand \a M to perform a comparison of string
    +///    data with implicitly defined lengths that is contained in source 
operands
    +///    \a A and \a B. Returns 1 if the length of the string in \a B is 
less than
    +///    the maximum, otherwise, returns 0.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
    +/// instruction.
    +///
    +/// \param A
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param B
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param M
    +///    An 8-bit immediate operand specifying whether the characters are 
bytes or
    +///    words and the type of comparison to perform. \n
    +///    Bits [1:0]: Determine source data format. \n
    +///      00: 16 unsigned bytes \n
    +///      01: 8 unsigned words \n
    +///      10: 16 signed bytes \n
    +///      11: 8 signed words \n
    +///    Bits [3:2]: Determine comparison type and aggregation method. \n
    +///      00: Subset: Each character in \a B is compared for equality with 
all
    +///          the characters in \a A. \n
    +///      01: Ranges: Each character in \a B is compared to \a A. The 
comparison
    +///          basis is greater than or equal for even-indexed elements in 
\a A,
    +///          and less than or equal for odd-indexed elements in \a A. \n
    +///      10: Match: Compare each pair of corresponding characters in \a A 
and
    +///          \a B for equality. \n
    +///      11: Substring: Search \a B for substring matches of \a A. \n
    +///    Bits [5:4]: Determine whether to perform a one's complement on the 
bit
    +///                mask of the comparison results. \n
    +///      00: No effect. \n
    +///      01: Negate the bit mask. \n
    +///      10: No effect. \n
    +///      11: Negate the bit mask only for bits with an index less than or 
equal
    +///          to the size of \a A or \a B.
    +/// \returns Returns 1 if the length of the string in \a B is less than the
    +///    maximum, otherwise, returns 0.
    +#define _mm_cmpistrz(A, B, M) \
    +  (int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
    +                                    (__v16qi)(__m128i)(B), (int)(M))
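
    The Z-flag form is what terminates chunked scans over implicit-length strings.
    A hedged sketch of the well-known PCMPISTRI strlen idiom (helper name ours;
    relies on _mm_cmpistri defined earlier in this header, and assumes every
    16-byte load stays within readable memory, which real implementations ensure
    by aligning the pointer):

        static size_t strlen16(const char *s)
        {
            const __m128i zero = _mm_setzero_si128();
            size_t off = 0;

            for (;;) {
                __m128i chunk = _mm_loadu_si128((const __m128i *)(s + off));

                /* M = 0x08: unsigned bytes, "equal each". With a zero-length
                 * first operand the returned index is the position of the
                 * first NUL in `chunk`, or 16 if there is none. */
                int idx = _mm_cmpistri(zero, chunk, 0x08);

                if (_mm_cmpistrz(zero, chunk, 0x08)) /* chunk held the NUL */
                    return off + idx;
                off += 16;
            }
        }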
    +
    +/// Uses the immediate operand \a M to perform a comparison of string
    +///    data with explicitly defined lengths that is contained in source 
operands
    +///    \a A and \a B. Returns 1 if the bit mask is zero and the length of 
the
    +///    string in \a B is the maximum, otherwise, returns 0.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
    +/// instruction.
    +///
    +/// \param A
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param LA
    +///    An integer that specifies the length of the string in \a A.
    +/// \param B
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param LB
    +///    An integer that specifies the length of the string in \a B.
    +/// \param M
    +///    An 8-bit immediate operand specifying whether the characters are 
bytes or
    +///    words and the type of comparison to perform. \n
    +///    Bits [1:0]: Determine source data format. \n
    +///      00: 16 unsigned bytes \n
    +///      01: 8 unsigned words \n
    +///      10: 16 signed bytes \n
    +///      11: 8 signed words \n
    +///    Bits [3:2]: Determine comparison type and aggregation method. \n
    +///      00: Subset: Each character in \a B is compared for equality with 
all
    +///          the characters in \a A. \n
    +///      01: Ranges: Each character in \a B is compared to \a A. The 
comparison
    +///          basis is greater than or equal for even-indexed elements in 
\a A,
    +///          and less than or equal for odd-indexed elements in \a A. \n
    +///      10: Match: Compare each pair of corresponding characters in \a A 
and
    +///          \a B for equality. \n
    +///      11: Substring: Search \a B for substring matches of \a A. \n
    +///    Bits [5:4]: Determine whether to perform a one's complement on the 
bit
    +///                mask of the comparison results. \n
    +///      00: No effect. \n
    +///      01: Negate the bit mask. \n
    +///      10: No effect. \n
    +///      11: Negate the bit mask only for bits with an index less than or 
equal
    +///          to the size of \a A or \a B.
    +/// \returns Returns 1 if the bit mask is zero and the length of the 
string in
    +///    \a B is the maximum, otherwise, returns 0.
    +#define _mm_cmpestra(A, LA, B, LB, M) \
    +  (int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
    +                                    (__v16qi)(__m128i)(B), (int)(LB), \
    +                                    (int)(M))
    +
    +/// Uses the immediate operand \a M to perform a comparison of string
    +///    data with explicitly defined lengths that is contained in source 
operands
    +///    \a A and \a B. Returns 1 if the resulting mask is non-zero, 
otherwise,
    +///    returns 0.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
    +/// instruction.
    +///
    +/// \param A
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param LA
    +///    An integer that specifies the length of the string in \a A.
    +/// \param B
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param LB
    +///    An integer that specifies the length of the string in \a B.
    +/// \param M
    +///    An 8-bit immediate operand specifying whether the characters are 
bytes or
    +///    words and the type of comparison to perform. \n
    +///    Bits [1:0]: Determine source data format. \n
    +///      00: 16 unsigned bytes \n
    +///      01: 8 unsigned words \n
    +///      10: 16 signed bytes \n
    +///      11: 8 signed words \n
    +///    Bits [3:2]: Determine comparison type and aggregation method. \n
    +///      00: Subset: Each character in \a B is compared for equality with 
all
    +///          the characters in \a A. \n
    +///      01: Ranges: Each character in \a B is compared to \a A. The 
comparison
    +///          basis is greater than or equal for even-indexed elements in 
\a A,
    +///          and less than or equal for odd-indexed elements in \a A. \n
    +///      10: Match: Compare each pair of corresponding characters in \a A 
and
    +///          \a B for equality. \n
    +///      11: Substring: Search \a B for substring matches of \a A. \n
    +///    Bits [5:4]: Determine whether to perform a one's complement on the 
bit
    +///                mask of the comparison results. \n
    +///      00: No effect. \n
    +///      01: Negate the bit mask. \n
    +///      10: No effect. \n
    +///      11: Negate the bit mask only for bits with an index less than or 
equal
    +///          to the size of \a A or \a B. \n
    +/// \returns Returns 1 if the resulting mask is non-zero, otherwise, 
returns 0.
    +#define _mm_cmpestrc(A, LA, B, LB, M) \
    +  (int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
    +                                    (__v16qi)(__m128i)(B), (int)(LB), \
    +                                    (int)(M))
    +
    +/// Uses the immediate operand \a M to perform a comparison of string
    +///    data with explicitly defined lengths that is contained in source 
operands
    +///    \a A and \a B. Returns bit 0 of the resulting bit mask.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
    +/// instruction.
    +///
    +/// \param A
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param LA
    +///    An integer that specifies the length of the string in \a A.
    +/// \param B
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param LB
    +///    An integer that specifies the length of the string in \a B.
    +/// \param M
    +///    An 8-bit immediate operand specifying whether the characters are 
bytes or
    +///    words and the type of comparison to perform. \n
    +///    Bits [1:0]: Determine source data format. \n
    +///      00: 16 unsigned bytes \n
    +///      01: 8 unsigned words \n
    +///      10: 16 signed bytes \n
    +///      11: 8 signed words \n
    +///    Bits [3:2]: Determine comparison type and aggregation method. \n
    +///      00: Subset: Each character in \a B is compared for equality with 
all
    +///          the characters in \a A. \n
    +///      01: Ranges: Each character in \a B is compared to \a A. The 
comparison
    +///          basis is greater than or equal for even-indexed elements in 
\a A,
    +///          and less than or equal for odd-indexed elements in \a A. \n
    +///      10: Match: Compare each pair of corresponding characters in \a A 
and
    +///          \a B for equality. \n
    +///      11: Substring: Search \a B for substring matches of \a A. \n
    +///    Bits [5:4]: Determine whether to perform a one's complement on the 
bit
    +///                mask of the comparison results. \n
    +///      00: No effect. \n
    +///      01: Negate the bit mask. \n
    +///      10: No effect. \n
    +///      11: Negate the bit mask only for bits with an index less than or 
equal
    +///          to the size of \a A or \a B.
    +/// \returns Returns bit 0 of the resulting bit mask.
    +#define _mm_cmpestro(A, LA, B, LB, M) \
    +  (int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
    +                                    (__v16qi)(__m128i)(B), (int)(LB), \
    +                                    (int)(M))
    +
    +/// Uses the immediate operand \a M to perform a comparison of string
    +///    data with explicitly defined lengths that is contained in source 
operands
    +///    \a A and \a B. Returns 1 if the length of the string in \a A is 
less than
    +///    the maximum, otherwise, returns 0.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
    +/// instruction.
    +///
    +/// \param A
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param LA
    +///    An integer that specifies the length of the string in \a A.
    +/// \param B
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param LB
    +///    An integer that specifies the length of the string in \a B.
    +/// \param M
    +///    An 8-bit immediate operand specifying whether the characters are 
bytes or
    +///    words and the type of comparison to perform. \n
    +///    Bits [1:0]: Determine source data format. \n
    +///      00: 16 unsigned bytes \n
    +///      01: 8 unsigned words \n
    +///      10: 16 signed bytes \n
    +///      11: 8 signed words \n
    +///    Bits [3:2]: Determine comparison type and aggregation method. \n
    +///      00: Subset: Each character in \a B is compared for equality with 
all
    +///          the characters in \a A. \n
    +///      01: Ranges: Each character in \a B is compared to \a A. The 
comparison
    +///          basis is greater than or equal for even-indexed elements in 
\a A,
    +///          and less than or equal for odd-indexed elements in \a A. \n
    +///      10: Match: Compare each pair of corresponding characters in \a A 
and
    +///          \a B for equality. \n
    +///      11: Substring: Search \a B for substring matches of \a A. \n
    +///    Bits [5:4]: Determine whether to perform a one's complement on the bit
    +///                mask of the comparison results. \n
    +///      00: No effect. \n
    +///      01: Negate the bit mask. \n
    +///      10: No effect. \n
    +///      11: Negate the bit mask only for bits with an index less than or 
equal
    +///          to the size of \a A or \a B. \n
    +/// \returns Returns 1 if the length of the string in \a A is less than the
    +///    maximum, otherwise, returns 0.
    +#define _mm_cmpestrs(A, LA, B, LB, M) \
    +  (int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
    +                                    (__v16qi)(__m128i)(B), (int)(LB), \
    +                                    (int)(M))
    +
    +/// Uses the immediate operand \a M to perform a comparison of string
    +///    data with explicitly defined lengths that is contained in source 
operands
    +///    \a A and \a B. Returns 1 if the length of the string in \a B is 
less than
    +///    the maximum, otherwise, returns 0.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPCMPESTRI </c> instruction.
    +///
    +/// \param A
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param LA
    +///    An integer that specifies the length of the string in \a A.
    +/// \param B
    +///    A 128-bit integer vector containing one of the source operands to be
    +///    compared.
    +/// \param LB
    +///    An integer that specifies the length of the string in \a B.
    +/// \param M
    +///    An 8-bit immediate operand specifying whether the characters are 
bytes or
    +///    words and the type of comparison to perform. \n
    +///    Bits [1:0]: Determine source data format. \n
    +///      00: 16 unsigned bytes  \n
    +///      01: 8 unsigned words \n
    +///      10: 16 signed bytes \n
    +///      11: 8 signed words \n
    +///    Bits [3:2]: Determine comparison type and aggregation method. \n
    +///      00: Subset: Each character in \a B is compared for equality with 
all
    +///          the characters in \a A. \n
    +///      01: Ranges: Each character in \a B is compared to \a A. The 
comparison
    +///          basis is greater than or equal for even-indexed elements in 
\a A,
    +///          and less than or equal for odd-indexed elements in \a A. \n
    +///      10: Match: Compare each pair of corresponding characters in \a A 
and
    +///          \a B for equality. \n
    +///      11: Substring: Search \a B for substring matches of \a A. \n
    +///    Bits [5:4]: Determine whether to perform a one's complement on the 
bit
    +///                mask of the comparison results. \n
    +///      00: No effect. \n
    +///      01: Negate the bit mask. \n
    +///      10: No effect. \n
    +///      11: Negate the bit mask only for bits with an index less than or 
equal
    +///          to the size of \a A or \a B.
    +/// \returns Returns 1 if the length of the string in \a B is less than the
    +///    maximum, otherwise, returns 0.
    +#define _mm_cmpestrz(A, LA, B, LB, M) \
    +  (int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
    +                                    (__v16qi)(__m128i)(B), (int)(LB), \
    +                                    (int)(M))
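
    The explicit-length forms suit buffers that are counted rather than
    NUL-terminated. A sketch (helper name ours; assumes the buffer may be read in
    whole 16-byte chunks):

        /* Offset of the first '\r' or '\n' in `buf`, or `len` if neither occurs. */
        static size_t find_crlf(const char *buf, size_t len)
        {
            const __m128i set = _mm_setr_epi8('\r', '\n', 0, 0, 0, 0, 0, 0,
                                              0, 0, 0, 0, 0, 0, 0, 0);
            size_t off = 0;

            while (off < len) {
                int rem = (len - off > 16) ? 16 : (int)(len - off);
                __m128i data = _mm_loadu_si128((const __m128i *)(buf + off));

                /* M = 0x00: unsigned bytes, "equal any", least significant index. */
                if (_mm_cmpestrc(set, 2, data, rem, 0x00))
                    return off + _mm_cmpestri(set, 2, data, rem, 0x00);
                off += 16;
            }
            return len;
        }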
    +
    +/* SSE4.2 Compare Packed Data -- Greater Than.  */
    +/// Compares each of the corresponding 64-bit values of the 128-bit
    +///    integer vectors to determine if the values in the first operand are
    +///    greater than those in the second operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> 
instruction.
    +///
    +/// \param __V1
    +///    A 128-bit integer vector.
    +/// \param __V2
    +///    A 128-bit integer vector.
    +/// \returns A 128-bit integer vector containing the comparison results.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
    +{
    +  return (__m128i)((__v2di)__V1 > (__v2di)__V2);
    +}
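
    A short illustrative use (names ours): each 64-bit lane of the result is all
    ones where the comparison holds and all zeros otherwise, so it can serve
    directly as a mask (e.g. for _mm_blendv_epi8).

        static __m128i cmpgt64_example(void)
        {
            __m128i a = _mm_set_epi64x(10, -5);  /* high lane 10, low lane -5 */
            __m128i b = _mm_set_epi64x(3, 7);    /* high lane 3,  low lane 7  */
            return _mm_cmpgt_epi64(a, b);        /* high: all ones (10>3), low: 0 (-5>7) */
        }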
    +
    +/* SSE4.2 Accumulate CRC32.  */
    +/// Adds the unsigned integer operand to the CRC-32C checksum of the
    +///    unsigned char operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> CRC32B </c> instruction.
    +///
    +/// \param __C
    +///    An unsigned integer operand to add to the CRC-32C checksum of 
operand
    +///    \a  __D.
    +/// \param __D
    +///    An unsigned 8-bit integer operand used to compute the CRC-32C 
checksum.
    +/// \returns The result of adding operand \a __C to the CRC-32C checksum of
    +///    operand \a __D.
    +static __inline__ unsigned int __DEFAULT_FN_ATTRS
    +_mm_crc32_u8(unsigned int __C, unsigned char __D)
    +{
    +  return __builtin_ia32_crc32qi(__C, __D);
    +}
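
    A hedged sketch of the usual way this accumulates a CRC-32C over a buffer
    (helper name ours; callers conventionally seed with all ones and invert the
    final value):

        static unsigned int crc32c(const void *data, unsigned long n)
        {
            const unsigned char *p = (const unsigned char *)data;
            unsigned int crc = 0xFFFFFFFFu;

            while (n--)
                crc = _mm_crc32_u8(crc, *p++);
            return ~crc;
        }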
    +
    +/// Adds the unsigned integer operand to the CRC-32C checksum of the
    +///    unsigned short operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> CRC32W </c> instruction.
    +///
    +/// \param __C
    +///    An unsigned integer operand to add to the CRC-32C checksum of 
operand
    +///    \a __D.
    +/// \param __D
    +///    An unsigned 16-bit integer operand used to compute the CRC-32C 
checksum.
    +/// \returns The result of adding operand \a __C to the CRC-32C checksum of
    +///    operand \a __D.
    +static __inline__ unsigned int __DEFAULT_FN_ATTRS
    +_mm_crc32_u16(unsigned int __C, unsigned short __D)
    +{
    +  return __builtin_ia32_crc32hi(__C, __D);
    +}
    +
    +/// Adds the first unsigned integer operand to the CRC-32C checksum of
    +///    the second unsigned integer operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> CRC32L </c> instruction.
    +///
    +/// \param __C
    +///    An unsigned integer operand to add to the CRC-32C checksum of 
operand
    +///    \a __D.
    +/// \param __D
    +///    An unsigned 32-bit integer operand used to compute the CRC-32C 
checksum.
    +/// \returns The result of adding operand \a __C to the CRC-32C checksum of
    +///    operand \a __D.
    +static __inline__ unsigned int __DEFAULT_FN_ATTRS
    +_mm_crc32_u32(unsigned int __C, unsigned int __D)
    +{
    +  return __builtin_ia32_crc32si(__C, __D);
    +}
    +
    +#ifdef __x86_64__
    +/// Adds the unsigned integer operand to the CRC-32C checksum of the
    +///    unsigned 64-bit integer operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> CRC32Q </c> instruction.
    +///
    +/// \param __C
    +///    An unsigned integer operand to add to the CRC-32C checksum of 
operand
    +///    \a __D.
    +/// \param __D
    +///    An unsigned 64-bit integer operand used to compute the CRC-32C 
checksum.
    +/// \returns The result of adding operand \a __C to the CRC-32C checksum of
    +///    operand \a __D.
    +static __inline__ unsigned long long __DEFAULT_FN_ATTRS
    +_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
    +{
    +  return __builtin_ia32_crc32di(__C, __D);
    +}
    +#endif /* __x86_64__ */
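
    On 64-bit builds the same checksum is typically driven 8 bytes at a time,
    falling back to the byte form for the tail (sketch, names ours):

        #ifdef __x86_64__
        static unsigned int crc32c_wide(const void *data, unsigned long n)
        {
            const unsigned char *p = (const unsigned char *)data;
            unsigned long long crc = 0xFFFFFFFFu;

            for (; n >= 8; n -= 8, p += 8) {
                unsigned long long v;
                __builtin_memcpy(&v, p, 8);      /* unaligned-safe load */
                crc = _mm_crc32_u64(crc, v);
            }
            while (n--)
                crc = _mm_crc32_u8((unsigned int)crc, *p++);
            return ~(unsigned int)crc;
        }
        #endif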
    +
    +#undef __DEFAULT_FN_ATTRS
    +
    +#include <popcntintrin.h>
    +
    +#endif /* __SMMINTRIN_H */
    diff --git a/include/tmmintrin.h b/include/tmmintrin.h
    new file mode 100644
    index 0000000..7a94096
    --- /dev/null
    +++ b/include/tmmintrin.h
    @@ -0,0 +1,790 @@
    +/*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
    + *
    + * Permission is hereby granted, free of charge, to any person obtaining a copy
    + * of this software and associated documentation files (the "Software"), to deal
    + * in the Software without restriction, including without limitation the rights
    + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    + * copies of the Software, and to permit persons to whom the Software is
    + * furnished to do so, subject to the following conditions:
    + *
    + * The above copyright notice and this permission notice shall be included in
    + * all copies or substantial portions of the Software.
    + *
    + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    + * THE SOFTWARE.
    + *
    + *===-----------------------------------------------------------------------===
    + */
    +
    +#ifndef __TMMINTRIN_H
    +#define __TMMINTRIN_H
    +
    +#include <pmmintrin.h>
    +
    +/* Define the default attributes for the functions in this file. */
    +#ifdef __GNUC__
    +#define __DEFAULT_FN_ATTRS __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    +#define __DEFAULT_FN_ATTRS_MMX __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    +#else
    +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3"), __min_vector_width__(64)))
    +#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,ssse3"), __min_vector_width__(64)))
    +#endif
    +
    +/// Computes the absolute value of each of the packed 8-bit signed
    +///    integers in the source operand and stores the 8-bit unsigned integer
    +///    results in the destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c PABSB instruction.
    +///
    +/// \param __a
    +///    A 64-bit vector of [8 x i8].
    +/// \returns A 64-bit integer vector containing the absolute values of the
    +///    elements in the operand.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_abs_pi8(__m64 __a)
    +{
    +    return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
    +}
    +
    +/// Computes the absolute value of each of the packed 8-bit signed
    +///    integers in the source operand and stores the 8-bit unsigned integer
    +///    results in the destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c VPABSB instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [16 x i8].
    +/// \returns A 128-bit integer vector containing the absolute values of the
    +///    elements in the operand.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_abs_epi8(__m128i __a)
    +{
    +    return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
    +}
    +
    +/// Computes the absolute value of each of the packed 16-bit signed
    +///    integers in the source operand and stores the 16-bit unsigned 
integer
    +///    results in the destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c PABSW instruction.
    +///
    +/// \param __a
    +///    A 64-bit vector of [4 x i16].
    +/// \returns A 64-bit integer vector containing the absolute values of the
    +///    elements in the operand.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_abs_pi16(__m64 __a)
    +{
    +    return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
    +}
    +
    +/// Computes the absolute value of each of the packed 16-bit signed
    +///    integers in the source operand and stores the 16-bit unsigned 
integer
    +///    results in the destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c VPABSW instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [8 x i16].
    +/// \returns A 128-bit integer vector containing the absolute values of the
    +///    elements in the operand.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_abs_epi16(__m128i __a)
    +{
    +    return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
    +}
    +
    +/// Computes the absolute value of each of the packed 32-bit signed
    +///    integers in the source operand and stores the 32-bit unsigned 
integer
    +///    results in the destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c PABSD instruction.
    +///
    +/// \param __a
    +///    A 64-bit vector of [2 x i32].
    +/// \returns A 64-bit integer vector containing the absolute values of the
    +///    elements in the operand.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_abs_pi32(__m64 __a)
    +{
    +    return (__m64)__builtin_ia32_pabsd((__v2si)__a);
    +}
    +
    +/// Computes the absolute value of each of the packed 32-bit signed
    +///    integers in the source operand and stores the 32-bit unsigned 
integer
    +///    results in the destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c VPABSD instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x i32].
    +/// \returns A 128-bit integer vector containing the absolute values of the
    +///    elements in the operand.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_abs_epi32(__m128i __a)
    +{
    +    return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
    +}
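
    Usage is direct (sketch, name ours); note that the most negative value in a
    lane (e.g. INT32_MIN) has no positive counterpart and is returned unchanged.

        static __m128i abs_example(void)
        {
            __m128i v = _mm_setr_epi32(-1, 2, -3, 4);
            return _mm_abs_epi32(v);             /* lanes become {1, 2, 3, 4} */
        }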
    +
    +/// Concatenates the two 128-bit integer vector operands, and
    +///    right-shifts the result by the number of bytes specified in the 
immediate
    +///    operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the \c PALIGNR instruction.
    +///
    +/// \param a
    +///    A 128-bit vector of [16 x i8] containing one of the source operands.
    +/// \param b
    +///    A 128-bit vector of [16 x i8] containing one of the source operands.
    +/// \param n
    +///    An immediate operand specifying how many bytes to right-shift the 
result.
    +/// \returns A 128-bit integer vector containing the concatenated 
right-shifted
    +///    value.
    +#define _mm_alignr_epi8(a, b, n) \
    +  (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
    +                                     (__v16qi)(__m128i)(b), (n))
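
    A sketch of the common "sliding window" use (name ours): byte i of the result
    is byte i+4 of the 32-byte concatenation hi:lo, so consecutive calls can walk
    an unaligned stream using only aligned loads.

        static __m128i window_at_4(__m128i hi, __m128i lo)
        {
            return _mm_alignr_epi8(hi, lo, 4);
        }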
    +
    +/// Concatenates the two 64-bit integer vector operands, and right-shifts
    +///    the result by the number of bytes specified in the immediate 
operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the \c PALIGNR instruction.
    +///
    +/// \param a
    +///    A 64-bit vector of [8 x i8] containing one of the source operands.
    +/// \param b
    +///    A 64-bit vector of [8 x i8] containing one of the source operands.
    +/// \param n
    +///    An immediate operand specifying how many bytes to right-shift the 
result.
    +/// \returns A 64-bit integer vector containing the concatenated 
right-shifted
    +///    value.
    +#define _mm_alignr_pi8(a, b, n) \
    +  (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n))
    +
    +/// Horizontally adds the adjacent pairs of values contained in 2 packed
    +///    128-bit vectors of [8 x i16].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c VPHADDW instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [8 x i16] containing one of the source 
operands. The
    +///    horizontal sums of the values are stored in the lower bits of the
    +///    destination.
    +/// \param __b
    +///    A 128-bit vector of [8 x i16] containing one of the source 
operands. The
    +///    horizontal sums of the values are stored in the upper bits of the
    +///    destination.
    +/// \returns A 128-bit vector of [8 x i16] containing the horizontal sums 
of
    +///    both operands.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_hadd_epi16(__m128i __a, __m128i __b)
    +{
    +    return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
    +}
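
    Beyond pairwise adds, this is often chained to reduce a whole register to a
    single value (sketch, name ours; the sums wrap on overflow):

        static short hsum_epi16(__m128i v)
        {
            v = _mm_hadd_epi16(v, v);   /* 8 -> 4 meaningful lanes */
            v = _mm_hadd_epi16(v, v);   /* 4 -> 2 */
            v = _mm_hadd_epi16(v, v);   /* 2 -> 1 */
            return (short)_mm_extract_epi16(v, 0);
        }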
    +
    +/// Horizontally adds the adjacent pairs of values contained in 2 packed
    +///    128-bit vectors of [4 x i32].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c VPHADDD instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x i32] containing one of the source 
operands. The
    +///    horizontal sums of the values are stored in the lower bits of the
    +///    destination.
    +/// \param __b
    +///    A 128-bit vector of [4 x i32] containing one of the source 
operands. The
    +///    horizontal sums of the values are stored in the upper bits of the
    +///    destination.
    +/// \returns A 128-bit vector of [4 x i32] containing the horizontal sums 
of
    +///    both operands.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_hadd_epi32(__m128i __a, __m128i __b)
    +{
    +    return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
    +}
    +
    +/// Horizontally adds the adjacent pairs of values contained in 2 packed
    +///    64-bit vectors of [4 x i16].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c PHADDW instruction.
    +///
    +/// \param __a
    +///    A 64-bit vector of [4 x i16] containing one of the source operands. 
The
    +///    horizontal sums of the values are stored in the lower bits of the
    +///    destination.
    +/// \param __b
    +///    A 64-bit vector of [4 x i16] containing one of the source operands. 
The
    +///    horizontal sums of the values are stored in the upper bits of the
    +///    destination.
    +/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums 
of both
    +///    operands.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_hadd_pi16(__m64 __a, __m64 __b)
    +{
    +    return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
    +}
    +
    +/// Horizontally adds the adjacent pairs of values contained in 2 packed
    +///    64-bit vectors of [2 x i32].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c PHADDD instruction.
    +///
    +/// \param __a
    +///    A 64-bit vector of [2 x i32] containing one of the source operands. 
The
    +///    horizontal sums of the values are stored in the lower bits of the
    +///    destination.
    +/// \param __b
    +///    A 64-bit vector of [2 x i32] containing one of the source operands. 
The
    +///    horizontal sums of the values are stored in the upper bits of the
    +///    destination.
    +/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums 
of both
    +///    operands.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_hadd_pi32(__m64 __a, __m64 __b)
    +{
    +    return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
    +}
    +
    +/// Horizontally adds the adjacent pairs of values contained in 2 packed
    +///    128-bit vectors of [8 x i16]. Positive sums greater than 0x7FFF are
    +///    saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
    +///    0x8000.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c VPHADDSW instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [8 x i16] containing one of the source 
operands. The
    +///    horizontal sums of the values are stored in the lower bits of the
    +///    destination.
    +/// \param __b
    +///    A 128-bit vector of [8 x i16] containing one of the source 
operands. The
    +///    horizontal sums of the values are stored in the upper bits of the
    +///    destination.
    +/// \returns A 128-bit vector of [8 x i16] containing the horizontal 
saturated
    +///    sums of both operands.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_hadds_epi16(__m128i __a, __m128i __b)
    +{
    +    return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
    +}
    +
    +/// Horizontally adds the adjacent pairs of values contained in 2 packed
    +///    64-bit vectors of [4 x i16]. Positive sums greater than 0x7FFF are
    +///    saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
    +///    0x8000.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c PHADDSW instruction.
    +///
    +/// \param __a
    +///    A 64-bit vector of [4 x i16] containing one of the source operands. 
The
    +///    horizontal sums of the values are stored in the lower bits of the
    +///    destination.
    +/// \param __b
    +///    A 64-bit vector of [4 x i16] containing one of the source operands. 
The
    +///    horizontal sums of the values are stored in the upper bits of the
    +///    destination.
    +/// \returns A 64-bit vector of [4 x i16] containing the horizontal 
saturated
    +///    sums of both operands.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_hadds_pi16(__m64 __a, __m64 __b)
    +{
    +    return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
    +}
    +
    +/// Horizontally subtracts the adjacent pairs of values contained in 2
    +///    packed 128-bit vectors of [8 x i16].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c VPHSUBW instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [8 x i16] containing one of the source 
operands. The
    +///    horizontal differences between the values are stored in the lower 
bits of
    +///    the destination.
    +/// \param __b
    +///    A 128-bit vector of [8 x i16] containing one of the source 
operands. The
    +///    horizontal differences between the values are stored in the upper 
bits of
    +///    the destination.
    +/// \returns A 128-bit vector of [8 x i16] containing the horizontal 
differences
    +///    of both operands.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_hsub_epi16(__m128i __a, __m128i __b)
    +{
    +    return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
    +}
    +
    +/// Horizontally subtracts the adjacent pairs of values contained in 2
    +///    packed 128-bit vectors of [4 x i32].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c VPHSUBD instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x i32] containing one of the source 
operands. The
    +///    horizontal differences between the values are stored in the lower 
bits of
    +///    the destination.
    +/// \param __b
    +///    A 128-bit vector of [4 x i32] containing one of the source 
operands. The
    +///    horizontal differences between the values are stored in the upper 
bits of
    +///    the destination.
    +/// \returns A 128-bit vector of [4 x i32] containing the horizontal 
differences
    +///    of both operands.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_hsub_epi32(__m128i __a, __m128i __b)
    +{
    +    return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
    +}
    +
    +/// Horizontally subtracts the adjacent pairs of values contained in 2
    +///    packed 64-bit vectors of [4 x i16].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c PHSUBW instruction.
    +///
    +/// \param __a
    +///    A 64-bit vector of [4 x i16] containing one of the source operands. 
The
    +///    horizontal differences between the values are stored in the lower 
bits of
    +///    the destination.
    +/// \param __b
    +///    A 64-bit vector of [4 x i16] containing one of the source operands. 
The
    +///    horizontal differences between the values are stored in the upper 
bits of
    +///    the destination.
    +/// \returns A 64-bit vector of [4 x i16] containing the horizontal 
differences
    +///    of both operands.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_hsub_pi16(__m64 __a, __m64 __b)
    +{
    +    return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
    +}
    +
    +/// Horizontally subtracts the adjacent pairs of values contained in 2
    +///    packed 64-bit vectors of [2 x i32].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c PHSUBD instruction.
    +///
    +/// \param __a
    +///    A 64-bit vector of [2 x i32] containing one of the source operands. 
The
    +///    horizontal differences between the values are stored in the lower 
bits of
    +///    the destination.
    +/// \param __b
    +///    A 64-bit vector of [2 x i32] containing one of the source operands. 
The
    +///    horizontal differences between the values are stored in the upper 
bits of
    +///    the destination.
    +/// \returns A 64-bit vector of [2 x i32] containing the horizontal 
differences
    +///    of both operands.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_hsub_pi32(__m64 __a, __m64 __b)
    +{
    +    return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
    +}
    +
    +/// Horizontally subtracts the adjacent pairs of values contained in 2
    +///    packed 128-bit vectors of [8 x i16]. Positive differences greater 
than
    +///    0x7FFF are saturated to 0x7FFF. Negative differences less than 
0x8000 are
    +///    saturated to 0x8000.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c VPHSUBSW instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [8 x i16] containing one of the source 
operands. The
    +///    horizontal differences between the values are stored in the lower 
bits of
    +///    the destination.
    +/// \param __b
    +///    A 128-bit vector of [8 x i16] containing one of the source 
operands. The
    +///    horizontal differences between the values are stored in the upper 
bits of
    +///    the destination.
    +/// \returns A 128-bit vector of [8 x i16] containing the horizontal 
saturated
    +///    differences of both operands.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_hsubs_epi16(__m128i __a, __m128i __b)
    +{
    +    return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
    +}
    +
    +/// Horizontally subtracts the adjacent pairs of values contained in 2
    +///    packed 64-bit vectors of [4 x i16]. Positive differences greater 
than
    +///    0x7FFF are saturated to 0x7FFF. Negative differences less than 
0x8000 are
    +///    saturated to 0x8000.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c PHSUBSW instruction.
    +///
    +/// \param __a
    +///    A 64-bit vector of [4 x i16] containing one of the source operands. 
The
    +///    horizontal differences between the values are stored in the lower 
bits of
    +///    the destination.
    +/// \param __b
    +///    A 64-bit vector of [4 x i16] containing one of the source operands. 
The
    +///    horizontal differences between the values are stored in the upper 
bits of
    +///    the destination.
    +/// \returns A 64-bit vector of [4 x i16] containing the horizontal 
saturated
    +///    differences of both operands.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_hsubs_pi16(__m64 __a, __m64 __b)
    +{
    +    return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
    +}
    +
    +/// Multiplies corresponding pairs of packed 8-bit unsigned integer
    +///    values contained in the first source operand and packed 8-bit signed
    +///    integer values contained in the second source operand, adds pairs of
    +///    contiguous products with signed saturation, and writes the 16-bit 
sums to
    +///    the corresponding bits in the destination.
    +///
    +///    For example, bits [7:0] of both operands are multiplied, bits 
[15:8] of
    +///    both operands are multiplied, and the sum of both results is 
written to
    +///    bits [15:0] of the destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the first source operand.
    +/// \param __b
    +///    A 128-bit integer vector containing the second source operand.
    +/// \returns A 128-bit integer vector containing the sums of products of 
both
    +///    operands: \n
    +///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
    +///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
    +///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
    +///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
    +///    \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
    +///    \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
    +///    \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
    +///    \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_maddubs_epi16(__m128i __a, __m128i __b)
    +{
    +    return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
    +}
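
    The argument order matters here: the first operand supplies the unsigned
    bytes, the second the signed bytes. A sketch (name ours):

        /* 16-bit lane i = u[2i]*s[2i] + u[2i+1]*s[2i+1], with signed saturation. */
        static __m128i dot_pairs(__m128i u8_coeffs, __m128i s8_samples)
        {
            return _mm_maddubs_epi16(u8_coeffs, s8_samples);
        }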
    +
    +/// Multiplies corresponding pairs of packed 8-bit unsigned integer
    +///    values contained in the first source operand and packed 8-bit signed
    +///    integer values contained in the second source operand, adds pairs of
    +///    contiguous products with signed saturation, and writes the 16-bit 
sums to
    +///    the corresponding bits in the destination.
    +///
    +///    For example, bits [7:0] of both operands are multiplied, bits 
[15:8] of
    +///    both operands are multiplied, and the sum of both results is 
written to
    +///    bits [15:0] of the destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c PMADDUBSW instruction.
    +///
    +/// \param __a
    +///    A 64-bit integer vector containing the first source operand.
    +/// \param __b
    +///    A 64-bit integer vector containing the second source operand.
    +/// \returns A 64-bit integer vector containing the sums of products of 
both
    +///    operands: \n
    +///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
    +///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
    +///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
    +///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_maddubs_pi16(__m64 __a, __m64 __b)
    +{
    +    return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
    +}
    +
    +/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
    +///    products to the 18 most significant bits by right-shifting, rounds 
the
    +///    truncated value by adding 1, and writes bits [16:1] to the 
destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c VPMULHRSW instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [8 x i16] containing one of the source operands.
    +/// \param __b
    +///    A 128-bit vector of [8 x i16] containing one of the source operands.
    +/// \returns A 128-bit vector of [8 x i16] containing the rounded and 
scaled
    +///    products of both operands.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_mulhrs_epi16(__m128i __a, __m128i __b)
    +{
    +    return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
    +}
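
    In practice this is the Q15 fixed-point multiply: each 16-bit lane becomes
    round(a*b / 2^15). A sketch (name ours):

        static __m128i q15_mul(__m128i a, __m128i b)
        {
            return _mm_mulhrs_epi16(a, b);
        }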
    +
    +/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
    +///    products to the 18 most significant bits by right-shifting, rounds 
the
    +///    truncated value by adding 1, and writes bits [16:1] to the 
destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c PMULHRSW instruction.
    +///
    +/// \param __a
    +///    A 64-bit vector of [4 x i16] containing one of the source operands.
    +/// \param __b
    +///    A 64-bit vector of [4 x i16] containing one of the source operands.
    +/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
    +///    products of both operands.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_mulhrs_pi16(__m64 __a, __m64 __b)
    +{
    +    return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
    +}
    +
    +/// Copies the 8-bit integers from a 128-bit integer vector to the
    +///    destination or clears 8-bit values in the destination, as specified 
by
    +///    the second source operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c VPSHUFB instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the values to be copied.
    +/// \param __b
    +///    A 128-bit integer vector containing control bytes corresponding to
    +///    positions in the destination:
    +///    Bit 7: \n
    +///    1: Clear the corresponding byte in the destination. \n
    +///    0: Copy the selected source byte to the corresponding byte in the
    +///    destination. \n
    +///    Bits [6:4] Reserved.  \n
    +///    Bits [3:0] select the source byte to be copied.
    +/// \returns A 128-bit integer vector containing the copied or cleared 
values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_shuffle_epi8(__m128i __a, __m128i __b)
    +{
    +    return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
    +}
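
    A classic use is a table-driven byte permutation, e.g. reversing the byte
    order of a vector with a single shuffle (sketch, name ours):

        static __m128i bswap128(__m128i v)
        {
            const __m128i rev = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                                               7,  6,  5,  4,  3,  2, 1, 0);
            return _mm_shuffle_epi8(v, rev);
        }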
    +
    +/// Copies the 8-bit integers from a 64-bit integer vector to the
    +///    destination or clears 8-bit values in the destination, as specified 
by
    +///    the second source operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c PSHUFB instruction.
    +///
    +/// \param __a
    +///    A 64-bit integer vector containing the values to be copied.
    +/// \param __b
    +///    A 64-bit integer vector containing control bytes corresponding to
    +///    positions in the destination:
    +///    Bit 7: \n
    +///    1: Clear the corresponding byte in the destination. \n
    +///    0: Copy the selected source byte to the corresponding byte in the
    +///    destination. \n
    +///    Bits [3:0] select the source byte to be copied.
    +/// \returns A 64-bit integer vector containing the copied or cleared 
values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_shuffle_pi8(__m64 __a, __m64 __b)
    +{
    +    return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
    +}
    +
    +/// For each 8-bit integer in the first source operand, perform one of
    +///    the following actions as specified by the second source operand.
    +///
    +///    If the byte in the second source is negative, calculate the two's
    +///    complement of the corresponding byte in the first source, and write 
that
    +///    value to the destination. If the byte in the second source is 
positive,
    +///    copy the corresponding byte from the first source to the 
destination. If
    +///    the byte in the second source is zero, clear the corresponding byte 
in
    +///    the destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c VPSIGNB instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the values to be copied.
    +/// \param __b
    +///    A 128-bit integer vector containing control bytes corresponding to
    +///    positions in the destination.
    +/// \returns A 128-bit integer vector containing the resultant values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_sign_epi8(__m128i __a, __m128i __b)
    +{
    +    return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
    +}
    +
    +/// For each 16-bit integer in the first source operand, perform one of
    +///    the following actions as specified by the second source operand.
    +///
    +///    If the word in the second source is negative, calculate the two's
    +///    complement of the corresponding word in the first source, and write that
    +///    value to the destination. If the word in the second source is positive,
    +///    copy the corresponding word from the first source to the destination. If
    +///    the word in the second source is zero, clear the corresponding word in
    +///    the destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c VPSIGNW instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the values to be copied.
    +/// \param __b
    +///    A 128-bit integer vector containing control words corresponding to
    +///    positions in the destination.
    +/// \returns A 128-bit integer vector containing the resultant values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_sign_epi16(__m128i __a, __m128i __b)
    +{
    +    return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
    +}
    +
    +/// For each 32-bit integer in the first source operand, perform one of
    +///    the following actions as specified by the second source operand.
    +///
    +///    If the doubleword in the second source is negative, calculate the two's
    +///    complement of the corresponding doubleword in the first source, and
    +///    write that value to the destination. If the doubleword in the second
    +///    source is positive, copy the corresponding doubleword from the first
    +///    source to the destination. If the doubleword in the second source is
    +///    zero, clear the corresponding doubleword in the destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c VPSIGND instruction.
    +///
    +/// \param __a
    +///    A 128-bit integer vector containing the values to be copied.
    +/// \param __b
    +///    A 128-bit integer vector containing control doublewords corresponding to
    +///    positions in the destination.
    +/// \returns A 128-bit integer vector containing the resultant values.
    +static __inline__ __m128i __DEFAULT_FN_ATTRS
    +_mm_sign_epi32(__m128i __a, __m128i __b)
    +{
    +    return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
    +}
    +
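    A small sketch of how the sign intrinsics are typically used (illustrative
    only, not part of the patch): each result lane is the value, its negation, or
    zero, depending on the sign of the corresponding control lane.

        /* Per 16-bit lane: r[i] = x[i] if s[i] > 0, -x[i] if s[i] < 0, 0 if s[i] == 0.
         * E.g. x = {1, 2, 3, 4, ...} and s = {-1, 0, 5, -7, ...} give {-1, 0, 3, -4, ...}. */
        static inline __m128i apply_sign16(__m128i x, __m128i s)
        {
                return _mm_sign_epi16(x, s);
        }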
    +/// For each 8-bit integer in the first source operand, perform one of
    +///    the following actions as specified by the second source operand.
    +///
    +///    If the byte in the second source is negative, calculate the two's
    +///    complement of the corresponding byte in the first source, and write that
    +///    value to the destination. If the byte in the second source is positive,
    +///    copy the corresponding byte from the first source to the destination. If
    +///    the byte in the second source is zero, clear the corresponding byte in
    +///    the destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c PSIGNB instruction.
    +///
    +/// \param __a
    +///    A 64-bit integer vector containing the values to be copied.
    +/// \param __b
    +///    A 64-bit integer vector containing control bytes corresponding to
    +///    positions in the destination.
    +/// \returns A 64-bit integer vector containing the resultant values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_sign_pi8(__m64 __a, __m64 __b)
    +{
    +    return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
    +}
    +
    +/// For each 16-bit integer in the first source operand, perform one of
    +///    the following actions as specified by the second source operand.
    +///
    +///    If the word in the second source is negative, calculate the two's
    +///    complement of the corresponding word in the first source, and write that
    +///    value to the destination. If the word in the second source is positive,
    +///    copy the corresponding word from the first source to the destination. If
    +///    the word in the second source is zero, clear the corresponding word in
    +///    the destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c PSIGNW instruction.
    +///
    +/// \param __a
    +///    A 64-bit integer vector containing the values to be copied.
    +/// \param __b
    +///    A 64-bit integer vector containing control words corresponding to
    +///    positions in the destination.
    +/// \returns A 64-bit integer vector containing the resultant values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_sign_pi16(__m64 __a, __m64 __b)
    +{
    +    return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
    +}
    +
    +/// For each 32-bit integer in the first source operand, perform one of
    +///    the following actions as specified by the second source operand.
    +///
    +///    If the doubleword in the second source is negative, calculate the two's
    +///    complement of the corresponding doubleword in the first source, and
    +///    write that value to the destination. If the doubleword in the second
    +///    source is positive, copy the corresponding doubleword from the first
    +///    source to the destination. If the doubleword in the second source is
    +///    zero, clear the corresponding doubleword in the destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the \c PSIGND instruction.
    +///
    +/// \param __a
    +///    A 64-bit integer vector containing the values to be copied.
    +/// \param __b
    +///    A 64-bit integer vector containing two control doublewords corresponding
    +///    to positions in the destination.
    +/// \returns A 64-bit integer vector containing the resultant values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_sign_pi32(__m64 __a, __m64 __b)
    +{
    +    return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
    +}
    +
    +#undef __DEFAULT_FN_ATTRS
    +#undef __DEFAULT_FN_ATTRS_MMX
    +
    +#endif /* __TMMINTRIN_H */
    diff --git a/include/xmmintrin.h b/include/xmmintrin.h
    new file mode 100644
    index 0000000..e2543a7
    --- /dev/null
    +++ b/include/xmmintrin.h
    @@ -0,0 +1,3101 @@
    +/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
    + *
    + * Permission is hereby granted, free of charge, to any person obtaining a copy
    + * of this software and associated documentation files (the "Software"), to deal
    + * in the Software without restriction, including without limitation the rights
    + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    + * copies of the Software, and to permit persons to whom the Software is
    + * furnished to do so, subject to the following conditions:
    + *
    + * The above copyright notice and this permission notice shall be included in
    + * all copies or substantial portions of the Software.
    + *
    + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    + * THE SOFTWARE.
    + *
    + *===-----------------------------------------------------------------------===
    + */
    +
    +#ifndef __XMMINTRIN_H
    +#define __XMMINTRIN_H
    +
    +#include <mmintrin.h>
    +
    +typedef int __v4si __attribute__((__vector_size__(16)));
    +typedef float __v4sf __attribute__((__vector_size__(16)));
    +typedef float __m128 __attribute__((__vector_size__(16)));
    +
    +/* Unsigned types */
    +typedef unsigned int __v4su __attribute__((__vector_size__(16)));
    +
    +/* This header should only be included in a hosted environment as it depends on
    + * a standard library to provide allocation routines. */
    +#if __STDC_HOSTED__
    +#include <mm_malloc.h>
    +#endif
    +
    +/* Define the default attributes for the functions in this file. */
    +#ifdef  __GNUC__
    +#define __DEFAULT_FN_ATTRS __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    +#define __DEFAULT_FN_ATTRS_MMX __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    +#else
    +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128)))
    +#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64)))
    +#endif
    +
    +#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
    +
    +
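    The macro simply packs four 2-bit lane selectors (z picks the highest result
    lane, w the lowest) into the 8-bit immediate used by the shuffle intrinsics.
    A hedged sketch, assuming _mm_shuffle_ps as defined further down in this
    header:

        /* _MM_SHUFFLE(3, 2, 1, 0) == 0xE4 keeps the lane order;
         * _MM_SHUFFLE(0, 1, 2, 3) == 0x1B reverses it.          */
        static inline __m128 reverse_lanes(__m128 v)
        {
                return _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 1, 2, 3));
        }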
    +/// Adds the 32-bit float values in the low-order bits of the operands.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing one of the source operands.
    +///    The lower 32 bits of this operand are used in the calculation.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing one of the source operands.
    +///    The lower 32 bits of this operand are used in the calculation.
    +/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
    +///    of the lower 32 bits of both operands. The upper 96 bits are copied from
    +///    the upper 96 bits of the first source operand.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_add_ss(__m128 __a, __m128 __b)
    +{
    +  __a[0] += __b[0];
    +  return __a;
    +}
    +
    +/// Adds two 128-bit vectors of [4 x float], and returns the results of
    +///    the addition.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing one of the source operands.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing one of the source operands.
    +/// \returns A 128-bit vector of [4 x float] containing the sums of both
    +///    operands.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_add_ps(__m128 __a, __m128 __b)
    +{
    +  return (__m128)((__v4sf)__a + (__v4sf)__b);
    +}
    +
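    To illustrate the scalar/packed split (a sketch, not part of the patch): with
    a = {1, 2, 3, 4} and b = {10, 20, 30, 40}, _mm_add_ss(a, b) yields
    {11, 2, 3, 4} while _mm_add_ps(a, b) yields {11, 22, 33, 44}.

        static inline __m128 add_low_lane(__m128 a, __m128 b)
        {
                /* Only lane 0 is summed; lanes 1-3 are copied from a. */
                return _mm_add_ss(a, b);
        }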
    +/// Subtracts the 32-bit float value in the low-order bits of the second
    +///    operand from the corresponding value in the first operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
    +///    of this operand are used in the calculation.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
    +///    bits of this operand are used in the calculation.
    +/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
    +///    difference of the lower 32 bits of both operands. The upper 96 bits are
    +///    copied from the upper 96 bits of the first source operand.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_sub_ss(__m128 __a, __m128 __b)
    +{
    +  __a[0] -= __b[0];
    +  return __a;
    +}
    +
    +/// Subtracts each of the values of the second operand from the first
    +///    operand, both of which are 128-bit vectors of [4 x float] and returns
    +///    the results of the subtraction.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing the minuend.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing the subtrahend.
    +/// \returns A 128-bit vector of [4 x float] containing the differences between
    +///    both operands.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_sub_ps(__m128 __a, __m128 __b)
    +{
    +  return (__m128)((__v4sf)__a - (__v4sf)__b);
    +}
    +
    +/// Multiplies two 32-bit float values in the low-order bits of the
    +///    operands.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing one of the source operands.
    +///    The lower 32 bits of this operand are used in the calculation.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing one of the source operands.
    +///    The lower 32 bits of this operand are used in the calculation.
    +/// \returns A 128-bit vector of [4 x float] containing the product of the lower
    +///    32 bits of both operands. The upper 96 bits are copied from the upper 96
    +///    bits of the first source operand.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_mul_ss(__m128 __a, __m128 __b)
    +{
    +  __a[0] *= __b[0];
    +  return __a;
    +}
    +
    +/// Multiplies two 128-bit vectors of [4 x float] and returns the
    +///    results of the multiplication.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing one of the source operands.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing one of the source operands.
    +/// \returns A 128-bit vector of [4 x float] containing the products of both
    +///    operands.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_mul_ps(__m128 __a, __m128 __b)
    +{
    +  return (__m128)((__v4sf)__a * (__v4sf)__b);
    +}
    +
    +/// Divides the value in the low-order 32 bits of the first operand by
    +///    the corresponding value in the second operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing the dividend. The lower 32
    +///    bits of this operand are used in the calculation.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
    +///    of this operand are used in the calculation.
    +/// \returns A 128-bit vector of [4 x float] containing the quotients of the
    +///    lower 32 bits of both operands. The upper 96 bits are copied from the
    +///    upper 96 bits of the first source operand.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_div_ss(__m128 __a, __m128 __b)
    +{
    +  __a[0] /= __b[0];
    +  return __a;
    +}
    +
    +/// Divides two 128-bit vectors of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing the dividend.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing the divisor.
    +/// \returns A 128-bit vector of [4 x float] containing the quotients of both
    +///    operands.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_div_ps(__m128 __a, __m128 __b)
    +{
    +  return (__m128)((__v4sf)__a / (__v4sf)__b);
    +}
    +
    +/// Calculates the square root of the value stored in the low-order bits
    +///    of a 128-bit vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    +///    used in the calculation.
    +/// \returns A 128-bit vector of [4 x float] containing the square root of the
    +///    value in the low-order bits of the operand.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_sqrt_ss(__m128 __a)
    +{
    +  return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
    +}
    +
    +/// Calculates the square roots of the values stored in a 128-bit vector
    +///    of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \returns A 128-bit vector of [4 x float] containing the square roots of the
    +///    values in the operand.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_sqrt_ps(__m128 __a)
    +{
    +  return __builtin_ia32_sqrtps((__v4sf)__a);
    +}
    +
    +/// Calculates the approximate reciprocal of the value stored in the
    +///    low-order bits of a 128-bit vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    +///    used in the calculation.
    +/// \returns A 128-bit vector of [4 x float] containing the approximate
    +///    reciprocal of the value in the low-order bits of the operand.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_rcp_ss(__m128 __a)
    +{
    +  return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
    +}
    +
    +/// Calculates the approximate reciprocals of the values stored in a
    +///    128-bit vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \returns A 128-bit vector of [4 x float] containing the approximate
    +///    reciprocals of the values in the operand.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_rcp_ps(__m128 __a)
    +{
    +  return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
    +}
    +
    +/// Calculates the approximate reciprocal of the square root of the value
    +///    stored in the low-order bits of a 128-bit vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    +///    used in the calculation.
    +/// \returns A 128-bit vector of [4 x float] containing the approximate
    +///    reciprocal of the square root of the value in the low-order bits of the
    +///    operand.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_rsqrt_ss(__m128 __a)
    +{
    +  return __builtin_ia32_rsqrtss((__v4sf)__a);
    +}
    +
    +/// Calculates the approximate reciprocals of the square roots of the
    +///    values stored in a 128-bit vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \returns A 128-bit vector of [4 x float] containing the approximate
    +///    reciprocals of the square roots of the values in the operand.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_rsqrt_ps(__m128 __a)
    +{
    +  return __builtin_ia32_rsqrtps((__v4sf)__a);
    +}
    +
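    The rsqrt estimate is only good to roughly 12 bits of precision; a common
    follow-up (a sketch, assuming _mm_set1_ps as defined later in this header) is
    one Newton-Raphson step:

        static inline __m128 rsqrt_refined(__m128 x)
        {
                const __m128 half  = _mm_set1_ps(0.5f);
                const __m128 three = _mm_set1_ps(3.0f);
                __m128 e = _mm_rsqrt_ps(x);   /* initial ~12-bit estimate */
                /* e' = 0.5 * e * (3 - x * e * e) */
                return _mm_mul_ps(_mm_mul_ps(half, e),
                                  _mm_sub_ps(three, _mm_mul_ps(x, _mm_mul_ps(e, e))));
        }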
    +/// Compares two 32-bit float values in the low-order bits of both
    +///    operands and returns the lesser value in the low-order bits of the
    +///    vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    +///    32 bits of this operand are used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    +///    32 bits of this operand are used in the comparison.
    +/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
    +///    minimum value between both operands. The upper 96 bits are copied from
    +///    the upper 96 bits of the first source operand.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_min_ss(__m128 __a, __m128 __b)
    +{
    +  return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Compares two 128-bit vectors of [4 x float] and returns the lesser
    +///    of each pair of values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing one of the operands.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing one of the operands.
    +/// \returns A 128-bit vector of [4 x float] containing the minimum values
    +///    between both operands.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_min_ps(__m128 __a, __m128 __b)
    +{
    +  return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Compares two 32-bit float values in the low-order bits of both
    +///    operands and returns the greater value in the low-order bits of a 128-bit
    +///    vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    +///    32 bits of this operand are used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    +///    32 bits of this operand are used in the comparison.
    +/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
    +///    maximum value between both operands. The upper 96 bits are copied from
    +///    the upper 96 bits of the first source operand.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_max_ss(__m128 __a, __m128 __b)
    +{
    +  return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Compares two 128-bit vectors of [4 x float] and returns the greater
    +///    of each pair of values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing one of the operands.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing one of the operands.
    +/// \returns A 128-bit vector of [4 x float] containing the maximum values
    +///    between both operands.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_max_ps(__m128 __a, __m128 __b)
    +{
    +  return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
    +}
    +
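    A typical use of the packed min/max pair (sketch only, not from the patch) is
    branch-free clamping:

        /* Clamp every lane of v into [lo, hi]. */
        static inline __m128 clamp_ps(__m128 v, __m128 lo, __m128 hi)
        {
                return _mm_min_ps(_mm_max_ps(v, lo), hi);
        }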
    +/// Performs a bitwise AND of two 128-bit vectors of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector containing one of the source operands.
    +/// \param __b
    +///    A 128-bit vector containing one of the source operands.
    +/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
    +///    values between both operands.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_and_ps(__m128 __a, __m128 __b)
    +{
    +  return (__m128)((__v4su)__a & (__v4su)__b);
    +}
    +
    +/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
    +///    the one's complement of the values contained in the first source
    +///    operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing the first source operand. The
    +///    one's complement of this value is used in the bitwise AND.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing the second source operand.
    +/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
    +///    one's complement of the first operand and the values in the second
    +///    operand.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_andnot_ps(__m128 __a, __m128 __b)
    +{
    +  return (__m128)(~(__v4su)__a & (__v4su)__b);
    +}
    +
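    A classic use of the AND-NOT form (sketch only; assumes _mm_set1_ps from later
    in this header) is clearing the sign bit to get per-lane absolute values:

        static inline __m128 abs_ps(__m128 v)
        {
                /* ~(-0.0f) is a mask with every bit set except the sign bit. */
                return _mm_andnot_ps(_mm_set1_ps(-0.0f), v);
        }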
    +/// Performs a bitwise OR of two 128-bit vectors of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing one of the source operands.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing one of the source operands.
    +/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
    +///    values between both operands.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_or_ps(__m128 __a, __m128 __b)
    +{
    +  return (__m128)((__v4su)__a | (__v4su)__b);
    +}
    +
    +/// Performs a bitwise exclusive OR of two 128-bit vectors of
    +///    [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing one of the source operands.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing one of the source operands.
    +/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
    +///    of the values between both operands.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_xor_ps(__m128 __a, __m128 __b)
    +{
    +  return (__m128)((__v4su)__a ^ (__v4su)__b);
    +}
    +
    +/// Compares two 32-bit float values in the low-order bits of both
    +///    operands for equality and returns the result of the comparison in the
    +///    low-order bits of a vector [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    +///    32 bits of this operand are used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing one of the operands. The lower
    +///    32 bits of this operand are used in the comparison.
    +/// \returns A 128-bit vector of [4 x float] containing the comparison results
    +///    in the low-order bits.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cmpeq_ss(__m128 __a, __m128 __b)
    +{
    +  return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Compares each of the corresponding 32-bit float values of the
    +///    128-bit vectors of [4 x float] for equality.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \param __b
    +///    A 128-bit vector of [4 x float].
    +/// \returns A 128-bit vector of [4 x float] containing the comparison results.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cmpeq_ps(__m128 __a, __m128 __b)
    +{
    +  return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Compares two 32-bit float values in the low-order bits of both
    +///    operands to determine if the value in the first operand is less 
than the
    +///    corresponding value in the second operand and returns the result of 
the
    +///    comparison in the low-order bits of a vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> 
instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing one of the operands. The 
lower
    +///    32 bits of this operand are used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing one of the operands. The 
lower
    +///    32 bits of this operand are used in the comparison.
    +/// \returns A 128-bit vector of [4 x float] containing the comparison 
results
    +///    in the low-order bits.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cmplt_ss(__m128 __a, __m128 __b)
    +{
    +  return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Compares each of the corresponding 32-bit float values of the
    +///    128-bit vectors of [4 x float] to determine if the values in the 
first
    +///    operand are less than those in the second operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> 
instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \param __b
    +///    A 128-bit vector of [4 x float].
    +/// \returns A 128-bit vector of [4 x float] containing the comparison 
results.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cmplt_ps(__m128 __a, __m128 __b)
    +{
    +  return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
    +}
    +
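    The packed comparisons return all-ones or all-zeros per lane, so they combine
    with the bitwise operations above into branch-free selects (a sketch, not part
    of the patch):

        /* Per lane: (a < b) ? a : b */
        static inline __m128 select_min(__m128 a, __m128 b)
        {
                __m128 mask = _mm_cmplt_ps(a, b);
                return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
        }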
    +/// Compares two 32-bit float values in the low-order bits of both
    +///    operands to determine if the value in the first operand is less 
than or
    +///    equal to the corresponding value in the second operand and returns 
the
    +///    result of the comparison in the low-order bits of a vector of
    +///    [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> 
instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing one of the operands. The 
lower
    +///    32 bits of this operand are used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing one of the operands. The 
lower
    +///    32 bits of this operand are used in the comparison.
    +/// \returns A 128-bit vector of [4 x float] containing the comparison 
results
    +///    in the low-order bits.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cmple_ss(__m128 __a, __m128 __b)
    +{
    +  return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Compares each of the corresponding 32-bit float values of the
    +///    128-bit vectors of [4 x float] to determine if the values in the 
first
    +///    operand are less than or equal to those in the second operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> 
instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \param __b
    +///    A 128-bit vector of [4 x float].
    +/// \returns A 128-bit vector of [4 x float] containing the comparison 
results.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cmple_ps(__m128 __a, __m128 __b)
    +{
    +  return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Compares two 32-bit float values in the low-order bits of both
    +///    operands to determine if the value in the first operand is greater 
than
    +///    the corresponding value in the second operand and returns the 
result of
    +///    the comparison in the low-order bits of a vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> 
instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing one of the operands. The 
lower
    +///    32 bits of this operand are used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing one of the operands. The 
lower
    +///    32 bits of this operand are used in the comparison.
    +/// \returns A 128-bit vector of [4 x float] containing the comparison 
results
    +///    in the low-order bits.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cmpgt_ss(__m128 __a, __m128 __b)
    +{
    +#ifdef __GNUC__
    +  return (__m128) __builtin_ia32_movss ((__v4sf) __a, (__v4sf)
    +           __builtin_ia32_cmpltss ((__v4sf) __b, (__v4sf) __a));
    +#else
    +  return (__m128)__builtin_shufflevector((__v4sf)__a,
    +                                         (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
    +                                         4, 1, 2, 3);
    +#endif
    +}
    +
    +/// Compares each of the corresponding 32-bit float values of the
    +///    128-bit vectors of [4 x float] to determine if the values in the 
first
    +///    operand are greater than those in the second operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> 
instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \param __b
    +///    A 128-bit vector of [4 x float].
    +/// \returns A 128-bit vector of [4 x float] containing the comparison 
results.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cmpgt_ps(__m128 __a, __m128 __b)
    +{
    +  return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
    +}
    +
    +/// Compares two 32-bit float values in the low-order bits of both
    +///    operands to determine if the value in the first operand is greater 
than
    +///    or equal to the corresponding value in the second operand and 
returns
    +///    the result of the comparison in the low-order bits of a vector of
    +///    [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> 
instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing one of the operands. The 
lower
    +///    32 bits of this operand are used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing one of the operands. The 
lower
    +///    32 bits of this operand are used in the comparison.
    +/// \returns A 128-bit vector of [4 x float] containing the comparison 
results
    +///    in the low-order bits.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cmpge_ss(__m128 __a, __m128 __b)
    +{
    +#ifdef __GNUC__
    +  return (__m128) __builtin_ia32_movss ((__v4sf) __a, (__v4sf)
    +           __builtin_ia32_cmpless ((__v4sf) __b, (__v4sf) __a));
    +#else
    +  return (__m128)__builtin_shufflevector((__v4sf)__a,
    +                                         (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
    +                                         4, 1, 2, 3);
    +#endif
    +}
    +
    +/// Compares each of the corresponding 32-bit float values of the
    +///    128-bit vectors of [4 x float] to determine if the values in the 
first
    +///    operand are greater than or equal to those in the second operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> 
instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \param __b
    +///    A 128-bit vector of [4 x float].
    +/// \returns A 128-bit vector of [4 x float] containing the comparison 
results.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cmpge_ps(__m128 __a, __m128 __b)
    +{
    +  return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
    +}
    +
    +/// Compares two 32-bit float values in the low-order bits of both
    +///    operands for inequality and returns the result of the comparison in 
the
    +///    low-order bits of a vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
    +///   instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing one of the operands. The 
lower
    +///    32 bits of this operand are used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing one of the operands. The 
lower
    +///    32 bits of this operand are used in the comparison.
    +/// \returns A 128-bit vector of [4 x float] containing the comparison 
results
    +///    in the low-order bits.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cmpneq_ss(__m128 __a, __m128 __b)
    +{
    +  return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Compares each of the corresponding 32-bit float values of the
    +///    128-bit vectors of [4 x float] for inequality.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
    +///   instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \param __b
    +///    A 128-bit vector of [4 x float].
    +/// \returns A 128-bit vector of [4 x float] containing the comparison 
results.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cmpneq_ps(__m128 __a, __m128 __b)
    +{
    +  return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Compares two 32-bit float values in the low-order bits of both
    +///    operands to determine if the value in the first operand is not less 
than
    +///    the corresponding value in the second operand and returns the 
result of
    +///    the comparison in the low-order bits of a vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
    +///   instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing one of the operands. The 
lower
    +///    32 bits of this operand are used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing one of the operands. The 
lower
    +///    32 bits of this operand are used in the comparison.
    +/// \returns A 128-bit vector of [4 x float] containing the comparison 
results
    +///    in the low-order bits.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cmpnlt_ss(__m128 __a, __m128 __b)
    +{
    +  return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Compares each of the corresponding 32-bit float values of the
    +///    128-bit vectors of [4 x float] to determine if the values in the 
first
    +///    operand are not less than those in the second operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
    +///   instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \param __b
    +///    A 128-bit vector of [4 x float].
    +/// \returns A 128-bit vector of [4 x float] containing the comparison 
results.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cmpnlt_ps(__m128 __a, __m128 __b)
    +{
    +  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Compares two 32-bit float values in the low-order bits of both
    +///    operands to determine if the value in the first operand is not less 
than
    +///    or equal to the corresponding value in the second operand and 
returns
    +///    the result of the comparison in the low-order bits of a vector of
    +///    [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
    +///   instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing one of the operands. The 
lower
    +///    32 bits of this operand are used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing one of the operands. The 
lower
    +///    32 bits of this operand are used in the comparison.
    +/// \returns A 128-bit vector of [4 x float] containing the comparison 
results
    +///    in the low-order bits.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cmpnle_ss(__m128 __a, __m128 __b)
    +{
    +  return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Compares each of the corresponding 32-bit float values of the
    +///    128-bit vectors of [4 x float] to determine if the values in the 
first
    +///    operand are not less than or equal to those in the second operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
    +///   instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \param __b
    +///    A 128-bit vector of [4 x float].
    +/// \returns A 128-bit vector of [4 x float] containing the comparison 
results.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cmpnle_ps(__m128 __a, __m128 __b)
    +{
    +  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Compares two 32-bit float values in the low-order bits of both
    +///    operands to determine if the value in the first operand is not 
greater
    +///    than the corresponding value in the second operand and returns the
    +///    result of the comparison in the low-order bits of a vector of
    +///    [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
    +///   instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing one of the operands. The 
lower
    +///    32 bits of this operand are used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing one of the operands. The 
lower
    +///    32 bits of this operand are used in the comparison.
    +/// \returns A 128-bit vector of [4 x float] containing the comparison 
results
    +///    in the low-order bits.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cmpngt_ss(__m128 __a, __m128 __b)
    +{
    +
    +#ifdef __GNUC__
    +  return (__m128) __builtin_ia32_movss ((__v4sf) __a, (__v4sf)
    +           __builtin_ia32_cmpnltss ((__v4sf) __b, (__v4sf) __a));
    +#else
    +  return (__m128)__builtin_shufflevector((__v4sf)__a,
    +                                         (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
    +                                         4, 1, 2, 3);
    +#endif
    +}
    +
    +/// Compares each of the corresponding 32-bit float values of the
    +///    128-bit vectors of [4 x float] to determine if the values in the 
first
    +///    operand are not greater than those in the second operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
    +///   instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \param __b
    +///    A 128-bit vector of [4 x float].
    +/// \returns A 128-bit vector of [4 x float] containing the comparison 
results.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cmpngt_ps(__m128 __a, __m128 __b)
    +{
    +  return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
    +}
    +
    +/// Compares two 32-bit float values in the low-order bits of both
    +///    operands to determine if the value in the first operand is not 
greater
    +///    than or equal to the corresponding value in the second operand and
    +///    returns the result of the comparison in the low-order bits of a 
vector
    +///    of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
    +///   instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing one of the operands. The 
lower
    +///    32 bits of this operand are used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing one of the operands. The 
lower
    +///    32 bits of this operand are used in the comparison.
    +/// \returns A 128-bit vector of [4 x float] containing the comparison 
results
    +///    in the low-order bits.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cmpnge_ss(__m128 __a, __m128 __b)
    +{
    +#ifdef __GNUC__
    +  return (__m128) __builtin_ia32_movss ((__v4sf) __a, (__v4sf)
    +           __builtin_ia32_cmpnless ((__v4sf) __b, (__v4sf) __a));
    +#else
    +  return (__m128)__builtin_shufflevector((__v4sf)__a,
    +                                         (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
    +                                         4, 1, 2, 3);
    +#endif
    +}
    +
    +/// Compares each of the corresponding 32-bit float values of the
    +///    128-bit vectors of [4 x float] to determine if the values in the 
first
    +///    operand are not greater than or equal to those in the second 
operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
    +///   instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \param __b
    +///    A 128-bit vector of [4 x float].
    +/// \returns A 128-bit vector of [4 x float] containing the comparison 
results.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cmpnge_ps(__m128 __a, __m128 __b)
    +{
    +  return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
    +}
    +
    +/// Compares two 32-bit float values in the low-order bits of both
    +///    operands to determine if the value in the first operand is ordered 
with
    +///    respect to the corresponding value in the second operand and 
returns the
    +///    result of the comparison in the low-order bits of a vector of
    +///    [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
    +///   instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing one of the operands. The 
lower
    +///    32 bits of this operand are used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing one of the operands. The 
lower
    +///    32 bits of this operand are used in the comparison.
    +/// \returns A 128-bit vector of [4 x float] containing the comparison 
results
    +///    in the low-order bits.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cmpord_ss(__m128 __a, __m128 __b)
    +{
    +  return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Compares each of the corresponding 32-bit float values of the
    +///    128-bit vectors of [4 x float] to determine if the values in the 
first
    +///    operand are ordered with respect to those in the second operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
    +///   instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \param __b
    +///    A 128-bit vector of [4 x float].
    +/// \returns A 128-bit vector of [4 x float] containing the comparison 
results.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cmpord_ps(__m128 __a, __m128 __b)
    +{
    +  return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Compares two 32-bit float values in the low-order bits of both
    +///    operands to determine if the value in the first operand is unordered
    +///    with respect to the corresponding value in the second operand and
    +///    returns the result of the comparison in the low-order bits of a 
vector
    +///    of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
    +///   instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing one of the operands. The 
lower
    +///    32 bits of this operand are used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float] containing one of the operands. The 
lower
    +///    32 bits of this operand are used in the comparison.
    +/// \returns A 128-bit vector of [4 x float] containing the comparison 
results
    +///    in the low-order bits.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cmpunord_ss(__m128 __a, __m128 __b)
    +{
    +  return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Compares each of the corresponding 32-bit float values of the
    +///    128-bit vectors of [4 x float] to determine if the values in the 
first
    +///    operand are unordered with respect to those in the second operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
    +///   instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \param __b
    +///    A 128-bit vector of [4 x float].
    +/// \returns A 128-bit vector of [4 x float] containing the comparison 
results.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cmpunord_ps(__m128 __a, __m128 __b)
    +{
    +  return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
    +}
    +
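    Since cmpord yields an all-ones mask exactly for lanes where neither input is
    NaN, a common pattern (sketch only, not from the patch) is to compare a value
    with itself to filter out NaNs:

        /* Replace NaN lanes of v with 0.0f, keep the others unchanged. */
        static inline __m128 zero_nan_lanes(__m128 v)
        {
                return _mm_and_ps(v, _mm_cmpord_ps(v, v));
        }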
    +/// Compares two 32-bit float values in the low-order bits of both
    +///    operands for equality and returns the result of the comparison.
    +///
    +///    If either of the two lower 32-bit values is NaN, 0 is returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
    +///   instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand 
are
    +///    used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand 
are
    +///    used in the comparison.
    +/// \returns An integer containing the comparison results. If either of the
    +///    two lower 32-bit values is NaN, 0 is returned.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_comieq_ss(__m128 __a, __m128 __b)
    +{
    +  return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Compares two 32-bit float values in the low-order bits of both
    +///    operands to determine if the first operand is less than the second
    +///    operand and returns the result of the comparison.
    +///
    +///    If either of the two lower 32-bit values is NaN, 0 is returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
    +///   instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand 
are
    +///    used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand 
are
    +///    used in the comparison.
    +/// \returns An integer containing the comparison results. If either of 
the two
    +///     lower 32-bit values is NaN, 0 is returned.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_comilt_ss(__m128 __a, __m128 __b)
    +{
    +  return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Compares two 32-bit float values in the low-order bits of both
    +///    operands to determine if the first operand is less than or equal to 
the
    +///    second operand and returns the result of the comparison.
    +///
    +///    If either of the two lower 32-bit values is NaN, 0 is returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> 
instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand 
are
    +///    used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand 
are
    +///    used in the comparison.
    +/// \returns An integer containing the comparison results. If either of 
the two
    +///     lower 32-bit values is NaN, 0 is returned.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_comile_ss(__m128 __a, __m128 __b)
    +{
    +  return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Compares two 32-bit float values in the low-order bits of both
    +///    operands to determine if the first operand is greater than the 
second
    +///    operand and returns the result of the comparison.
    +///
    +///    If either of the two lower 32-bit values is NaN, 0 is returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> 
instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand 
are
    +///    used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand 
are
    +///    used in the comparison.
    +/// \returns An integer containing the comparison results. If either of the
    +///     two lower 32-bit values is NaN, 0 is returned.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_comigt_ss(__m128 __a, __m128 __b)
    +{
    +  return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Compares two 32-bit float values in the low-order bits of both
    +///    operands to determine if the first operand is greater than or equal 
to
    +///    the second operand and returns the result of the comparison.
    +///
    +///    If either of the two lower 32-bit values is NaN, 0 is returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> 
instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand 
are
    +///    used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand 
are
    +///    used in the comparison.
    +/// \returns An integer containing the comparison results. If either of 
the two
    +///    lower 32-bit values is NaN, 0 is returned.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_comige_ss(__m128 __a, __m128 __b)
    +{
    +  return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Compares two 32-bit float values in the low-order bits of both
    +///    operands to determine if the first operand is not equal to the 
second
    +///    operand and returns the result of the comparison.
    +///
    +///    If either of the two lower 32-bit values is NaN, 1 is returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    +///    used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    +///    used in the comparison.
    +/// \returns An integer containing the comparison results. If either of the
    +///     two lower 32-bit values is NaN, 1 is returned.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_comineq_ss(__m128 __a, __m128 __b)
    +{
    +  return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Performs an unordered comparison of two 32-bit float values using
    +///    the low-order bits of both operands to determine equality and returns
    +///    the result of the comparison.
    +///
    +///    If either of the two lower 32-bit values is NaN, 0 is returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    +///    used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    +///    used in the comparison.
    +/// \returns An integer containing the comparison results. If either of the two
    +///     lower 32-bit values is NaN, 0 is returned.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_ucomieq_ss(__m128 __a, __m128 __b)
    +{
    +  return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Performs an unordered comparison of two 32-bit float values using
    +///    the low-order bits of both operands to determine if the first operand is
    +///    less than the second operand and returns the result of the comparison.
    +///
    +///    If either of the two lower 32-bit values is NaN, 0 is returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    +///    used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    +///    used in the comparison.
    +/// \returns An integer containing the comparison results. If either of the two
    +///    lower 32-bit values is NaN, 0 is returned.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_ucomilt_ss(__m128 __a, __m128 __b)
    +{
    +  return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Performs an unordered comparison of two 32-bit float values using
    +///    the low-order bits of both operands to determine if the first operand is
    +///    less than or equal to the second operand and returns the result of the
    +///    comparison.
    +///
    +///    If either of the two lower 32-bit values is NaN, 0 is returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    +///    used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    +///    used in the comparison.
    +/// \returns An integer containing the comparison results. If either of the two
    +///     lower 32-bit values is NaN, 0 is returned.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_ucomile_ss(__m128 __a, __m128 __b)
    +{
    +  return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Performs an unordered comparison of two 32-bit float values using
    +///    the low-order bits of both operands to determine if the first operand is
    +///    greater than the second operand and returns the result of the
    +///    comparison.
    +///
    +///    If either of the two lower 32-bit values is NaN, 0 is returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    +///    used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    +///    used in the comparison.
    +/// \returns An integer containing the comparison results. If either of the two
    +///     lower 32-bit values is NaN, 0 is returned.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_ucomigt_ss(__m128 __a, __m128 __b)
    +{
    +  return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Performs an unordered comparison of two 32-bit float values using
    +///    the low-order bits of both operands to determine if the first operand is
    +///    greater than or equal to the second operand and returns the result of
    +///    the comparison.
    +///
    +///    If either of the two lower 32-bit values is NaN, 0 is returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    +///    used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    +///    used in the comparison.
    +/// \returns An integer containing the comparison results. If either of the two
    +///     lower 32-bit values is NaN, 0 is returned.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_ucomige_ss(__m128 __a, __m128 __b)
    +{
    +  return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
    +}
    +
    +/// Performs an unordered comparison of two 32-bit float values using
    +///    the low-order bits of both operands to determine inequality and returns
    +///    the result of the comparison.
    +///
    +///    If either of the two lower 32-bit values is NaN, 1 is returned.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    +///    used in the comparison.
    +/// \param __b
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    +///    used in the comparison.
    +/// \returns An integer containing the comparison results. If either of the two
    +///    lower 32-bit values is NaN, 1 is returned.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_ucomineq_ss(__m128 __a, __m128 __b)
    +{
    +  return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
    +}
    +
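    +/*
    + * Illustrative usage sketch (not from the LLVM headers this port is based on):
    + * the ordered (comi) and unordered (ucomi) scalar compares return the same
    + * 0/1 results, including the documented NaN behaviour; they differ only in
    + * whether a quiet NaN signals an invalid-operation exception.
    + *
    + *   __m128 a = _mm_set_ss(1.0f);
    + *   __m128 b = _mm_set_ss(2.0f);
    + *   int lt  = _mm_comilt_ss(a, b);    // 1: 1.0f < 2.0f
    + *   int ult = _mm_ucomilt_ss(a, b);   // 1: same result, unordered variant
    + */
    +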
    +/// Converts a float value contained in the lower 32 bits of a vector of
    +///    [4 x float] into a 32-bit integer.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
    +///   instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    +///    used in the conversion.
    +/// \returns A 32-bit integer containing the converted value.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_cvtss_si32(__m128 __a)
    +{
    +  return __builtin_ia32_cvtss2si((__v4sf)__a);
    +}
    +
    +/// Converts a float value contained in the lower 32 bits of a vector of
    +///    [4 x float] into a 32-bit integer.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
    +///   instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    +///    used in the conversion.
    +/// \returns A 32-bit integer containing the converted value.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_cvt_ss2si(__m128 __a)
    +{
    +  return _mm_cvtss_si32(__a);
    +}
    +
    +#ifdef __x86_64__
    +
    +/// Converts a float value contained in the lower 32 bits of a vector of
    +///    [4 x float] into a 64-bit integer.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
    +///   instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    +///    used in the conversion.
    +/// \returns A 64-bit integer containing the converted value.
    +static __inline__ long long __DEFAULT_FN_ATTRS
    +_mm_cvtss_si64(__m128 __a)
    +{
    +  return __builtin_ia32_cvtss2si64((__v4sf)__a);
    +}
    +
    +#endif
    +
    +/// Converts two low-order float values in a 128-bit vector of
    +///    [4 x float] into a 64-bit vector of [2 x i32].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \returns A 64-bit integer vector containing the converted values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_cvtps_pi32(__m128 __a)
    +{
    +  return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
    +}
    +
    +/// Converts two low-order float values in a 128-bit vector of
    +///    [4 x float] into a 64-bit vector of [2 x i32].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \returns A 64-bit integer vector containing the converted values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_cvt_ps2pi(__m128 __a)
    +{
    +  return _mm_cvtps_pi32(__a);
    +}
    +
    +/// Converts a float value contained in the lower 32 bits of a vector of
    +///    [4 x float] into a 32-bit integer, truncating the result when it is
    +///    inexact.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
    +///   instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    +///    used in the conversion.
    +/// \returns A 32-bit integer containing the converted value.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_cvttss_si32(__m128 __a)
    +{
    +  return __builtin_ia32_cvttss2si((__v4sf)__a);
    +}
    +
    +/// Converts a float value contained in the lower 32 bits of a vector of
    +///    [4 x float] into a 32-bit integer, truncating the result when it is
    +///    inexact.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
    +///   instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    +///    used in the conversion.
    +/// \returns A 32-bit integer containing the converted value.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_cvtt_ss2si(__m128 __a)
    +{
    +  return _mm_cvttss_si32(__a);
    +}
    +
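    +/*
    + * Illustrative usage sketch (not from the LLVM headers this port is based on):
    + * _mm_cvtss_si32() rounds according to the current MXCSR rounding mode
    + * (round-to-nearest by default), while _mm_cvttss_si32() always truncates
    + * toward zero.
    + *
    + *   __m128 v = _mm_set_ss(2.7f);
    + *   int r = _mm_cvtss_si32(v);    // 3 under the default rounding mode
    + *   int t = _mm_cvttss_si32(v);   // 2, fraction discarded
    + */
    +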
    +#ifdef __x86_64__
    +/// Converts a float value contained in the lower 32 bits of a vector of
    +///    [4 x float] into a 64-bit integer, truncating the result when it is
    +///    inexact.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
    +///   instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    +///    used in the conversion.
    +/// \returns A 64-bit integer containing the converted value.
    +static __inline__ long long __DEFAULT_FN_ATTRS
    +_mm_cvttss_si64(__m128 __a)
    +{
    +  return __builtin_ia32_cvttss2si64((__v4sf)__a);
    +}
    +#endif
    +
    +/// Converts two low-order float values in a 128-bit vector of
    +///    [4 x float] into a 64-bit vector of [2 x i32], truncating the result
    +///    when it is inexact.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTTPS2PI / CVTTPS2PI </c>
    +///   instructions.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \returns A 64-bit integer vector containing the converted values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_cvttps_pi32(__m128 __a)
    +{
    +  return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
    +}
    +
    +/// Converts two low-order float values in a 128-bit vector of [4 x
    +///    float] into a 64-bit vector of [2 x i32], truncating the result when it
    +///    is inexact.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \returns A 64-bit integer vector containing the converted values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_cvtt_ps2pi(__m128 __a)
    +{
    +  return _mm_cvttps_pi32(__a);
    +}
    +
    +/// Converts a 32-bit signed integer value into a floating point value
    +///    and writes it to the lower 32 bits of the destination. The remaining
    +///    higher order elements of the destination vector are copied from the
    +///    corresponding elements in the first operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \param __b
    +///    A 32-bit signed integer operand containing the value to be converted.
    +/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
    +///    converted value of the second operand. The upper 96 bits are copied from
    +///    the upper 96 bits of the first operand.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cvtsi32_ss(__m128 __a, int __b)
    +{
    +  __a[0] = __b;
    +  return __a;
    +}
    +
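    +/*
    + * Illustrative usage sketch (not from the LLVM headers this port is based on):
    + * only the low element is replaced by the converted integer; the upper three
    + * floats of the first operand pass through unchanged.
    + *
    + *   __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  // element 0 is 1.0f
    + *   __m128 r = _mm_cvtsi32_ss(v, 7);                // element 0 becomes 7.0f,
    + *                                                   // 2.0f, 3.0f, 4.0f kept
    + */
    +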
    +/// Converts a 32-bit signed integer value into a floating point value
    +///    and writes it to the lower 32 bits of the destination. The remaining
    +///    higher order elements of the destination are copied from the
    +///    corresponding elements in the first operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \param __b
    +///    A 32-bit signed integer operand containing the value to be converted.
    +/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
    +///    converted value of the second operand. The upper 96 bits are copied from
    +///    the upper 96 bits of the first operand.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cvt_si2ss(__m128 __a, int __b)
    +{
    +  return _mm_cvtsi32_ss(__a, __b);
    +}
    +
    +#ifdef __x86_64__
    +
    +/// Converts a 64-bit signed integer value into a floating point value
    +///    and writes it to the lower 32 bits of the destination. The remaining
    +///    higher order elements of the destination are copied from the
    +///    corresponding elements in the first operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \param __b
    +///    A 64-bit signed integer operand containing the value to be converted.
    +/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
    +///    converted value of the second operand. The upper 96 bits are copied from
    +///    the upper 96 bits of the first operand.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_cvtsi64_ss(__m128 __a, long long __b)
    +{
    +  __a[0] = __b;
    +  return __a;
    +}
    +
    +#endif
    +
    +/// Converts two elements of a 64-bit vector of [2 x i32] into two
    +///    floating point values and writes them to the lower 64-bits of the
    +///    destination. The remaining higher order elements of the destination are
    +///    copied from the corresponding elements in the first operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \param __b
    +///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
    +///    and written to the corresponding low-order elements in the destination.
    +/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
    +///    converted value of the second operand. The upper 64 bits are copied from
    +///    the upper 64 bits of the first operand.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
    +_mm_cvtpi32_ps(__m128 __a, __m64 __b)
    +{
    +  return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
    +}
    +
    +/// Converts two elements of a 64-bit vector of [2 x i32] into two
    +///    floating point values and writes them to the lower 64-bits of the
    +///    destination. The remaining higher order elements of the destination are
    +///    copied from the corresponding elements in the first operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float].
    +/// \param __b
    +///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
    +///    and written to the corresponding low-order elements in the destination.
    +/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
    +///    converted value from the second operand. The upper 64 bits are copied
    +///    from the upper 64 bits of the first operand.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
    +_mm_cvt_pi2ps(__m128 __a, __m64 __b)
    +{
    +  return _mm_cvtpi32_ps(__a, __b);
    +}
    +
    +/// Extracts a float value contained in the lower 32 bits of a vector of
    +///    [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
    +///    used in the extraction.
    +/// \returns A 32-bit float containing the extracted value.
    +static __inline__ float __DEFAULT_FN_ATTRS
    +_mm_cvtss_f32(__m128 __a)
    +{
    +  return __a[0];
    +}
    +
    +/// Loads two packed float values from the address \a __p into the
    +///     high-order bits of a 128-bit vector of [4 x float]. The low-order bits
    +///     are copied from the low-order bits of the first operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
    +///    of the destination.
    +/// \param __p
    +///    A pointer to two packed float values. Bits [63:0] are written to bits
    +///    [127:64] of the destination.
    +/// \returns A 128-bit vector of [4 x float] containing the moved values.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_loadh_pi(__m128 __a, const __m64 *__p)
    +{
    +#ifdef __GNUC__
    +  return (__m128) __builtin_ia32_loadhps ((__v4sf)__a, (const __v2sf *)__p);
    +#else
    +  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
    +  struct __mm_loadh_pi_struct {
    +    __mm_loadh_pi_v2f32 __u;
    +  } __attribute__((__packed__, __may_alias__));
    +  __mm_loadh_pi_v2f32 __b = ((struct __mm_loadh_pi_struct*)__p)->__u;
    +  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
    +  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
    +#endif
    +}
    +
    +/// Loads two packed float values from the address \a __p into the
    +///    low-order bits of a 128-bit vector of [4 x float]. The high-order bits
    +///    are copied from the high-order bits of the first operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
    +///    [127:64] of the destination.
    +/// \param __p
    +///    A pointer to two packed float values. Bits [63:0] are written to bits
    +///    [63:0] of the destination.
    +/// \returns A 128-bit vector of [4 x float] containing the moved values.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_loadl_pi(__m128 __a, const __m64 *__p)
    +{
    +#ifdef __GNUC__
    +  return (__m128) __builtin_ia32_loadlps ((__v4sf)__a, (const __v2sf *)__p);
    +#else
    +  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
    +  struct __mm_loadl_pi_struct {
    +    __mm_loadl_pi_v2f32 __u;
    +  } __attribute__((__packed__, __may_alias__));
    +  __mm_loadl_pi_v2f32 __b = ((struct __mm_loadl_pi_struct*)__p)->__u;
    +  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
    +  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
    +#endif
    +}
    +
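    +/*
    + * Illustrative usage sketch (not from the LLVM headers this port is based on):
    + * building a full [4 x float] vector from two separate 64-bit memory
    + * locations, assuming `lo` and `hi` each point to two packed floats.
    + *
    + *   __m128 v = _mm_setzero_ps();
    + *   v = _mm_loadl_pi(v, (const __m64 *)lo);   // elements 0..1 from lo
    + *   v = _mm_loadh_pi(v, (const __m64 *)hi);   // elements 2..3 from hi
    + */
    +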
    +/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
    +///    32 bits of the vector are initialized with the single-precision
    +///    floating-point value loaded from a specified memory location. The upper
    +///    96 bits are set to zero.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a 32-bit memory location containing a single-precision
    +///    floating-point value.
    +/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
    +///    lower 32 bits contain the value loaded from the memory location. The
    +///    upper 96 bits are set to zero.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_load_ss(const float *__p)
    +{
    +  struct __mm_load_ss_struct {
    +    float __u;
    +  } __attribute__((__packed__, __may_alias__));
    +  float __u = ((struct __mm_load_ss_struct*)__p)->__u;
    +  return __extension__ (__m128){ __u, 0, 0, 0 };
    +}
    +
    +/// Loads a 32-bit float value and duplicates it to all four vector
    +///    elements of a 128-bit vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
    +///    instruction.
    +///
    +/// \param __p
    +///    A pointer to a float value to be loaded and duplicated.
    +/// \returns A 128-bit vector of [4 x float] containing the loaded and
    +///    duplicated values.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_load1_ps(const float *__p)
    +{
    +  struct __mm_load1_ps_struct {
    +    float __u;
    +  } __attribute__((__packed__, __may_alias__));
    +  float __u = ((struct __mm_load1_ps_struct*)__p)->__u;
    +  return __extension__ (__m128){ __u, __u, __u, __u };
    +}
    +
    +#define        _mm_load_ps1(p) _mm_load1_ps(p)
    +
    +/// Loads a 128-bit floating-point vector of [4 x float] from an aligned
    +///    memory location.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a 128-bit memory location. The address of the memory
    +///    location has to be 128-bit aligned.
    +/// \returns A 128-bit vector of [4 x float] containing the loaded values.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_load_ps(const float *__p)
    +{
    +  return *(__m128*)__p;
    +}
    +
    +/// Loads a 128-bit floating-point vector of [4 x float] from an
    +///    unaligned memory location.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a 128-bit memory location. The address of the memory
    +///    location does not have to be aligned.
    +/// \returns A 128-bit vector of [4 x float] containing the loaded values.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_loadu_ps(const float *__p)
    +{
    +  struct __loadu_ps {
    +    __m128 __v;
    +  } __attribute__((__packed__, __may_alias__));
    +  return ((struct __loadu_ps*)__p)->__v;
    +}
    +
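    +/*
    + * Illustrative usage sketch (not from the LLVM headers this port is based on):
    + * _mm_load_ps() requires a 16-byte aligned address, _mm_loadu_ps() does not.
    + * `buf` below is a hypothetical input array.
    + *
    + *   float buf[8] __attribute__((aligned(16))) = { 0 };
    + *   __m128 a = _mm_load_ps(buf);        // aligned load of buf[0..3]
    + *   __m128 u = _mm_loadu_ps(buf + 1);   // unaligned load of buf[1..4]
    + */
    +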
    +/// Loads four packed float values, in reverse order, from an aligned
    +///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
    +///    instruction.
    +///
    +/// \param __p
    +///    A pointer to a 128-bit memory location. The address of the memory
    +///    location has to be 128-bit aligned.
    +/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
    +///    in reverse order.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_loadr_ps(const float *__p)
    +{
    +#ifdef __GNUC__
    +  return (__m128) __builtin_ia32_shufps (*(__v4sf *)__p, *(__v4sf *)__p, _MM_SHUFFLE (0,1,2,3));
    +#else
    +  __m128 __a = _mm_load_ps(__p);
    +  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
    +#endif
    +}
    +
    +/// Create a 128-bit vector of [4 x float] with undefined values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic has no corresponding instruction.
    +///
    +/// \returns A 128-bit vector of [4 x float] containing undefined values.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_undefined_ps(void)
    +{
    +#ifdef __GNUC__
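    +  /* Self-initialization is the usual trick for producing an "undefined"
    +     value without emitting any instructions; it may trigger -Winit-self
    +     or -Wuninitialized warnings on some GCC versions. */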
    +  __m128 __X = __X;
    +  return __X;
    +#else
    +  return (__m128)__builtin_ia32_undef128();
    +#endif
    +}
    +
    +/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
    +///    32 bits of the vector are initialized with the specified single-precision
    +///    floating-point value. The upper 96 bits are set to zero.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
    +///
    +/// \param __w
    +///    A single-precision floating-point value used to initialize the lower 32
    +///    bits of the result.
    +/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
    +///    lower 32 bits contain the value provided in the source operand. The
    +///    upper 96 bits are set to zero.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_set_ss(float __w)
    +{
    +  return __extension__ (__m128){ __w, 0, 0, 0 };
    +}
    +
    +/// Constructs a 128-bit floating-point vector of [4 x float], with each
    +///    of the four single-precision floating-point vector elements set to the
    +///    specified single-precision floating-point value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
    +///
    +/// \param __w
    +///    A single-precision floating-point value used to initialize each vector
    +///    element of the result.
    +/// \returns An initialized 128-bit floating-point vector of [4 x float].
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_set1_ps(float __w)
    +{
    +  return __extension__ (__m128){ __w, __w, __w, __w };
    +}
    +
    +/* Microsoft specific. */
    +/// Constructs a 128-bit floating-point vector of [4 x float], with each
    +///    of the four single-precision floating-point vector elements set to the
    +///    specified single-precision floating-point value.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
    +///
    +/// \param __w
    +///    A single-precision floating-point value used to initialize each vector
    +///    element of the result.
    +/// \returns An initialized 128-bit floating-point vector of [4 x float].
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_set_ps1(float __w)
    +{
    +    return _mm_set1_ps(__w);
    +}
    +
    +/// Constructs a 128-bit floating-point vector of [4 x float]
    +///    initialized with the specified single-precision floating-point values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a specific
    +///    instruction.
    +///
    +/// \param __z
    +///    A single-precision floating-point value used to initialize bits [127:96]
    +///    of the result.
    +/// \param __y
    +///    A single-precision floating-point value used to initialize bits [95:64]
    +///    of the result.
    +/// \param __x
    +///    A single-precision floating-point value used to initialize bits [63:32]
    +///    of the result.
    +/// \param __w
    +///    A single-precision floating-point value used to initialize bits [31:0]
    +///    of the result.
    +/// \returns An initialized 128-bit floating-point vector of [4 x float].
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_set_ps(float __z, float __y, float __x, float __w)
    +{
    +  return __extension__ (__m128){ __w, __x, __y, __z };
    +}
    +
    +/// Constructs a 128-bit floating-point vector of [4 x float],
    +///    initialized in reverse order with the specified 32-bit single-precision
    +///    floating-point values.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic is a utility function and does not correspond to a specific
    +///    instruction.
    +///
    +/// \param __z
    +///    A single-precision floating-point value used to initialize bits [31:0]
    +///    of the result.
    +/// \param __y
    +///    A single-precision floating-point value used to initialize bits [63:32]
    +///    of the result.
    +/// \param __x
    +///    A single-precision floating-point value used to initialize bits [95:64]
    +///    of the result.
    +/// \param __w
    +///    A single-precision floating-point value used to initialize bits [127:96]
    +///    of the result.
    +/// \returns An initialized 128-bit floating-point vector of [4 x float].
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_setr_ps(float __z, float __y, float __x, float __w)
    +{
    +  return __extension__ (__m128){ __z, __y, __x, __w };
    +}
    +
    +/// Constructs a 128-bit floating-point vector of [4 x float] initialized
    +///    to zero.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
    +///
    +/// \returns An initialized 128-bit floating-point vector of [4 x float] with
    +///    all elements set to zero.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_setzero_ps(void)
    +{
    +  return __extension__ (__m128){ 0, 0, 0, 0 };
    +}
    +
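    +/*
    + * Illustrative usage sketch (not from the LLVM headers this port is based on):
    + * _mm_set_ps() takes its arguments from the highest element down, while
    + * _mm_setr_ps() takes them in memory order, so the two calls below build the
    + * same vector.
    + *
    + *   __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);    // element 0 is 1.0f
    + *   __m128 b = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);   // element 0 is 1.0f
    + *   __m128 z = _mm_setzero_ps();                      // all elements 0.0f
    + */
    +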
    +/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
    +///    memory location.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a 64-bit memory location.
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing the values to be stored.
    +static __inline__ void __DEFAULT_FN_ATTRS
    +_mm_storeh_pi(__m64 *__p, __m128 __a)
    +{
    +#ifdef __GNUC__
    +  __builtin_ia32_storehps((__v2sf *)__p, (__v4sf)__a);
    +#else
    +  __builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a);
    +#endif
    +}
    +
    +/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
    +///     memory location.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a memory location that will receive the float values.
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing the values to be stored.
    +static __inline__ void __DEFAULT_FN_ATTRS
    +_mm_storel_pi(__m64 *__p, __m128 __a)
    +{
    +#ifdef __GNUC__
    +  __builtin_ia32_storelps ((__v2sf *)__p, (__v4sf)__a);
    +#else
    +  __builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a);
    +#endif
    +}
    +
    +/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
    +///     memory location.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a 32-bit memory location.
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing the value to be stored.
    +static __inline__ void __DEFAULT_FN_ATTRS
    +_mm_store_ss(float *__p, __m128 __a)
    +{
    +  struct __mm_store_ss_struct {
    +    float __u;
    +  } __attribute__((__packed__, __may_alias__));
    +  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
    +}
    +
    +/// Stores a 128-bit vector of [4 x float] to an unaligned memory
    +///    location.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a 128-bit memory location. The address of the memory
    +///    location does not have to be aligned.
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing the values to be stored.
    +static __inline__ void __DEFAULT_FN_ATTRS
    +_mm_storeu_ps(float *__p, __m128 __a)
    +{
    +  struct __storeu_ps {
    +    __m128 __v;
    +  } __attribute__((__packed__, __may_alias__));
    +  ((struct __storeu_ps*)__p)->__v = __a;
    +}
    +
    +/// Stores a 128-bit vector of [4 x float] into an aligned memory
    +///    location.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a 128-bit memory location. The address of the memory
    +///    location has to be 16-byte aligned.
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing the values to be stored.
    +static __inline__ void __DEFAULT_FN_ATTRS
    +_mm_store_ps(float *__p, __m128 __a)
    +{
    +  *(__m128*)__p = __a;
    +}
    +
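    +/*
    + * Illustrative usage sketch (not from the LLVM headers this port is based on):
    + * _mm_store_ps() needs a 16-byte aligned destination, while _mm_storeu_ps()
    + * and _mm_store_ss() do not. `v` is a placeholder __m128 value.
    + *
    + *   float out[4] __attribute__((aligned(16)));
    + *   _mm_store_ss(out, v);    // writes only out[0]
    + *   _mm_store_ps(out, v);    // writes out[0..3]; out must stay 16-byte aligned
    + */
    +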
    +/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
    +///    four contiguous elements in an aligned memory location.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
    +///    instruction.
    +///
    +/// \param __p
    +///    A pointer to a 128-bit memory location.
    +/// \param __a
    +///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
    +///    of the four contiguous elements pointed by \a __p.
    +static __inline__ void __DEFAULT_FN_ATTRS
    +_mm_store1_ps(float *__p, __m128 __a)
    +{
    +#ifdef __GNUC__
    +  __a = (__m128)__builtin_ia32_shufps((__v4sf)__a, (__v4sf)__a, _MM_SHUFFLE (0,0,0,0));
    +#else
    +  __a = (__m128)__builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
    +#endif
    +  _mm_store_ps(__p, __a);
    +}
    +
    +/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
    +///    four contiguous elements in an aligned memory location.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
    +///    instruction.
    +///
    +/// \param __p
    +///    A pointer to a 128-bit memory location.
    +/// \param __a
    +///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
    +///    of the four contiguous elements pointed by \a __p.
    +static __inline__ void __DEFAULT_FN_ATTRS
    +_mm_store_ps1(float *__p, __m128 __a)
    +{
    +  _mm_store1_ps(__p, __a);
    +}
    +
    +/// Stores float values from a 128-bit vector of [4 x float] to an
    +///    aligned memory location in reverse order.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
    +///    instruction.
    +///
    +/// \param __p
    +///    A pointer to a 128-bit memory location. The address of the memory
    +///    location has to be 128-bit aligned.
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing the values to be stored.
    +static __inline__ void __DEFAULT_FN_ATTRS
    +_mm_storer_ps(float *__p, __m128 __a)
    +{
    +#ifdef __GNUC__
    +  __a = __builtin_ia32_shufps ((__v4sf)__a, (__v4sf)__a, _MM_SHUFFLE (0,1,2,3));
    +#else
    +  __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
    +#endif
    +  _mm_store_ps(__p, __a);
    +}
    +
    +#define _MM_HINT_ET0 7
    +#define _MM_HINT_ET1 6
    +#define _MM_HINT_T0  3
    +#define _MM_HINT_T1  2
    +#define _MM_HINT_T2  1
    +#define _MM_HINT_NTA 0
    +
    +#ifndef _MSC_VER
    +/* FIXME: We have to #define this because "sel" must be a constant integer, and
    +   Sema doesn't do any form of constant propagation yet. */
    +
    +/// Loads one cache line of data from the specified address to a location
    +///    closer to the processor.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// void _mm_prefetch(const void * a, const int sel);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
    +///
    +/// \param a
    +///    A pointer to a memory location containing a cache line of data.
    +/// \param sel
    +///    A predefined integer constant specifying the type of prefetch
    +///    operation: \n
    +///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
    +///    PREFETCHNTA instruction will be generated. \n
    +///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
    +///    be generated. \n
    +///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
    +///    be generated. \n
    +///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
    +///    be generated.
    +#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), \
    +                                                 ((sel) >> 2) & 1, (sel) & 0x3))
    +#endif
    +
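    +/*
    + * Illustrative usage sketch (not from the LLVM headers this port is based on):
    + * prefetching a hypothetical array `data` a few elements ahead of its use;
    + * `process`, `data` and `n` are placeholders.
    + *
    + *   for (size_t i = 0; i < n; i++) {
    + *     _mm_prefetch((const char *)&data[i + 16], _MM_HINT_T0);
    + *     process(data[i]);
    + *   }
    + */
    +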
    +/// Stores a 64-bit integer in the specified aligned memory location. To
    +///    minimize caching, the data is flagged as non-temporal (unlikely to be
    +///    used again soon).
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to an aligned memory location used to store the register value.
    +/// \param __a
    +///    A 64-bit integer containing the value to be stored.
    +static __inline__ void __DEFAULT_FN_ATTRS_MMX
    +_mm_stream_pi(__m64 *__p, __m64 __a)
    +{
    +#ifdef __GNUC__
    +  __builtin_ia32_movntq ((unsigned long long *)__p, (unsigned long long)__a);
    +#else
    +  __builtin_ia32_movntq(__p, __a);
    +#endif
    +}
    +
    +/// Moves packed float values from a 128-bit vector of [4 x float] to a
    +///    128-bit aligned memory location. To minimize caching, the data is flagged
    +///    as non-temporal (unlikely to be used again soon).
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
    +///
    +/// \param __p
    +///    A pointer to a 128-bit aligned memory location that will receive the
    +///    single-precision floating-point values.
    +/// \param __a
    +///    A 128-bit vector of [4 x float] containing the values to be moved.
    +static __inline__ void __DEFAULT_FN_ATTRS
    +_mm_stream_ps(float *__p, __m128 __a)
    +{
    +#ifdef __GNUC__
    +  __builtin_ia32_movntps (__p, (__v4sf)__a);
    +#else
    +  __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
    +#endif
    +}
    +
    +#if defined(__cplusplus)
    +extern "C" {
    +#endif
    +
    +/// Forces strong memory ordering (serialization) between store
    +///    instructions preceding this instruction and store instructions following
    +///    this instruction, ensuring the system completes all previous stores
    +///    before executing subsequent stores.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
    +///
    +void _mm_sfence(void);
    +
    +#if defined(__cplusplus)
    +} // extern "C"
    +#endif
    +
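    +/*
    + * Illustrative usage sketch (not from the LLVM headers this port is based on):
    + * non-temporal stores bypass the cache, so an _mm_sfence() is typically issued
    + * after the streaming loop to make the stores globally visible before any
    + * later synchronisation. `dst` and `n` are placeholders; `dst` must be 16-byte
    + * aligned and `n` a multiple of 4.
    + *
    + *   for (size_t i = 0; i < n; i += 4)
    + *     _mm_stream_ps(dst + i, _mm_set1_ps(0.0f));
    + *   _mm_sfence();
    + */
    +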
    +/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
    +///    returns it, as specified by the immediate integer operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// int _mm_extract_pi16(__m64 a, int n);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
    +///
    +/// \param a
    +///    A 64-bit vector of [4 x i16].
    +/// \param n
    +///    An immediate integer operand that determines which bits are extracted: \n
    +///    0: Bits [15:0] are copied to the destination. \n
    +///    1: Bits [31:16] are copied to the destination. \n
    +///    2: Bits [47:32] are copied to the destination. \n
    +///    3: Bits [63:48] are copied to the destination.
    +/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
    +#define _mm_extract_pi16(a, n) \
    +  (int)__builtin_ia32_vec_ext_v4hi((__m64)a, (int)n)
    +
    +/// Copies data from the 64-bit vector of [4 x i16] to the destination,
    +///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
    +///    specified by the immediate operand \a n.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
    +///
    +/// \param a
    +///    A 64-bit vector of [4 x i16].
    +/// \param d
    +///    An integer. The lower 16-bit value from this operand is written to the
    +///    destination at the offset specified by operand \a n.
    +/// \param n
    +///    An immediate integer operand that determines which bits are used
    +///    in the destination. \n
    +///    0: Bits [15:0] are copied to the destination. \n
    +///    1: Bits [31:16] are copied to the destination. \n
    +///    2: Bits [47:32] are copied to the destination. \n
    +///    3: Bits [63:48] are copied to the destination.  \n
    +///    The remaining bits in the destination are copied from the corresponding
    +///    bits in operand \a a.
    +/// \returns A 64-bit integer vector containing the copied packed data from the
    +///    operands.
    +#define _mm_insert_pi16(a, d, n) \
    +  (__m64)__builtin_ia32_vec_set_v4hi((__m64)a, (int)d, (int)n)
    +
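    +/*
    + * Illustrative usage sketch (not from the LLVM headers this port is based on):
    + * reading and replacing one 16-bit lane of an MMX vector. Note that the lane
    + * selector must be an integer constant.
    + *
    + *   __m64 v = _mm_setr_pi16(10, 20, 30, 40);
    + *   int e   = _mm_extract_pi16(v, 2);        // 30
    + *   v       = _mm_insert_pi16(v, 99, 2);     // lanes are now 10, 20, 99, 40
    + */
    +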
    +/// Compares each of the corresponding packed 16-bit integer values of
    +///    the 64-bit integer vectors, and writes the greater value to the
    +///    corresponding bits in the destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
    +///
    +/// \param __a
    +///    A 64-bit integer vector containing one of the source operands.
    +/// \param __b
    +///    A 64-bit integer vector containing one of the source operands.
    +/// \returns A 64-bit integer vector containing the comparison results.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_max_pi16(__m64 __a, __m64 __b)
    +{
    +  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
    +}
    +
    +/// Compares each of the corresponding packed 8-bit unsigned integer
    +///    values of the 64-bit integer vectors, and writes the greater value to the
    +///    corresponding bits in the destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
    +///
    +/// \param __a
    +///    A 64-bit integer vector containing one of the source operands.
    +/// \param __b
    +///    A 64-bit integer vector containing one of the source operands.
    +/// \returns A 64-bit integer vector containing the comparison results.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_max_pu8(__m64 __a, __m64 __b)
    +{
    +  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
    +}
    +
    +/// Compares each of the corresponding packed 16-bit integer values of
    +///    the 64-bit integer vectors, and writes the lesser value to the
    +///    corresponding bits in the destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
    +///
    +/// \param __a
    +///    A 64-bit integer vector containing one of the source operands.
    +/// \param __b
    +///    A 64-bit integer vector containing one of the source operands.
    +/// \returns A 64-bit integer vector containing the comparison results.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_min_pi16(__m64 __a, __m64 __b)
    +{
    +  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
    +}
    +
    +/// Compares each of the corresponding packed 8-bit unsigned integer
    +///    values of the 64-bit integer vectors, and writes the lesser value to the
    +///    corresponding bits in the destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
    +///
    +/// \param __a
    +///    A 64-bit integer vector containing one of the source operands.
    +/// \param __b
    +///    A 64-bit integer vector containing one of the source operands.
    +/// \returns A 64-bit integer vector containing the comparison results.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_min_pu8(__m64 __a, __m64 __b)
    +{
    +  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
    +}
    +
    +/// Takes the most significant bit from each 8-bit element in a 64-bit
    +///    integer vector to create an 8-bit mask value. Zero-extends the value to
    +///    32-bit integer and writes it to the destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
    +///
    +/// \param __a
    +///    A 64-bit integer vector containing the values with bits to be extracted.
    +/// \returns The most significant bit from each 8-bit element in \a __a,
    +///    written to bits [7:0].
    +static __inline__ int __DEFAULT_FN_ATTRS_MMX
    +_mm_movemask_pi8(__m64 __a)
    +{
    +  return __builtin_ia32_pmovmskb((__v8qi)__a);
    +}
    +
    +/// Multiplies packed 16-bit unsigned integer values and writes the
    +///    high-order 16 bits of each 32-bit product to the corresponding bits in
    +///    the destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
    +///
    +/// \param __a
    +///    A 64-bit integer vector containing one of the source operands.
    +/// \param __b
    +///    A 64-bit integer vector containing one of the source operands.
    +/// \returns A 64-bit integer vector containing the products of both operands.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_mulhi_pu16(__m64 __a, __m64 __b)
    +{
    +  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
    +}
    +
    +/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
    +///    destination, as specified by the immediate value operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
    +///
    +/// \param a
    +///    A 64-bit integer vector containing the values to be shuffled.
    +/// \param n
    +///    An immediate value containing an 8-bit value specifying which elements to
    +///    copy from \a a. The destinations within the 64-bit destination are
    +///    assigned values as follows: \n
    +///    Bits [1:0] are used to assign values to bits [15:0] in the
    +///    destination. \n
    +///    Bits [3:2] are used to assign values to bits [31:16] in the
    +///    destination. \n
    +///    Bits [5:4] are used to assign values to bits [47:32] in the
    +///    destination. \n
    +///    Bits [7:6] are used to assign values to bits [63:48] in the
    +///    destination. \n
    +///    Bit value assignments: \n
    +///    00: assigned from bits [15:0] of \a a. \n
    +///    01: assigned from bits [31:16] of \a a. \n
    +///    10: assigned from bits [47:32] of \a a. \n
    +///    11: assigned from bits [63:48] of \a a.
    +/// \returns A 64-bit integer vector containing the shuffled values.
    +#define _mm_shuffle_pi16(a, n) \
    +  (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n))
    +
    +/// Conditionally copies the values from each 8-bit element in the first
    +///    64-bit integer vector operand to the specified memory location, as
    +///    specified by the most significant bit in the corresponding element in the
    +///    second 64-bit integer vector operand.
    +///
    +///    To minimize caching, the data is flagged as non-temporal
    +///    (unlikely to be used again soon).
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
    +///
    +/// \param __d
    +///    A 64-bit integer vector containing the values with elements to be copied.
    +/// \param __n
    +///    A 64-bit integer vector operand. The most significant bit from each 8-bit
    +///    element determines whether the corresponding element in operand \a __d
    +///    is copied. If the most significant bit of a given element is 1, the
    +///    corresponding element in operand \a __d is copied.
    +/// \param __p
    +///    A pointer to a 64-bit memory location that will receive the conditionally
    +///    copied integer values. The address of the memory location does not have
    +///    to be aligned.
    +static __inline__ void __DEFAULT_FN_ATTRS_MMX
    +_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
    +{
    +  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
    +}
    +
    +/// Computes the rounded averages of the packed unsigned 8-bit integer
    +///    values and writes the averages to the corresponding bits in the
    +///    destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
    +///
    +/// \param __a
    +///    A 64-bit integer vector containing one of the source operands.
    +/// \param __b
    +///    A 64-bit integer vector containing one of the source operands.
    +/// \returns A 64-bit integer vector containing the averages of both operands.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_avg_pu8(__m64 __a, __m64 __b)
    +{
    +  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
    +}
    +
    +/// Computes the rounded averages of the packed unsigned 16-bit integer
    +///    values and writes the averages to the corresponding bits in the
    +///    destination.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
    +///
    +/// \param __a
    +///    A 64-bit integer vector containing one of the source operands.
    +/// \param __b
    +///    A 64-bit integer vector containing one of the source operands.
    +/// \returns A 64-bit integer vector containing the averages of both operands.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_avg_pu16(__m64 __a, __m64 __b)
    +{
    +  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
    +}
    +
    +/// Subtracts the corresponding 8-bit unsigned integer values of the two
    +///    64-bit vector operands and computes the absolute value of each
    +///    difference. The sum of the 8 absolute differences is then written to
    +///    bits [15:0] of the destination; the remaining bits [63:16] are cleared.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
    +///
    +/// \param __a
    +///    A 64-bit integer vector containing one of the source operands.
    +/// \param __b
    +///    A 64-bit integer vector containing one of the source operands.
    +/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
    +///    sets of absolute differences between both operands. The upper bits are
    +///    cleared.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_sad_pu8(__m64 __a, __m64 __b)
    +{
    +  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
    +}
    +
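    +/*
    + * Illustrative usage sketch (not from the LLVM headers this port is based on):
    + * the sum of absolute differences is a common building block for
    + * block-matching metrics; the low 16 bits of the result hold the sum for the
    + * 8 byte pairs. `a8` and `b8` are placeholder __m64 values of 8 unsigned bytes.
    + *
    + *   __m64 sad = _mm_sad_pu8(a8, b8);
    + *   int cost  = _mm_cvtsi64_si32(sad) & 0xffff;
    + */
    +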
    +#if defined(__cplusplus)
    +extern "C" {
    +#endif
    +
    +/// Returns the contents of the MXCSR register as a 32-bit unsigned
    +///    integer value.
    +///
    +///    There are several groups of macros associated with this
    +///    intrinsic, including:
    +///    <ul>
    +///    <li>
    +///      For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
    +///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
    +///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
    +///      _MM_GET_EXCEPTION_STATE().
    +///    </li>
    +///    <li>
    +///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
    +///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
    +///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
    +///    </li>
    +///    <li>
    +///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
    +///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
    +///      _MM_GET_ROUNDING_MODE().
    +///    </li>
    +///    <li>
    +///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
    +///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
    +///    </li>
    +///    <li>
    +///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
    +///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
    +///      _MM_GET_DENORMALS_ZERO_MODE().
    +///    </li>
    +///    </ul>
    +///
    +///    For example, the following expression checks if an overflow exception has
    +///    occurred:
    +///    \code
    +///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
    +///    \endcode
    +///
    +///    The following expression gets the current rounding mode:
    +///    \code
    +///      _MM_GET_ROUNDING_MODE()
    +///    \endcode
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
    +///
    +/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
    +///    register.
    +unsigned int _mm_getcsr(void);
    +
    +/// Sets the MXCSR register with the 32-bit unsigned integer value.
    +///
    +///    There are several groups of macros associated with this intrinsic,
    +///    including:
    +///    <ul>
    +///    <li>
    +///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
    +///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
    +///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
    +///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
    +///    </li>
    +///    <li>
    +///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
    +///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
    +///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
    +///      of these macros.
    +///    </li>
    +///    <li>
    +///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
    +///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
    +///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
    +///    </li>
    +///    <li>
    +///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
    +///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
    +///      one of these macros.
    +///    </li>
    +///    <li>
    +///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
    +///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
    +///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
    +///    </li>
    +///    </ul>
    +///
    +///    For example, the following expression causes subsequent floating-point
    +///    operations to round up:
    +///    \code
    +///      _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
    +///    \endcode
    +///
    +///    The following example sets the DAZ and FTZ flags:
    +///    \code
    +///    void setFlags() {
    +///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
    +///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
    +///    }
    +///    \endcode
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
    +///
    +/// \param __i
    +///    A 32-bit unsigned integer value to be written to the MXCSR register.
    +void _mm_setcsr(unsigned int __i);
    +
    +#if defined(__cplusplus)
    +} // extern "C"
    +#endif
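    +
    +/* Editor's note: illustrative sketch only, not part of the upstream LLVM
    + * header. The helper name __uk_example_clear_fp_exceptions() is hypothetical.
    + * It shows the intended read-modify-write usage of _mm_getcsr()/_mm_setcsr();
    + * 0x003f is the sticky exception-flag field of MXCSR (the value of
    + * _MM_EXCEPT_MASK defined further down in this header). */
    +static __inline__ void __DEFAULT_FN_ATTRS
    +__uk_example_clear_fp_exceptions(void)
    +{
    +  /* Read MXCSR, clear the six exception flags, and write it back. */
    +  _mm_setcsr(_mm_getcsr() & ~0x003fu);
    +}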
    +
    +/// Selects 4 float values from the 128-bit operands of [4 x float], as
    +///    specified by the immediate value operand.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// \code
    +/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
    +/// \endcode
    +///
    +/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
    +///
    +/// \param a
    +///    A 128-bit vector of [4 x float].
    +/// \param b
    +///    A 128-bit vector of [4 x float].
    +/// \param mask
    +///    An immediate value containing an 8-bit value specifying which elements to
    +///    copy from \a a and \a b. \n
    +///    Bits [3:0] specify the values copied from operand \a a. \n
    +///    Bits [7:4] specify the values copied from operand \a b. \n
    +///    The elements of the 128-bit destination are assigned values as
    +///    follows: \n
    +///    Bits [1:0] are used to assign values to bits [31:0] in the
    +///    destination. \n
    +///    Bits [3:2] are used to assign values to bits [63:32] in the
    +///    destination. \n
    +///    Bits [5:4] are used to assign values to bits [95:64] in the
    +///    destination. \n
    +///    Bits [7:6] are used to assign values to bits [127:96] in the
    +///    destination. \n
    +///    Bit value assignments: \n
    +///    00: Bits [31:0] copied from the specified operand. \n
    +///    01: Bits [63:32] copied from the specified operand. \n
    +///    10: Bits [95:64] copied from the specified operand. \n
    +///    11: Bits [127:96] copied from the specified operand.
    +/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
    +#define _mm_shuffle_ps(a, b, mask) \
    +  (__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
    +                                (int)(mask))
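    +
    +/* Editor's note: illustrative sketch only, not part of the upstream LLVM
    + * header. The helper name __uk_example_reverse_ps() is hypothetical and it
    + * assumes the _MM_SHUFFLE() helper macro defined earlier in this header.
    + * _MM_SHUFFLE(0, 1, 2, 3) selects a[3], a[2], b[1], b[0]; with both operands
    + * equal this reverses the element order. */
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +__uk_example_reverse_ps(__m128 __v)
    +{
    +  /* {v0, v1, v2, v3} -> {v3, v2, v1, v0} */
    +  return _mm_shuffle_ps(__v, __v, _MM_SHUFFLE(0, 1, 2, 3));
    +}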
    +
    +/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
    +///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. \n
    +///    Bits [95:64] are written to bits [31:0] of the destination. \n
    +///    Bits [127:96] are written to bits [95:64] of the destination.
    +/// \param __b
    +///    A 128-bit vector of [4 x float]. \n
    +///    Bits [95:64] are written to bits [63:32] of the destination. \n
    +///    Bits [127:96] are written to bits [127:96] of the destination.
    +/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_unpackhi_ps(__m128 __a, __m128 __b)
    +{
    +#ifdef __GNUC__
    +  return (__m128) __builtin_ia32_unpckhps ((__v4sf)__a, (__v4sf)__b);
    +#else
    +  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
    +#endif
    +}
    +
    +/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
    +///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit vector of [4 x float]. \n
    +///    Bits [31:0] are written to bits [31:0] of the destination.  \n
    +///    Bits [63:32] are written to bits [95:64] of the destination.
    +/// \param __b
    +///    A 128-bit vector of [4 x float]. \n
    +///    Bits [31:0] are written to bits [63:32] of the destination. \n
    +///    Bits [63:32] are written to bits [127:96] of the destination.
    +/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_unpacklo_ps(__m128 __a, __m128 __b)
    +{
    +#ifdef __GNUC__
    +  return (__m128) __builtin_ia32_unpcklps ((__v4sf)__a, (__v4sf)__b);
    +#else
    +  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
    +#endif
    +}
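    +
    +/* Editor's note: illustrative sketch only, not part of the upstream LLVM
    + * header. The helper name __uk_example_zip_ps() is hypothetical. It shows the
    + * usual pairing of the two unpack intrinsics to interleave two vectors
    + * {a0,a1,a2,a3} and {b0,b1,b2,b3}. */
    +static __inline__ void __DEFAULT_FN_ATTRS
    +__uk_example_zip_ps(__m128 __a, __m128 __b, __m128 *__lo, __m128 *__hi)
    +{
    +  *__lo = _mm_unpacklo_ps(__a, __b); /* {a0, b0, a1, b1} */
    +  *__hi = _mm_unpackhi_ps(__a, __b); /* {a2, b2, a3, b3} */
    +}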
    +
    +/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
    +///    32 bits are set to the lower 32 bits of the second parameter. The upper
    +///    96 bits are set to the upper 96 bits of the first parameter.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
    +///    instruction.
    +///
    +/// \param __a
    +///    A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
    +///    written to the upper 96 bits of the result.
    +/// \param __b
    +///    A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
    +///    written to the lower 32 bits of the result.
    +/// \returns A 128-bit floating-point vector of [4 x float].
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_move_ss(__m128 __a, __m128 __b)
    +{
    +  __a[0] = __b[0];
    +  return __a;
    +}
    +
    +/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
    +///    64 bits are set to the upper 64 bits of the second parameter. The upper
    +///    64 bits are set to the upper 64 bits of the first parameter.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
    +///    written to the upper 64 bits of the result.
    +/// \param __b
    +///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
    +///    written to the lower 64 bits of the result.
    +/// \returns A 128-bit floating-point vector of [4 x float].
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_movehl_ps(__m128 __a, __m128 __b)
    +{
    +#ifdef __GNUC__
    +  return (__m128) __builtin_ia32_movhlps ((__v4sf)__a, (__v4sf)__b);
    +#else
    +  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
    +#endif
    +}
    +
    +/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
    +///    64 bits are set to the lower 64 bits of the first parameter. The upper
    +///    64 bits are set to the lower 64 bits of the second parameter.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
    +///    written to the lower 64 bits of the result.
    +/// \param __b
    +///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
    +///    written to the upper 64 bits of the result.
    +/// \returns A 128-bit floating-point vector of [4 x float].
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +_mm_movelh_ps(__m128 __a, __m128 __b)
    +{
    +#ifdef __GNUC__
    +  return (__m128) __builtin_ia32_movlhps ((__v4sf)__a, (__v4sf)__b);
    +#else
    +  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
    +#endif
    +}
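    +
    +/* Editor's note: illustrative sketch only, not part of the upstream LLVM
    + * header. The helper name __uk_example_swap_halves_ps() is hypothetical. It
    + * combines the two move intrinsics to rotate a vector by 64 bits. */
    +static __inline__ __m128 __DEFAULT_FN_ATTRS
    +__uk_example_swap_halves_ps(__m128 __v)
    +{
    +  __m128 __hi = _mm_movehl_ps(__v, __v); /* {v2, v3, v2, v3} */
    +  return _mm_movelh_ps(__hi, __v);       /* {v2, v3, v0, v1} */
    +}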
    +
    +/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
    +///    float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
    +///
    +/// \param __a
    +///    A 64-bit vector of [4 x i16]. The elements of the destination are copied
    +///    from the corresponding elements in this operand.
    +/// \returns A 128-bit vector of [4 x float] containing the copied and converted
    +///    values from the operand.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
    +_mm_cvtpi16_ps(__m64 __a)
    +{
    +  __m64 __b, __c;
    +  __m128 __r;
    +
    +  __b = _mm_setzero_si64();
    +  __b = _mm_cmpgt_pi16(__b, __a);
    +  __c = _mm_unpackhi_pi16(__a, __b);
    +  __r = _mm_setzero_ps();
    +  __r = _mm_cvtpi32_ps(__r, __c);
    +  __r = _mm_movelh_ps(__r, __r);
    +  __c = _mm_unpacklo_pi16(__a, __b);
    +  __r = _mm_cvtpi32_ps(__r, __c);
    +
    +  return __r;
    +}
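    +
    +/* Editor's note: illustrative sketch only, not part of the upstream LLVM
    + * header. The helper name __uk_example_i16_to_ps() is hypothetical and it
    + * assumes the _mm_set_pi16() and _mm_empty() helpers from mmintrin.h, which
    + * this header already relies on. */
    +static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
    +__uk_example_i16_to_ps(void)
    +{
    +  __m64 __v = _mm_set_pi16(4, 3, 2, 1);  /* elements {1, 2, 3, 4}, low first */
    +  __m128 __r = _mm_cvtpi16_ps(__v);      /* {1.0f, 2.0f, 3.0f, 4.0f} */
    +
    +  _mm_empty();                           /* leave MMX state before x87 code */
    +  return __r;
    +}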
    +
    +/// Converts a 64-bit vector of 16-bit unsigned integer values into a
    +///    128-bit vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
    +///
    +/// \param __a
    +///    A 64-bit vector of 16-bit unsigned integer values. The elements of the
    +///    destination are copied from the corresponding elements in this operand.
    +/// \returns A 128-bit vector of [4 x float] containing the copied and converted
    +///    values from the operand.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
    +_mm_cvtpu16_ps(__m64 __a)
    +{
    +  __m64 __b, __c;
    +  __m128 __r;
    +
    +  __b = _mm_setzero_si64();
    +  __c = _mm_unpackhi_pi16(__a, __b);
    +  __r = _mm_setzero_ps();
    +  __r = _mm_cvtpi32_ps(__r, __c);
    +  __r = _mm_movelh_ps(__r, __r);
    +  __c = _mm_unpacklo_pi16(__a, __b);
    +  __r = _mm_cvtpi32_ps(__r, __c);
    +
    +  return __r;
    +}
    +
    +/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
    +///    into a 128-bit vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
    +///
    +/// \param __a
    +///    A 64-bit vector of [8 x i8]. The elements of the destination are copied
    +///    from the corresponding lower 4 elements in this operand.
    +/// \returns A 128-bit vector of [4 x float] containing the copied and converted
    +///    values from the operand.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
    +_mm_cvtpi8_ps(__m64 __a)
    +{
    +  __m64 __b;
    +
    +  __b = _mm_setzero_si64();
    +  __b = _mm_cmpgt_pi8(__b, __a);
    +  __b = _mm_unpacklo_pi8(__a, __b);
    +
    +  return _mm_cvtpi16_ps(__b);
    +}
    +
    +/// Converts the lower four unsigned 8-bit integer values from a 64-bit
    +///    vector of [8 x u8] into a 128-bit vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
    +///
    +/// \param __a
    +///    A 64-bit vector of unsigned 8-bit integer values. The elements of the
    +///    destination are copied from the corresponding lower 4 elements in this
    +///    operand.
    +/// \returns A 128-bit vector of [4 x float] containing the copied and converted
    +///    values from the source operand.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
    +_mm_cvtpu8_ps(__m64 __a)
    +{
    +  __m64 __b;
    +
    +  __b = _mm_setzero_si64();
    +  __b = _mm_unpacklo_pi8(__a, __b);
    +
    +  return _mm_cvtpi16_ps(__b);
    +}
    +
    +/// Converts the two 32-bit signed integer values from each 64-bit vector
    +///    operand of [2 x i32] into a 128-bit vector of [4 x float].
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
    +///
    +/// \param __a
    +///    A 64-bit vector of [2 x i32]. The lower elements of the destination are
    +///    copied from the elements in this operand.
    +/// \param __b
    +///    A 64-bit vector of [2 x i32]. The upper elements of the destination are
    +///    copied from the elements in this operand.
    +/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
    +///    copied and converted values from the first operand. The upper 64 bits
    +///    contain the copied and converted values from the second operand.
    +static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
    +_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
    +{
    +  __m128 __c;
    +
    +  __c = _mm_setzero_ps();
    +  __c = _mm_cvtpi32_ps(__c, __b);
    +  __c = _mm_movelh_ps(__c, __c);
    +
    +  return _mm_cvtpi32_ps(__c, __a);
    +}
    +
    +/// Converts each single-precision floating-point element of a 128-bit
    +///    floating-point vector of [4 x float] into a 16-bit signed integer, and
    +///    packs the results into a 64-bit integer vector of [4 x i16].
    +///
    +///    If the floating-point element is NaN or infinity, or if the
    +///    floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
    +///    it is converted to 0x8000. Otherwise if the floating-point element is
    +///    greater than 0x7FFF, it is converted to 0x7FFF.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit floating-point vector of [4 x float].
    +/// \returns A 64-bit integer vector of [4 x i16] containing the converted
    +///    values.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_cvtps_pi16(__m128 __a)
    +{
    +  __m64 __b, __c;
    +
    +  __b = _mm_cvtps_pi32(__a);
    +  __a = _mm_movehl_ps(__a, __a);
    +  __c = _mm_cvtps_pi32(__a);
    +
    +  return _mm_packs_pi32(__b, __c);
    +}
    +
    +/// Converts each single-precision floating-point element of a 128-bit
    +///    floating-point vector of [4 x float] into an 8-bit signed integer, and
    +///    packs the results into the lower 32 bits of a 64-bit integer vector of
    +///    [8 x i8]. The upper 32 bits of the vector are set to 0.
    +///
    +///    If the floating-point element is NaN or infinity, or if the
    +///    floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
    +///    is converted to 0x80. Otherwise if the floating-point element is greater
    +///    than 0x7F, it is converted to 0x7F.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit floating-point vector of [4 x float].
    +/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
    +///    converted values and the upper 32 bits are set to zero.
    +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
    +_mm_cvtps_pi8(__m128 __a)
    +{
    +  __m64 __b, __c;
    +
    +  __b = _mm_cvtps_pi16(__a);
    +  __c = _mm_setzero_si64();
    +
    +  return _mm_packs_pi16(__b, __c);
    +}
    +
    +/// Extracts the sign bits from each single-precision floating-point
    +///    element of a 128-bit floating-point vector of [4 x float] and returns the
    +///    sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
    +///    to zero.
    +///
    +/// \headerfile <x86intrin.h>
    +///
    +/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
    +///
    +/// \param __a
    +///    A 128-bit floating-point vector of [4 x float].
    +/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
    +///    single-precision floating-point element of the parameter. Bits [31:4] are
    +///    set to zero.
    +static __inline__ int __DEFAULT_FN_ATTRS
    +_mm_movemask_ps(__m128 __a)
    +{
    +  return __builtin_ia32_movmskps((__v4sf)__a);
    +}
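    +
    +/* Editor's note: illustrative sketch only, not part of the upstream LLVM
    + * header. The helper name __uk_example_any_negative_ps() is hypothetical. A
    + * nonzero mask from _mm_movemask_ps() means at least one element has its sign
    + * bit set. */
    +static __inline__ int __DEFAULT_FN_ATTRS
    +__uk_example_any_negative_ps(__m128 __v)
    +{
    +  return _mm_movemask_ps(__v) != 0;
    +}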
    +
    +
    +#define _MM_ALIGN16 __attribute__((aligned(16)))
    +
    +
    +#define _MM_EXCEPT_INVALID    (0x0001)
    +#define _MM_EXCEPT_DENORM     (0x0002)
    +#define _MM_EXCEPT_DIV_ZERO   (0x0004)
    +#define _MM_EXCEPT_OVERFLOW   (0x0008)
    +#define _MM_EXCEPT_UNDERFLOW  (0x0010)
    +#define _MM_EXCEPT_INEXACT    (0x0020)
    +#define _MM_EXCEPT_MASK       (0x003f)
    +
    +#define _MM_MASK_INVALID      (0x0080)
    +#define _MM_MASK_DENORM       (0x0100)
    +#define _MM_MASK_DIV_ZERO     (0x0200)
    +#define _MM_MASK_OVERFLOW     (0x0400)
    +#define _MM_MASK_UNDERFLOW    (0x0800)
    +#define _MM_MASK_INEXACT      (0x1000)
    +#define _MM_MASK_MASK         (0x1f80)
    +
    +#define _MM_ROUND_NEAREST     (0x0000)
    +#define _MM_ROUND_DOWN        (0x2000)
    +#define _MM_ROUND_UP          (0x4000)
    +#define _MM_ROUND_TOWARD_ZERO (0x6000)
    +#define _MM_ROUND_MASK        (0x6000)
    +
    +#define _MM_FLUSH_ZERO_MASK   (0x8000)
    +#define _MM_FLUSH_ZERO_ON     (0x8000)
    +#define _MM_FLUSH_ZERO_OFF    (0x0000)
    +
    +#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
    +#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
    +#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
    +#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
    +
    +#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
    +#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
    +#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
    +#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
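    +
    +/* Editor's note: illustrative sketch only, not part of the upstream LLVM
    + * header. The helper name __uk_example_enter_ftz_rtz() is hypothetical. It
    + * shows the wrapper macros above in action: the caller saves MXCSR, switches
    + * to round-toward-zero with flush-to-zero, and later restores the saved value
    + * with _mm_setcsr(). */
    +static __inline__ unsigned int __DEFAULT_FN_ATTRS
    +__uk_example_enter_ftz_rtz(void)
    +{
    +  unsigned int __saved = _mm_getcsr();
    +
    +  _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    +  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
    +  return __saved;
    +}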
    +
    +#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
    +do { \
    +  __m128 tmp3, tmp2, tmp1, tmp0; \
    +  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
    +  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
    +  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
    +  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
    +  (row0) = _mm_movelh_ps(tmp0, tmp2); \
    +  (row1) = _mm_movehl_ps(tmp2, tmp0); \
    +  (row2) = _mm_movelh_ps(tmp1, tmp3); \
    +  (row3) = _mm_movehl_ps(tmp3, tmp1); \
    +} while (0)
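    +
    +/* Editor's note: illustrative sketch only, not part of the upstream LLVM
    + * header. The helper name __uk_example_transpose4() is hypothetical and it
    + * assumes a 16-byte aligned, row-major 4x4 matrix so that _mm_load_ps() and
    + * _mm_store_ps() may be used. */
    +static __inline__ void __DEFAULT_FN_ATTRS
    +__uk_example_transpose4(float *__mat)
    +{
    +  __m128 __r0 = _mm_load_ps(__mat + 0);
    +  __m128 __r1 = _mm_load_ps(__mat + 4);
    +  __m128 __r2 = _mm_load_ps(__mat + 8);
    +  __m128 __r3 = _mm_load_ps(__mat + 12);
    +
    +  _MM_TRANSPOSE4_PS(__r0, __r1, __r2, __r3);
    +  _mm_store_ps(__mat + 0, __r0);
    +  _mm_store_ps(__mat + 4, __r1);
    +  _mm_store_ps(__mat + 8, __r2);
    +  _mm_store_ps(__mat + 12, __r3);
    +}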
    +
    +/* Aliases for compatibility. */
    +#define _m_pextrw _mm_extract_pi16
    +#define _m_pinsrw _mm_insert_pi16
    +#define _m_pmaxsw _mm_max_pi16
    +#define _m_pmaxub _mm_max_pu8
    +#define _m_pminsw _mm_min_pi16
    +#define _m_pminub _mm_min_pu8
    +#define _m_pmovmskb _mm_movemask_pi8
    +#define _m_pmulhuw _mm_mulhi_pu16
    +#define _m_pshufw _mm_shuffle_pi16
    +#define _m_maskmovq _mm_maskmove_si64
    +#define _m_pavgb _mm_avg_pu8
    +#define _m_pavgw _mm_avg_pu16
    +#define _m_psadbw _mm_sad_pu8
    +#define _m_ _mm_
    +#define _m_ _mm_
    +
    +#undef __DEFAULT_FN_ATTRS
    +#undef __DEFAULT_FN_ATTRS_MMX
    +
    +/* Ugly hack for backwards-compatibility (compatible with gcc) */
    +#ifdef __GNUC__
    +#include <emmintrin.h>
    +#else
    +#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
    +#include <emmintrin.h>
    +#endif
    +#endif
    +
    +#endif /* __XMMINTRIN_H */
    -- 
    2.21.0
    
    

_______________________________________________
Minios-devel mailing list
Minios-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/minios-devel

 

