// Written in the D programming language.
/**
 * Builtin SIMD intrinsics
 *
 * Source: $(DRUNTIMESRC core/_simd.d)
 *
 * Copyright: Copyright Digital Mars 2012-2020
 * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
 * Authors:   $(HTTP digitalmars.com, Walter Bright)
 * Source:    $(DRUNTIMESRC core/_simd.d)
 */
module core.simd;
pure:
nothrow:
@safe:
@nogc:
/*******************************
 * Create a SIMD vector type from a static array type.
 *
 * Params:
 *      T = static array type giving the element type and the element count.
 *      For 128 bit vectors,
 *      one of double[2], float[4], void[16], byte[16], ubyte[16],
 *      short[8], ushort[8], int[4], uint[4], long[2], ulong[2].
 *      For 256 bit vectors,
 *      one of double[4], float[8], void[32], byte[32], ubyte[32],
 *      short[16], ushort[16], int[8], uint[8], long[4], ulong[4]
 */
template Vector(T)
{
    // `__vector` is compiler magic; this eponymous alias is its public
    // spelling. Any `T` that does not work is rejected by the compiler.
    alias Vector = __vector(T);
}
/* Handy aliases.
 * Each name is declared only when the compiler accepts the corresponding
 * Vector instantiation, so unsupported vector types are simply absent.
 */
// 8 byte vectors
static if (is(Vector!(void[8])))    alias void8    = Vector!(void[8]);      ///
static if (is(Vector!(double[1])))  alias double1  = Vector!(double[1]);    ///
static if (is(Vector!(float[2])))   alias float2   = Vector!(float[2]);     ///
static if (is(Vector!(byte[8])))    alias byte8    = Vector!(byte[8]);      ///
static if (is(Vector!(ubyte[8])))   alias ubyte8   = Vector!(ubyte[8]);     ///
static if (is(Vector!(short[4])))   alias short4   = Vector!(short[4]);     ///
static if (is(Vector!(ushort[4])))  alias ushort4  = Vector!(ushort[4]);    ///
static if (is(Vector!(int[2])))     alias int2     = Vector!(int[2]);       ///
static if (is(Vector!(uint[2])))    alias uint2    = Vector!(uint[2]);      ///
static if (is(Vector!(long[1])))    alias long1    = Vector!(long[1]);      ///
static if (is(Vector!(ulong[1])))   alias ulong1   = Vector!(ulong[1]);     ///
// 16 byte vectors
static if (is(Vector!(void[16])))   alias void16   = Vector!(void[16]);     ///
static if (is(Vector!(double[2])))  alias double2  = Vector!(double[2]);    ///
static if (is(Vector!(float[4])))   alias float4   = Vector!(float[4]);     ///
static if (is(Vector!(byte[16])))   alias byte16   = Vector!(byte[16]);     ///
static if (is(Vector!(ubyte[16])))  alias ubyte16  = Vector!(ubyte[16]);    ///
static if (is(Vector!(short[8])))   alias short8   = Vector!(short[8]);     ///
static if (is(Vector!(ushort[8])))  alias ushort8  = Vector!(ushort[8]);    ///
static if (is(Vector!(int[4])))     alias int4     = Vector!(int[4]);       ///
static if (is(Vector!(uint[4])))    alias uint4    = Vector!(uint[4]);      ///
static if (is(Vector!(long[2])))    alias long2    = Vector!(long[2]);      ///
static if (is(Vector!(ulong[2])))   alias ulong2   = Vector!(ulong[2]);     ///
// 32 byte vectors
static if (is(Vector!(void[32])))   alias void32   = Vector!(void[32]);     ///
static if (is(Vector!(double[4])))  alias double4  = Vector!(double[4]);    ///
static if (is(Vector!(float[8])))   alias float8   = Vector!(float[8]);     ///
static if (is(Vector!(byte[32])))   alias byte32   = Vector!(byte[32]);     ///
static if (is(Vector!(ubyte[32])))  alias ubyte32  = Vector!(ubyte[32]);    ///
static if (is(Vector!(short[16])))  alias short16  = Vector!(short[16]);    ///
static if (is(Vector!(ushort[16]))) alias ushort16 = Vector!(ushort[16]);   ///
static if (is(Vector!(int[8])))     alias int8     = Vector!(int[8]);       ///
static if (is(Vector!(uint[8])))    alias uint8    = Vector!(uint[8]);      ///
static if (is(Vector!(long[4])))    alias long4    = Vector!(long[4]);      ///
static if (is(Vector!(ulong[4])))   alias ulong4   = Vector!(ulong[4]);     ///
// 64 byte vectors
static if (is(Vector!(void[64])))   alias void64   = Vector!(void[64]);     ///
static if (is(Vector!(double[8])))  alias double8  = Vector!(double[8]);    ///
static if (is(Vector!(float[16])))  alias float16  = Vector!(float[16]);    ///
static if (is(Vector!(byte[64])))   alias byte64   = Vector!(byte[64]);     ///
static if (is(Vector!(ubyte[64])))  alias ubyte64  = Vector!(ubyte[64]);    ///
static if (is(Vector!(short[32])))  alias short32  = Vector!(short[32]);    ///
static if (is(Vector!(ushort[32]))) alias ushort32 = Vector!(ushort[32]);   ///
static if (is(Vector!(int[16])))    alias int16    = Vector!(int[16]);      ///
static if (is(Vector!(uint[16])))   alias uint16   = Vector!(uint[16]);     ///
static if (is(Vector!(long[8])))    alias long8    = Vector!(long[8]);      ///
static if (is(Vector!(ulong[8])))   alias ulong8   = Vector!(ulong[8]);     ///
version (D_SIMD)
{
    /** XMM opcodes that conform to the following:
    *
    *  opcode xmm1,xmm2/mem
    *
    * and do not have side effects (i.e. do not write to memory).
    *
    * Each member's value is the instruction encoding packed into an integer:
    * the prefix byte (66, F2 or F3), when there is one, followed by the opcode
    * bytes. For example `LODDQU` is `F3 0F 6F` and has the value `0xF30F6F`,
    * while `MOVHLPS` is `0F 12` and has the value `0x0F12`.
    *
    * Several members deliberately share a value (for instance `LODHPS` and
    * `MOVLHPS`, or `LODLPS` and `MOVHLPS`); they are different names for the
    * same opcode bytes.
    *
    * NOTE(review): the STO* and MOVNT* members are documented below with a
    * memory destination, so the "do not write to memory" statement above does
    * not describe every member; presumably those are meant for `__simd_sto` -
    * confirm.
    */
    enum XMM
    {
        ADDSS = 0xF30F58,
        ADDSD = 0xF20F58,
        ADDPS = 0x000F58,
        ADDPD = 0x660F58,
        PADDB = 0x660FFC,
        PADDW = 0x660FFD,
        PADDD = 0x660FFE,
        PADDQ = 0x660FD4,
        SUBSS = 0xF30F5C,
        SUBSD = 0xF20F5C,
        SUBPS = 0x000F5C,
        SUBPD = 0x660F5C,
        PSUBB = 0x660FF8,
        PSUBW = 0x660FF9,
        PSUBD = 0x660FFA,
        PSUBQ = 0x660FFB,
        MULSS = 0xF30F59,
        MULSD = 0xF20F59,
        MULPS = 0x000F59,
        MULPD = 0x660F59,
        PMULLW = 0x660FD5,
        DIVSS = 0xF30F5E,
        DIVSD = 0xF20F5E,
        DIVPS = 0x000F5E,
        DIVPD = 0x660F5E,
        PAND  = 0x660FDB,
        POR   = 0x660FEB,
        UCOMISS = 0x000F2E,
        UCOMISD = 0x660F2E,
        XORPS = 0x000F57,
        XORPD = 0x660F57,
        // Use STO and LOD instead of MOV to distinguish the direction
        // (Destination is first operand, Source is second operand)
        STOSS  = 0xF30F11,        /// MOVSS xmm1/m32, xmm2
        STOSD  = 0xF20F11,        /// MOVSD xmm1/m64, xmm2
        STOAPS = 0x000F29,        /// MOVAPS xmm2/m128, xmm1
        STOAPD = 0x660F29,        /// MOVAPD xmm2/m128, xmm1
        STODQA = 0x660F7F,        /// MOVDQA xmm2/m128, xmm1
        STOD   = 0x660F7E,        /// MOVD reg/mem64, xmm   66 0F 7E /r
        STOQ   = 0x660FD6,        /// MOVQ xmm2/m64, xmm1
        LODSS  = 0xF30F10,        /// MOVSS xmm1, xmm2/m32
        LODSD  = 0xF20F10,        /// MOVSD xmm1, xmm2/m64
        LODAPS = 0x000F28,        /// MOVAPS xmm1, xmm2/m128
        LODAPD = 0x660F28,        /// MOVAPD xmm1, xmm2/m128
        LODDQA = 0x660F6F,        /// MOVDQA xmm1, xmm2/m128
        LODD   = 0x660F6E,        /// MOVD xmm, reg/mem64   66 0F 6E /r
        LODQ   = 0xF30F7E,        /// MOVQ xmm1, xmm2/m64
        LODDQU   = 0xF30F6F,      /// MOVDQU xmm1, xmm2/mem128  F3 0F 6F /r
        STODQU   = 0xF30F7F,      /// MOVDQU xmm1/mem128, xmm2  F3 0F 7F /r
        MOVDQ2Q  = 0xF20FD6,      /// MOVDQ2Q mmx, xmm          F2 0F D6 /r
        // MOVHLPS has the same value as LODLPS further down.
        MOVHLPS  = 0x0F12,        /// MOVHLPS xmm1, xmm2        0F 12 /r
        LODHPD   = 0x660F16,      /// MOVHPD xmm1, m64
        STOHPD   = 0x660F17,      /// MOVHPD mem64, xmm1        66 0F 17 /r
        LODHPS   = 0x0F16,        /// MOVHPS xmm1, m64
        STOHPS   = 0x0F17,        /// MOVHPS m64, xmm1
        // MOVLHPS has the same value as LODHPS above.
        MOVLHPS  = 0x0F16,        /// MOVLHPS xmm1, xmm2
        LODLPD   = 0x660F12,      /// MOVLPD xmm1, m64
        STOLPD   = 0x660F13,      /// MOVLPD m64, xmm1
        LODLPS   = 0x0F12,        /// MOVLPS xmm1, m64
        STOLPS   = 0x0F13,        /// MOVLPS m64, xmm1
        MOVMSKPD = 0x660F50,      /// MOVMSKPD reg, xmm
        MOVMSKPS = 0x0F50,        /// MOVMSKPS reg, xmm
        MOVNTDQ  = 0x660FE7,      /// MOVNTDQ m128, xmm1
        MOVNTI   = 0x0FC3,        /// MOVNTI m32, r32
        MOVNTPD  = 0x660F2B,      /// MOVNTPD m128, xmm1
        MOVNTPS  = 0x0F2B,        /// MOVNTPS m128, xmm1
        MOVNTQ   = 0x0FE7,        /// MOVNTQ m64, mm
        MOVQ2DQ  = 0xF30FD6,      /// MOVQ2DQ
        LODUPD   = 0x660F10,      /// MOVUPD xmm1, xmm2/m128
        STOUPD   = 0x660F11,      /// MOVUPD xmm2/m128, xmm1
        LODUPS   = 0x0F10,        /// MOVUPS xmm1, xmm2/m128
        STOUPS   = 0x0F11,        /// MOVUPS xmm2/m128, xmm1
        PACKSSDW = 0x660F6B,
        PACKSSWB = 0x660F63,
        PACKUSWB = 0x660F67,
        PADDSB = 0x660FEC,
        PADDSW = 0x660FED,
        PADDUSB = 0x660FDC,
        PADDUSW = 0x660FDD,
        PANDN = 0x660FDF,
        PCMPEQB = 0x660F74,
        PCMPEQD = 0x660F76,
        PCMPEQW = 0x660F75,
        PCMPGTB = 0x660F64,
        PCMPGTD = 0x660F66,
        PCMPGTW = 0x660F65,
        PMADDWD = 0x660FF5,
        PSLLW = 0x660FF1,
        PSLLD = 0x660FF2,
        PSLLQ = 0x660FF3,
        PSRAW = 0x660FE1,
        PSRAD = 0x660FE2,
        PSRLW = 0x660FD1,
        PSRLD = 0x660FD2,
        PSRLQ = 0x660FD3,
        PSUBSB = 0x660FE8,
        PSUBSW = 0x660FE9,
        PSUBUSB = 0x660FD8,
        PSUBUSW = 0x660FD9,
        PUNPCKHBW = 0x660F68,
        PUNPCKHDQ = 0x660F6A,
        PUNPCKHWD = 0x660F69,
        PUNPCKLBW = 0x660F60,
        PUNPCKLDQ = 0x660F62,
        PUNPCKLWD = 0x660F61,
        PXOR = 0x660FEF,
        ANDPD = 0x660F54,
        ANDPS = 0x0F54,
        ANDNPD = 0x660F55,
        ANDNPS = 0x0F55,
        CMPPS = 0x0FC2,
        CMPPD = 0x660FC2,
        CMPSD = 0xF20FC2,
        CMPSS = 0xF30FC2,
        COMISD = 0x660F2F,
        COMISS = 0x0F2F,
        CVTDQ2PD = 0xF30FE6,
        CVTDQ2PS = 0x0F5B,
        CVTPD2DQ = 0xF20FE6,
        CVTPD2PI = 0x660F2D,
        CVTPD2PS = 0x660F5A,
        CVTPI2PD = 0x660F2A,
        CVTPI2PS = 0x0F2A,
        CVTPS2DQ = 0x660F5B,
        CVTPS2PD = 0x0F5A,
        CVTPS2PI = 0x0F2D,
        CVTSD2SI = 0xF20F2D,
        CVTSD2SS = 0xF20F5A,
        CVTSI2SD = 0xF20F2A,
        CVTSI2SS = 0xF30F2A,
        CVTSS2SD = 0xF30F5A,
        CVTSS2SI = 0xF30F2D,
        CVTTPD2PI = 0x660F2C,
        CVTTPD2DQ = 0x660FE6,
        CVTTPS2DQ = 0xF30F5B,
        CVTTPS2PI = 0x0F2C,
        CVTTSD2SI = 0xF20F2C,
        CVTTSS2SI = 0xF30F2C,
        MASKMOVDQU = 0x660FF7,
        MASKMOVQ = 0x0FF7,
        MAXPD = 0x660F5F,
        MAXPS = 0x0F5F,
        MAXSD = 0xF20F5F,
        MAXSS = 0xF30F5F,
        MINPD = 0x660F5D,
        MINPS = 0x0F5D,
        MINSD = 0xF20F5D,
        MINSS = 0xF30F5D,
        ORPD = 0x660F56,
        ORPS = 0x0F56,
        PAVGB = 0x660FE0,
        PAVGW = 0x660FE3,
        PMAXSW = 0x660FEE,
        //PINSRW = 0x660FC4,
        PMAXUB = 0x660FDE,
        PMINSW = 0x660FEA,
        PMINUB = 0x660FDA,
        //PMOVMSKB = 0x660FD7,
        PMULHUW = 0x660FE4,
        PMULHW = 0x660FE5,
        PMULUDQ = 0x660FF4,
        PSADBW = 0x660FF6,
        PUNPCKHQDQ = 0x660F6D,
        PUNPCKLQDQ = 0x660F6C,
        RCPPS = 0x0F53,
        RCPSS = 0xF30F53,
        RSQRTPS = 0x0F52,
        RSQRTSS = 0xF30F52,
        SQRTPD = 0x660F51,
        SHUFPD = 0x660FC6,
        SHUFPS = 0x0FC6,
        SQRTPS = 0x0F51,
        SQRTSD = 0xF20F51,
        SQRTSS = 0xF30F51,
        UNPCKHPD = 0x660F15,
        UNPCKHPS = 0x0F15,
        UNPCKLPD = 0x660F14,
        UNPCKLPS = 0x0F14,
        PSHUFD = 0x660F70,
        PSHUFHW = 0xF30F70,
        PSHUFLW = 0xF20F70,
        PSHUFW = 0x0F70,
        // PSLLDQ and PSRLDQ share the bytes 66 0F 73 and differ only in an
        // extra leading byte (07 / 03).
        // NOTE(review): presumably that byte is the /7 and /3 opcode
        // extension placed in the ModRM reg field - confirm against the
        // compiler backend.
        PSLLDQ = 0x07660F73,
        PSRLDQ = 0x03660F73,
        //PREFETCH = 0x0F18,
        // SSE3 Pentium 4 (Prescott)
        ADDSUBPD = 0x660FD0,
        ADDSUBPS = 0xF20FD0,
        HADDPD   = 0x660F7C,
        HADDPS   = 0xF20F7C,
        HSUBPD   = 0x660F7D,
        HSUBPS   = 0xF20F7D,
        MOVDDUP  = 0xF20F12,
        MOVSHDUP = 0xF30F16,
        MOVSLDUP = 0xF30F12,
        LDDQU    = 0xF20FF0,
        MONITOR  = 0x0F01C8,
        MWAIT    = 0x0F01C9,
        // SSSE3
        PALIGNR = 0x660F3A0F,
        PHADDD = 0x660F3802,
        PHADDW = 0x660F3801,
        PHADDSW = 0x660F3803,
        PABSB = 0x660F381C,
        PABSD = 0x660F381E,
        PABSW = 0x660F381D,
        PSIGNB = 0x660F3808,
        PSIGND = 0x660F380A,
        PSIGNW = 0x660F3809,
        PSHUFB = 0x660F3800,
        PMADDUBSW = 0x660F3804,
        PMULHRSW = 0x660F380B,
        PHSUBD = 0x660F3806,
        PHSUBW = 0x660F3805,
        PHSUBSW = 0x660F3807,
        // SSE4.1
        BLENDPD   = 0x660F3A0D,
        BLENDPS   = 0x660F3A0C,
        BLENDVPD  = 0x660F3815,
        BLENDVPS  = 0x660F3814,
        DPPD      = 0x660F3A41,
        DPPS      = 0x660F3A40,
        EXTRACTPS = 0x660F3A17,
        INSERTPS  = 0x660F3A21,
        MPSADBW   = 0x660F3A42,
        PBLENDVB  = 0x660F3810,
        PBLENDW   = 0x660F3A0E,
        // PEXTRD / PEXTRQ (and PINSRD / PINSRQ below) have identical values.
        // NOTE(review): the 64 bit forms are presumably selected by a REX.W
        // prefix that this value does not encode - confirm.
        PEXTRD    = 0x660F3A16,
        PEXTRQ    = 0x660F3A16,
        PINSRB    = 0x660F3A20,
        PINSRD    = 0x660F3A22,
        PINSRQ    = 0x660F3A22,
        MOVNTDQA = 0x660F382A,
        PACKUSDW = 0x660F382B,
        PCMPEQQ = 0x660F3829,
        PEXTRB = 0x660F3A14,
        PHMINPOSUW = 0x660F3841,
        PMAXSB = 0x660F383C,
        PMAXSD = 0x660F383D,
        PMAXUD = 0x660F383F,
        PMAXUW = 0x660F383E,
        PMINSB = 0x660F3838,
        PMINSD = 0x660F3839,
        PMINUD = 0x660F383B,
        PMINUW = 0x660F383A,
        PMOVSXBW = 0x660F3820,
        PMOVSXBD = 0x660F3821,
        PMOVSXBQ = 0x660F3822,
        PMOVSXWD = 0x660F3823,
        PMOVSXWQ = 0x660F3824,
        PMOVSXDQ = 0x660F3825,
        PMOVZXBW = 0x660F3830,
        PMOVZXBD = 0x660F3831,
        PMOVZXBQ = 0x660F3832,
        PMOVZXWD = 0x660F3833,
        PMOVZXWQ = 0x660F3834,
        PMOVZXDQ = 0x660F3835,
        PMULDQ   = 0x660F3828,
        PMULLD   = 0x660F3840,
        PTEST    = 0x660F3817,
        ROUNDPD = 0x660F3A09,
        ROUNDPS = 0x660F3A08,
        ROUNDSD = 0x660F3A0B,
        ROUNDSS = 0x660F3A0A,
        // SSE4.2
        PCMPESTRI  = 0x660F3A61,
        PCMPESTRM  = 0x660F3A60,
        PCMPISTRI  = 0x660F3A63,
        PCMPISTRM  = 0x660F3A62,
        PCMPGTQ    = 0x660F3837,
        //CRC32
        // SSE4a (AMD only)
        // EXTRQ,INSERTQ,MOVNTSD,MOVNTSS
        // POPCNT and LZCNT (have their own CPUID bits)
        POPCNT     = 0xF30FB8,
        // LZCNT
    }
    /**
    * Generate two operand instruction with XMM 128 bit operands.
    *
    * This is a compiler magic function - it doesn't behave like
    * regular D functions.
    *
    * Operands and result are typed as the untyped 128 bit vector `void16`;
    * as the examples show, the result is cast to the typed vector wanted.
    *
    * Params:
    *      opcode = any of the XMM opcodes; it must be a compile time constant
    *      op1    = first operand
    *      op2    = second operand
    * Returns:
    *      result of opcode
    * Example:
    ---
    import core.simd;
    import core.stdc.stdio;
    void main()
    {
        float4 A = [2.34f, -70000.0f, 0.00001f, 345.5f];
        float4 R = A;
        R = cast(float4) __simd(XMM.RCPSS, R, A);
        printf("%g %g %g %g\n", R.array[0], R.array[1], R.array[2], R.array[3]);
    }
    ---
    * Prints `0.427368 -70000 1e-05 345.5`.
    * The use of the two operand form for `XMM.RCPSS` is necessary because the result of the instruction
    * contains elements of both operands.
    * Example:
    ---
    double[2] A = [56.0, -75.0];
    double2 R = cast(double2) __simd(XMM.LODUPD, *cast(double2*)A.ptr);
    ---
    * This second example uses the one operand overload of `__simd`.
    * The cast to `double2*` is necessary because the type of `*A.ptr` is `double`.
    */
    pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2);
    ///
    unittest
    {
        // A typed vector is passed for the `void16` operands and the
        // `void16` result is cast back to the typed vector.
        float4 a;
        a = cast(float4)__simd(XMM.PXOR, a, a);
    }
    /**
    * Unary SIMD instructions.
    *
    * Overloads of `__simd` that take a single operand: a 128 bit vector,
    * a `double`, or a `float`.
    *
    * Params:
    *      opcode = any of the XMM opcodes
    *      op1    = the operand
    * Returns:
    *      result of opcode, as an untyped 128 bit vector
    */
    pure @safe void16 __simd(XMM opcode, void16 op1);
    pure @safe void16 __simd(XMM opcode, double d);   ///
    pure @safe void16 __simd(XMM opcode, float f);    ///
    ///
    unittest
    {
        // One operand form with a vector operand.
        float4 a;
        a = cast(float4)__simd(XMM.LODSS, a);
    }
    /****
    * Three operand form: two vector operands plus an 8 bit immediate.
    *
    * For instructions:
    * CMPPD, CMPSS, CMPSD, CMPPS,
    * PSHUFD, PSHUFHW, PSHUFLW,
    * BLENDPD, BLENDPS, DPPD, DPPS,
    * MPSADBW, PBLENDW,
    * ROUNDPD, ROUNDPS, ROUNDSD, ROUNDSS
    * Params:
    *      opcode = any of the above XMM opcodes; it must be a compile time constant
    *      op1    = first operand
    *      op2    = second operand
    *      imm8   = third operand; must be a compile time constant
    * Returns:
    *      result of opcode
    */
    pure @safe void16 __simd(XMM opcode, void16 op1, void16 op2, ubyte imm8);
    ///
    unittest
    {
        // The immediate is passed as a literal so it is a compile time constant.
        float4 a;
        a = cast(float4)__simd(XMM.CMPPD, a, a, 0x7A);
    }
    /***
    * One vector operand plus an 8 bit immediate.
    *
    * For instructions with the imm8 version:
    * PSLLD, PSLLQ, PSLLW, PSRAD, PSRAW, PSRLD, PSRLQ, PSRLW,
    * PSRLDQ, PSLLDQ
    * Params:
    *      opcode = any of the above XMM opcodes; it must be a compile time constant
    *      op1    = first operand
    *      imm8   = second operand; must be a compile time constant
    * Returns:
    *      result of opcode
    */
    pure @safe void16 __simd_ib(XMM opcode, void16 op1, ubyte imm8);
    ///
    unittest
    {
        // The immediate is passed as a literal so it is a compile time constant.
        float4 a;
        a = cast(float4) __simd_ib(XMM.PSRLQ, a, 0x7A);
    }
    /*****
    * For "store" operations of the form:
    *    op1 op= op2
    * such as MOVLPS.
    *
    * The opcode must store exactly as many bytes as the destination holds:
    * a 128 bit store such as `XMM.STOUPS` for a vector destination,
    * `XMM.STOSS` (m32) for a `float` and `XMM.STOSD` (m64) for a `double`.
    *
    * Params:
    *      opcode = one of the XMM store opcodes
    *      op1    = destination of the store
    *      op2    = value to store
    * Returns:
    *    op2
    *
    * These cannot be marked as pure, as semantic() doesn't check them.
    *
    * NOTE(review): the module starts with a `pure:` attribute label, so
    * these declarations presumably end up `pure` regardless - confirm whether
    * the remark above is still accurate.
    */
    @safe void16 __simd_sto(XMM opcode, void16 op1, void16 op2);
    @safe void16 __simd_sto(XMM opcode, double op1, void16 op2); ///
    @safe void16 __simd_sto(XMM opcode, float op1, void16 op2);  ///
    @safe void16 __simd_sto(XMM opcode, void16 op1, long op2); ///
    ///
    unittest
    {
        void16 a;
        float f = 1;
        double d = 1;
        // Full 128 bit store into a 128 bit destination.
        cast(void)__simd_sto(XMM.STOUPS, a, a);
        // Scalar stores: a 128 bit opcode here would write 16 bytes into the
        // 4 byte `f` / 8 byte `d` and overrun the destination.
        cast(void)__simd_sto(XMM.STOSS, f, a);
        cast(void)__simd_sto(XMM.STOSD, d, a);
    }
    /* The following overloads wrap `__simd` so that operands and result keep
    * their vector type; overload resolution picks the matching wrapper.
    * Compile with inlining on for best performance.
    */
    pure @safe short8 pcmpeq()(short8 v1, short8 v2)
    {
        // Emit PCMPEQW on the two operands; the untyped result is
        // reinterpreted as short8.
        enum XMM op = XMM.PCMPEQW;
        return cast(short8) __simd(op, v1, v2);
    }
    /// Unsigned counterpart: PCMPEQW on two ushort8 operands, result typed as ushort8.
    pure @safe ushort8 pcmpeq()(ushort8 v1, ushort8 v2)
    {
        enum XMM op = XMM.PCMPEQW;
        return cast(ushort8) __simd(op, v1, v2);
    }
    /*********************
    * Emit prefetch instruction.
    *
    * Both template arguments are resolved at compile time, so each
    * instantiation contains exactly one `__prefetch` call.
    * Params:
    *    writeFetch = true for write fetch, false for read fetch
    *    locality = 0..3 (0 meaning least local, 3 meaning most local);
    *               only checked for read fetches
    *    address = address to be prefetched
    * Note:
    *    The Intel mappings are:
    *    $(TABLE
    *    $(THEAD writeFetch, locality, Instruction)
    *    $(TROW false, 0, prefetchnta)
    *    $(TROW false, 1, prefetcht2)
    *    $(TROW false, 2, prefetcht1)
    *    $(TROW false, 3, prefetcht0)
    *    $(TROW true, 0, prefetchw)
    *    $(TROW true, 1, prefetchw)
    *    $(TROW true, 2, prefetchw)
    *    $(TROW true, 3, prefetchw)
    *    )
    */
    void prefetch(bool writeFetch, ubyte locality)(const(void)* address)
    {
        static if (!writeFetch && locality >= 4)
            static assert(0, "0..3 expected for locality");
        else static if (writeFetch)
            __prefetch(address, 4);             // write fetch: locality is not used
        else
            __prefetch(address, 3 - locality);  // read fetch: locality 0..3 becomes encoding 3..0
    }
    // Declaration only, no body in this module; `encoding` receives 0..3 for
    // read fetches and 4 for write fetches from `prefetch` above.
    private void __prefetch(const(void*) address, ubyte encoding);
    /*************************************
    * Load unaligned vector from address.
    * This is a compiler intrinsic.
    *
    * The load opcode is chosen from the vector type: `LODUPD` for `double2`,
    * `LODUPS` for `float4` and `LODDQU` for every other accepted type.
    * Params:
    *    p = pointer to vector
    * Returns:
    *    vector
    */
    V loadUnaligned(V)(const V* p)
        if (is(V == void16) ||
            is(V == byte16) ||
            is(V == ubyte16) ||
            is(V == short8) ||
            is(V == ushort8) ||
            is(V == int4) ||
            is(V == uint4) ||
            is(V == long2) ||
            is(V == ulong2) ||
            is(V == double2) ||
            is(V == float4))
    {
        pragma(inline, true);
        // Pick the opcode at compile time; the constraint above guarantees
        // that the fallback branch only sees the integer and void vectors.
        static if (is(V == double2))
            enum XMM op = XMM.LODUPD;
        else static if (is(V == float4))
            enum XMM op = XMM.LODUPS;
        else
            enum XMM op = XMM.LODDQU;
        // NOTE(review): the dereference is deliberately left inside the
        // intrinsic call, presumably so that the unaligned load instruction
        // itself performs the memory access - confirm before hoisting it.
        return cast(V) __simd(op, *cast(const void16*) p);
    }
    @system
    unittest
    {
        // Source bytes for the loads: 32 bytes leave room for a whole
        // 16-byte vector at every start offset 0..15.
        ubyte[32] data;
        foreach (idx, ref b; data)
            b = cast(ubyte) idx;

        // Load a `T` from `src` and compare the result byte for byte with
        // the memory it was loaded from.
        static void check(T)(ubyte* src)
        {
            T v = loadUnaligned(cast(T*) src);
            const(ubyte)* loaded = cast(const(ubyte)*) &v;
            foreach (k; 0 .. T.sizeof)
                assert(loaded[k] == src[k]);
        }

        // Walk through every start offset inside one 16-byte window.
        foreach (offset; 0 .. 16)
        {
            ubyte* d = &data[offset];
            check!void16(d);
            check!byte16(d);
            check!ubyte16(d);
            check!short8(d);
            check!ushort8(d);
            check!int4(d);
            check!uint4(d);
            check!long2(d);
            check!ulong2(d);
            check!double2(d);
            check!float4(d);
        }
    }
    /*************************************
    * Store vector to unaligned address.
    * This is a compiler intrinsic.
    *
    * The store opcode is chosen from the vector type: `STOUPD` for `double2`,
    * `STOUPS` for `float4` and `STODQU` for every other accepted type.
    * Params:
    *    p = pointer to vector
    *    value = value to store
    * Returns:
    *    value
    */
    V storeUnaligned(V)(V* p, V value)
        if (is(V == void16) ||
            is(V == byte16) ||
            is(V == ubyte16) ||
            is(V == short8) ||
            is(V == ushort8) ||
            is(V == int4) ||
            is(V == uint4) ||
            is(V == long2) ||
            is(V == ulong2) ||
            is(V == double2) ||
            is(V == float4))
    {
        pragma(inline, true);
        // Pick the opcode at compile time; the constraint above guarantees
        // that the fallback branch only sees the integer and void vectors.
        static if (is(V == double2))
            enum XMM op = XMM.STOUPD;
        else static if (is(V == float4))
            enum XMM op = XMM.STOUPS;
        else
            enum XMM op = XMM.STODQU;
        // NOTE(review): the dereference is deliberately left inside the
        // intrinsic call, presumably so that the unaligned store instruction
        // itself performs the memory access - confirm before hoisting it.
        return cast(V) __simd_sto(op, *cast(void16*) p, value);
    }
    @system
    unittest
    {
        // Destination bytes for the stores: 32 bytes leave room for a whole
        // 16-byte vector at every start offset 0..15.
        ubyte[32] data;

        // Fill a `T` with the byte pattern 0, 1, 2, ..., store it to `dst`
        // and compare memory byte for byte with the vector.
        static void check(T)(ubyte* dst)
        {
            T v;
            // populate `v` with data
            ubyte* raw = cast(ubyte*) &v;
            foreach (k; 0 .. T.sizeof)
                raw[k] = cast(ubyte) k;
            // store `v` to the location pointed to by `dst`
            storeUnaligned(cast(T*) dst, v);
            // check that the data was stored correctly
            foreach (k; 0 .. T.sizeof)
                assert(raw[k] == dst[k]);
        }

        // Walk through every start offset inside one 16-byte window.
        foreach (offset; 0 .. 16)
        {
            ubyte* d = &data[offset];
            check!void16(d);
            check!byte16(d);
            check!ubyte16(d);
            check!short8(d);
            check!ushort8(d);
            check!int4(d);
            check!uint4(d);
            check!long2(d);
            check!ulong2(d);
            check!double2(d);
            check!float4(d);
        }
    }
}