/**
 * The core.internal.atomic module contains the low-level atomic features available in hardware.
* This module may be a routing layer for compiler intrinsics.
*
* Copyright: Copyright Manu Evans 2019.
* License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Authors:   Sean Kelly, Alex Rønne Petersen, Manu Evans
* Source:    $(DRUNTIMESRC core/internal/_atomic.d)
*/
module core.internal.atomic;
import core.atomic : MemoryOrder, has128BitCAS;
version (DigitalMars)
{
    private
    {
        enum : int
        {
            AX, BX, CX, DX, DI, SI, R8, R9
        }
        immutable string[4][8] registerNames = [
            [ "AL", "AX", "EAX", "RAX" ],
            [ "BL", "BX", "EBX", "RBX" ],
            [ "CL", "CX", "ECX", "RCX" ],
            [ "DL", "DX", "EDX", "RDX" ],
            [ "DIL", "DI", "EDI", "RDI" ],
            [ "SIL", "SI", "ESI", "RSI" ],
            [ "R8B", "R8W", "R8D", "R8" ],
            [ "R9B", "R9W", "R9D", "R9" ],
        ];
        template RegIndex(T)
        {
            static if (T.sizeof == 1)
                enum RegIndex = 0;
            else static if (T.sizeof == 2)
                enum RegIndex = 1;
            else static if (T.sizeof == 4)
                enum RegIndex = 2;
            else static if (T.sizeof == 8)
                enum RegIndex = 3;
            else
                static assert(false, "Invalid type");
        }
        enum SizedReg(int reg, T = size_t) = registerNames[reg][RegIndex!T];
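        // For example, SizedReg!(AX, ubyte) is "AL", SizedReg!(AX, ushort) is "AX",
        // SizedReg!(AX, uint) is "EAX", and SizedReg!AX (T = size_t) is "RAX" on
        // 64-bit targets; the asm mixins below use this to pick operand widths from T.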
    }
    inout(T) atomicLoad(MemoryOrder order = MemoryOrder.seq, T)(inout(T)* src) pure nothrow @nogc @trusted
        if (CanCAS!T)
    {
        static assert(order != MemoryOrder.rel, "invalid MemoryOrder for atomicLoad()");
        static if (T.sizeof == size_t.sizeof * 2)
        {
            version (D_InlineAsm_X86)
            {
                asm pure nothrow @nogc @trusted
                {
                    push EDI;
                    push EBX;
                    mov EBX, 0;
                    mov ECX, 0;
                    mov EAX, 0;
                    mov EDX, 0;
                    mov EDI, src;
                    lock; cmpxchg8b [EDI];
                    pop EBX;
                    pop EDI;
                }
            }
            else version (D_InlineAsm_X86_64)
            {
                version (Windows)
                {
                    static if (RegisterReturn!T)
                    {
                        enum SrcPtr = SizedReg!CX;
                        enum RetPtr = null;
                    }
                    else
                    {
                        enum SrcPtr = SizedReg!DX;
                        enum RetPtr = SizedReg!CX;
                    }
                    mixin (simpleFormat(q{
                        asm pure nothrow @nogc @trusted
                        {
                            naked;
                            push RBX;
                            mov R8, %0;
    ?1                        mov R9, %1;
                            mov RBX, 0;
                            mov RCX, 0;
                            mov RAX, 0;
                            mov RDX, 0;
                            lock; cmpxchg16b [R8];
    ?1                        mov [R9], RAX;
    ?1                        mov 8[R9], RDX;
                            pop RBX;
                            ret;
                        }
                    }, [SrcPtr, RetPtr]));
                }
                else
                {
                    asm pure nothrow @nogc @trusted
                    {
                        naked;
                        push RBX;
                        mov RBX, 0;
                        mov RCX, 0;
                        mov RAX, 0;
                        mov RDX, 0;
                        lock; cmpxchg16b [RDI];
                        pop RBX;
                        ret;
                    }
                }
            }
        }
        else static if (needsLoadBarrier!order)
        {
            version (D_InlineAsm_X86)
            {
                enum SrcReg = SizedReg!CX;
                enum ZeroReg = SizedReg!(DX, T);
                enum ResReg = SizedReg!(AX, T);
                mixin (simpleFormat(q{
                    asm pure nothrow @nogc @trusted
                    {
                        mov %1, 0;
                        mov %2, 0;
                        mov %0, src;
                        lock; cmpxchg [%0], %1;
                    }
                }, [SrcReg, ZeroReg, ResReg]));
            }
            else version (D_InlineAsm_X86_64)
            {
                version (Windows)
                    enum SrcReg = SizedReg!CX;
                else
                    enum SrcReg = SizedReg!DI;
                enum ZeroReg = SizedReg!(DX, T);
                enum ResReg = SizedReg!(AX, T);
                mixin (simpleFormat(q{
                    asm pure nothrow @nogc @trusted
                    {
                        naked;
                        mov %1, 0;
                        mov %2, 0;
                        lock; cmpxchg [%0], %1;
                        ret;
                    }
                }, [SrcReg, ZeroReg, ResReg]));
            }
        }
        else
            return *src;
    }
    void atomicStore(MemoryOrder order = MemoryOrder.seq, T)(T* dest, T value) pure nothrow @nogc @trusted
        if (CanCAS!T)
    {
        static assert(order != MemoryOrder.acq, "Invalid MemoryOrder for atomicStore()");
        static if (T.sizeof == size_t.sizeof * 2)
        {
            version (D_InlineAsm_X86)
            {
                asm pure nothrow @nogc @trusted
                {
                    push EDI;
                    push EBX;
                    lea EDI, value;
                    mov EBX, [EDI];
                    mov ECX, 4[EDI];
                    mov EDI, dest;
                    mov EAX, [EDI];
                    mov EDX, 4[EDI];
                L1: lock; cmpxchg8b [EDI];
                    jne L1;
                    pop EBX;
                    pop EDI;
                }
            }
            else version (D_InlineAsm_X86_64)
            {
                version (Windows)
                {
                    asm pure nothrow @nogc @trusted
                    {
                        naked;
                        push RBX;
                        mov R8, RDX;
                        mov RAX, [RDX];
                        mov RDX, 8[RDX];
                        mov RBX, [RCX];
                        mov RCX, 8[RCX];
                    L1: lock; cmpxchg16b [R8];
                        jne L1;
                        pop RBX;
                        ret;
                    }
                }
                else
                {
                    asm pure nothrow @nogc @trusted
                    {
                        naked;
                        push RBX;
                        mov RBX, RDI;
                        mov RCX, RSI;
                        mov RDI, RDX;
                        mov RAX, [RDX];
                        mov RDX, 8[RDX];
                    L1: lock; cmpxchg16b [RDI];
                        jne L1;
                        pop RBX;
                        ret;
                    }
                }
            }
        }
        else static if (needsStoreBarrier!order)
            atomicExchange!(order, false)(dest, value);
        else
            *dest = value;
    }
    T atomicFetchAdd(MemoryOrder order = MemoryOrder.seq, bool result = true, T)(T* dest, T value) pure nothrow @nogc @trusted
        if (is(T : ulong))
    {
        version (D_InlineAsm_X86)
        {
            static assert(T.sizeof <= 4, "64bit atomicFetchAdd not supported on 32bit target." );
            enum DestReg = SizedReg!DX;
            enum ValReg = SizedReg!(AX, T);
            mixin (simpleFormat(q{
                asm pure nothrow @nogc @trusted
                {
                    mov %1, value;
                    mov %0, dest;
                    lock; xadd[%0], %1;
                }
            }, [DestReg, ValReg]));
        }
        else version (D_InlineAsm_X86_64)
        {
            version (Windows)
            {
                enum DestReg = SizedReg!DX;
                enum ValReg = SizedReg!(CX, T);
            }
            else
            {
                enum DestReg = SizedReg!SI;
                enum ValReg = SizedReg!(DI, T);
            }
            enum ResReg = result ? SizedReg!(AX, T) : null;
            mixin (simpleFormat(q{
                asm pure nothrow @nogc @trusted
                {
                    naked;
                    lock; xadd[%0], %1;
    ?2                mov %2, %1;
                    ret;
                }
            }, [DestReg, ValReg, ResReg]));
        }
        else
            static assert (false, "Unsupported architecture.");
    }
    T atomicFetchSub(MemoryOrder order = MemoryOrder.seq, bool result = true, T)(T* dest, T value) pure nothrow @nogc @trusted
        if (is(T : ulong))
    {
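        // Subtraction is implemented as addition of the negated value: `value` is
        // widened to a signed int/long via IntOrLong!T, negated, and truncated back
        // to T, so atomicFetchAdd's `lock; xadd` performs the subtraction.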
        return atomicFetchAdd(dest, cast(T)-cast(IntOrLong!T)value);
    }
    T atomicExchange(MemoryOrder order = MemoryOrder.seq, bool result = true, T)(T* dest, T value) pure nothrow @nogc @trusted
    if (CanCAS!T)
    {
        version (D_InlineAsm_X86)
        {
            static assert(T.sizeof <= 4, "64bit atomicExchange not supported on 32bit target." );
            enum DestReg = SizedReg!CX;
            enum ValReg = SizedReg!(AX, T);
            mixin (simpleFormat(q{
                asm pure nothrow @nogc @trusted
                {
                    mov %1, value;
                    mov %0, dest;
                    xchg [%0], %1;
                }
            }, [DestReg, ValReg]));
        }
        else version (D_InlineAsm_X86_64)
        {
            version (Windows)
            {
                enum DestReg = SizedReg!DX;
                enum ValReg = SizedReg!(CX, T);
            }
            else
            {
                enum DestReg = SizedReg!SI;
                enum ValReg = SizedReg!(DI, T);
            }
            enum ResReg = result ? SizedReg!(AX, T) : null;
            mixin (simpleFormat(q{
                asm pure nothrow @nogc @trusted
                {
                    naked;
                    xchg [%0], %1;
    ?2                mov %2, %1;
                    ret;
                }
            }, [DestReg, ValReg, ResReg]));
        }
        else
            static assert (false, "Unsupported architecture.");
    }
    alias atomicCompareExchangeWeak = atomicCompareExchangeStrong;
    bool atomicCompareExchangeStrong(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, T* compare, T value) pure nothrow @nogc @trusted
        if (CanCAS!T)
    {
        version (D_InlineAsm_X86)
        {
            static if (T.sizeof <= 4)
            {
                enum DestAddr = SizedReg!CX;
                enum CmpAddr = SizedReg!DI;
                enum Val = SizedReg!(DX, T);
                enum Cmp = SizedReg!(AX, T);
                mixin (simpleFormat(q{
                    asm pure nothrow @nogc @trusted
                    {
                        push %1;
                        mov %2, value;
                        mov %1, compare;
                        mov %3, [%1];
                        mov %0, dest;
                        lock; cmpxchg [%0], %2;
                        mov [%1], %3;
                        setz AL;
                        pop %1;
                    }
                }, [DestAddr, CmpAddr, Val, Cmp]));
            }
            else static if (T.sizeof == 8)
            {
                asm pure nothrow @nogc @trusted
                {
                    push EDI;
                    push EBX;
                    lea EDI, value;
                    mov EBX, [EDI];
                    mov ECX, 4[EDI];
                    mov EDI, compare;
                    mov EAX, [EDI];
                    mov EDX, 4[EDI];
                    mov EDI, dest;
                    lock; cmpxchg8b [EDI];
                    mov EDI, compare;
                    mov [EDI], EAX;
                    mov 4[EDI], EDX;
                    setz AL;
                    pop EBX;
                    pop EDI;
                }
            }
            else
                static assert(T.sizeof <= 8, "128bit atomicCompareExchangeStrong not supported on 32bit target." );
        }
        else version (D_InlineAsm_X86_64)
        {
            static if (T.sizeof <= 8)
            {
                version (Windows)
                {
                    enum DestAddr = SizedReg!R8;
                    enum CmpAddr = SizedReg!DX;
                    enum Val = SizedReg!(CX, T);
                }
                else
                {
                    enum DestAddr = SizedReg!DX;
                    enum CmpAddr = SizedReg!SI;
                    enum Val = SizedReg!(DI, T);
                }
                enum Res = SizedReg!(AX, T);
                mixin (simpleFormat(q{
                    asm pure nothrow @nogc @trusted
                    {
                        naked;
                        mov %3, [%1];
                        lock; cmpxchg [%0], %2;
                        jne compare_fail;
                        mov AL, 1;
                        ret;
                    compare_fail:
                        mov [%1], %3;
                        xor AL, AL;
                        ret;
                    }
                }, [DestAddr, CmpAddr, Val, Res]));
            }
            else
            {
                version (Windows)
                {
                    asm pure nothrow @nogc @trusted
                    {
                        naked;
                        push RBX;
                        mov R9, RDX;
                        mov RAX, [RDX];
                        mov RDX, 8[RDX];
                        mov RBX, [RCX];
                        mov RCX, 8[RCX];
                        lock; cmpxchg16b [R8];
                        pop RBX;
                        jne compare_fail;
                        mov AL, 1;
                        ret;
                    compare_fail:
                        mov [R9], RAX;
                        mov 8[R9], RDX;
                        xor AL, AL;
                        ret;
                    }
                }
                else
                {
                    asm pure nothrow @nogc @trusted
                    {
                        naked;
                        push RBX;
                        mov R8, RCX;
                        mov R9, RDX;
                        mov RAX, [RDX];
                        mov RDX, 8[RDX];
                        mov RBX, RDI;
                        mov RCX, RSI;
                        lock; cmpxchg16b [R8];
                        pop RBX;
                        jne compare_fail;
                        mov AL, 1;
                        ret;
                    compare_fail:
                        mov [R9], RAX;
                        mov 8[R9], RDX;
                        xor AL, AL;
                        ret;
                    }
                }
            }
        }
        else
            static assert (false, "Unsupported architecture.");
    }
    alias atomicCompareExchangeWeakNoResult = atomicCompareExchangeStrongNoResult;
    bool atomicCompareExchangeStrongNoResult(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, const T compare, T value) pure nothrow @nogc @trusted
        if (CanCAS!T)
    {
        version (D_InlineAsm_X86)
        {
            static if (T.sizeof <= 4)
            {
                enum DestAddr = SizedReg!CX;
                enum Cmp = SizedReg!(AX, T);
                enum Val = SizedReg!(DX, T);
                mixin (simpleFormat(q{
                    asm pure nothrow @nogc @trusted
                    {
                        mov %2, value;
                        mov %1, compare;
                        mov %0, dest;
                        lock; cmpxchg [%0], %2;
                        setz AL;
                    }
                }, [DestAddr, Cmp, Val]));
            }
            else static if (T.sizeof == 8)
            {
                asm pure nothrow @nogc @trusted
                {
                    push EDI;
                    push EBX;
                    lea EDI, value;
                    mov EBX, [EDI];
                    mov ECX, 4[EDI];
                    lea EDI, compare;
                    mov EAX, [EDI];
                    mov EDX, 4[EDI];
                    mov EDI, dest;
                    lock; cmpxchg8b [EDI];
                    setz AL;
                    pop EBX;
                    pop EDI;
                }
            }
            else
                static assert(T.sizeof <= 8, "128bit atomicCompareExchangeStrongNoResult not supported on 32bit target.");
        }
        else version (D_InlineAsm_X86_64)
        {
            static if (T.sizeof <= 8)
            {
                version (Windows)
                {
                    enum DestAddr = SizedReg!R8;
                    enum Cmp = SizedReg!(DX, T);
                    enum Val = SizedReg!(CX, T);
                }
                else
                {
                    enum DestAddr = SizedReg!DX;
                    enum Cmp = SizedReg!(SI, T);
                    enum Val = SizedReg!(DI, T);
                }
                enum AXReg = SizedReg!(AX, T);
                mixin (simpleFormat(q{
                    asm pure nothrow @nogc @trusted
                    {
                        naked;
                        mov %3, %1;
                        lock; cmpxchg [%0], %2;
                        setz AL;
                        ret;
                    }
                }, [DestAddr, Cmp, Val, AXReg]));
            }
            else
            {
                version (Windows)
                {
                    asm pure nothrow @nogc @trusted
                    {
                        naked;
                        push RBX;
                        mov RAX, [RDX];
                        mov RDX, 8[RDX];
                        mov RBX, [RCX];
                        mov RCX, 8[RCX];
                        lock; cmpxchg16b [R8];
                        setz AL;
                        pop RBX;
                        ret;
                    }
                }
                else
                {
                    asm pure nothrow @nogc @trusted
                    {
                        naked;
                        push RBX;
                        mov RAX, RDX;
                        mov RDX, RCX;
                        mov RBX, RDI;
                        mov RCX, RSI;
                        lock; cmpxchg16b [R8];
                        setz AL;
                        pop RBX;
                        ret;
                    }
                }
            }
        }
        else
            static assert (false, "Unsupported architecture.");
    }
    void atomicFence(MemoryOrder order = MemoryOrder.seq)() pure nothrow @nogc @trusted
    {
        // TODO: `mfence` should only be required for seq_cst operations, but that
        //       relies on the compiler's backend not reordering code inappropriately,
        //       so we apply it conservatively.
        static if (order != MemoryOrder.raw)
        {
            version (D_InlineAsm_X86)
            {
                import core.cpuid;
                // TODO: review this implementation; it seems way overly complicated
                asm pure nothrow @nogc @trusted
                {
                    naked;
                    call sse2;
                    test AL, AL;
                    jne Lcpuid;
                    // Fast path: We have SSE2, so just use mfence.
                    mfence;
                    jmp Lend;
                Lcpuid:
                    // Slow path: We use cpuid to serialize. This is
                    // significantly slower than mfence, but is the
                    // only serialization facility we have available
                    // on older non-SSE2 chips.
                    push EBX;
                    mov EAX, 0;
                    cpuid;
                    pop EBX;
                Lend:
                    ret;
                }
            }
            else version (D_InlineAsm_X86_64)
            {
                asm pure nothrow @nogc @trusted
                {
                    naked;
                    mfence;
                    ret;
                }
            }
            else
                static assert (false, "Unsupported architecture.");
        }
    }
    void pause() pure nothrow @nogc @trusted
    {
        version (D_InlineAsm_X86)
        {
            asm pure nothrow @nogc @trusted
            {
                naked;
                rep; nop;
                ret;
            }
        }
        else version (D_InlineAsm_X86_64)
        {
            asm pure nothrow @nogc @trusted
            {
                naked;
    //            pause; // TODO: DMD should add this opcode to its inline asm
                rep; nop;
                ret;
            }
        }
        else
        {
            // ARM should `yield`
            // other architectures? otherwise some sort of nop...
        }
    }
}
else version (GNU)
{
    import gcc.builtins;
    import gcc.config;
    inout(T) atomicLoad(MemoryOrder order = MemoryOrder.seq, T)(inout(T)* src) pure nothrow @nogc @trusted
        if (CanCAS!T)
    {
        static assert(order != MemoryOrder.rel, "invalid MemoryOrder for atomicLoad()");
        static if (GNU_Have_Atomics || GNU_Have_LibAtomic)
        {
            static if (T.sizeof == ubyte.sizeof)
            {
                ubyte value = __atomic_load_1(cast(shared)src, order);
                return *cast(typeof(return)*)&value;
            }
            else static if (T.sizeof == ushort.sizeof)
            {
                ushort value = __atomic_load_2(cast(shared)src, order);
                return *cast(typeof(return)*)&value;
            }
            else static if (T.sizeof == uint.sizeof)
            {
                uint value = __atomic_load_4(cast(shared)src, order);
                return *cast(typeof(return)*)&value;
            }
            else static if (T.sizeof == ulong.sizeof && GNU_Have_64Bit_Atomics)
            {
                ulong value = __atomic_load_8(cast(shared)src, order);
                return *cast(typeof(return)*)&value;
            }
            else static if (GNU_Have_LibAtomic)
            {
                T value;
                __atomic_load(T.sizeof, cast(shared)src, &value, order);
                return *cast(typeof(return)*)&value;
            }
            else
                static assert(0, "Invalid template type specified.");
        }
        else
        {
            getAtomicMutex.lock();
            scope(exit) getAtomicMutex.unlock();
            return *cast(typeof(return)*)src;
        }
    }
    void atomicStore(MemoryOrder order = MemoryOrder.seq, T)(T* dest, T value) pure nothrow @nogc @trusted
        if (CanCAS!T)
    {
        static assert(order != MemoryOrder.acq, "Invalid MemoryOrder for atomicStore()");
        static if (GNU_Have_Atomics || GNU_Have_LibAtomic)
        {
            static if (T.sizeof == ubyte.sizeof)
                __atomic_store_1(cast(shared)dest, *cast(ubyte*)&value, order);
            else static if (T.sizeof == ushort.sizeof)
                __atomic_store_2(cast(shared)dest, *cast(ushort*)&value, order);
            else static if (T.sizeof == uint.sizeof)
                __atomic_store_4(cast(shared)dest, *cast(uint*)&value, order);
            else static if (T.sizeof == ulong.sizeof && GNU_Have_64Bit_Atomics)
                __atomic_store_8(cast(shared)dest, *cast(ulong*)&value, order);
            else static if (GNU_Have_LibAtomic)
                __atomic_store(T.sizeof, cast(shared)dest, cast(void*)&value, order);
            else
                static assert(0, "Invalid template type specified.");
        }
        else
        {
            getAtomicMutex.lock();
            *dest = value;
            getAtomicMutex.unlock();
        }
    }
    T atomicFetchAdd(MemoryOrder order = MemoryOrder.seq, bool result = true, T)(T* dest, T value) pure nothrow @nogc @trusted
        if (is(T : ulong))
    {
        static if (GNU_Have_Atomics || GNU_Have_LibAtomic)
        {
            static if (T.sizeof == ubyte.sizeof)
                return __atomic_fetch_add_1(cast(shared)dest, value, order);
            else static if (T.sizeof == ushort.sizeof)
                return __atomic_fetch_add_2(cast(shared)dest, value, order);
            else static if (T.sizeof == uint.sizeof)
                return __atomic_fetch_add_4(cast(shared)dest, value, order);
            else static if (T.sizeof == ulong.sizeof && GNU_Have_64Bit_Atomics)
                return __atomic_fetch_add_8(cast(shared)dest, value, order);
            else static if (GNU_Have_LibAtomic)
                return __atomic_fetch_add(T.sizeof, cast(shared)dest, cast(void*)&value, order);
            else
                static assert(0, "Invalid template type specified.");
        }
        else
        {
            getAtomicMutex.lock();
            scope(exit) getAtomicMutex.unlock();
            T tmp = *dest;
            *dest += value;
            return tmp;
        }
    }
    T atomicFetchSub(MemoryOrder order = MemoryOrder.seq, bool result = true, T)(T* dest, T value) pure nothrow @nogc @trusted
        if (is(T : ulong))
    {
        static if (GNU_Have_Atomics || GNU_Have_LibAtomic)
        {
            static if (T.sizeof == ubyte.sizeof)
                return __atomic_fetch_sub_1(cast(shared)dest, value, order);
            else static if (T.sizeof == ushort.sizeof)
                return __atomic_fetch_sub_2(cast(shared)dest, value, order);
            else static if (T.sizeof == uint.sizeof)
                return __atomic_fetch_sub_4(cast(shared)dest, value, order);
            else static if (T.sizeof == ulong.sizeof && GNU_Have_64Bit_Atomics)
                return __atomic_fetch_sub_8(cast(shared)dest, value, order);
            else static if (GNU_Have_LibAtomic)
                return __atomic_fetch_sub(T.sizeof, cast(shared)dest, cast(void*)&value, order);
            else
                static assert(0, "Invalid template type specified.");
        }
        else
        {
            getAtomicMutex.lock();
            scope(exit) getAtomicMutex.unlock();
            T tmp = *dest;
            *dest -= value;
            return tmp;
        }
    }
    T atomicExchange(MemoryOrder order = MemoryOrder.seq, bool result = true, T)(T* dest, T value) pure nothrow @nogc @trusted
        if (is(T : ulong) || is(T == class) || is(T == interface) || is(T U : U*))
    {
        static if (GNU_Have_Atomics || GNU_Have_LibAtomic)
        {
            static if (T.sizeof == byte.sizeof)
            {
                ubyte res = __atomic_exchange_1(cast(shared)dest, *cast(ubyte*)&value, order);
                return *cast(typeof(return)*)&res;
            }
            else static if (T.sizeof == short.sizeof)
            {
                ushort res = __atomic_exchange_2(cast(shared)dest, *cast(ushort*)&value, order);
                return *cast(typeof(return)*)&res;
            }
            else static if (T.sizeof == int.sizeof)
            {
                uint res = __atomic_exchange_4(cast(shared)dest, *cast(uint*)&value, order);
                return *cast(typeof(return)*)&res;
            }
            else static if (T.sizeof == long.sizeof && GNU_Have_64Bit_Atomics)
            {
                ulong res = __atomic_exchange_8(cast(shared)dest, *cast(ulong*)&value, order);
                return *cast(typeof(return)*)&res;
            }
            else static if (GNU_Have_LibAtomic)
            {
                T res = void;
                __atomic_exchange(T.sizeof, cast(shared)dest, cast(void*)&value, &res, order);
                return res;
            }
            else
                static assert(0, "Invalid template type specified.");
        }
        else
        {
            getAtomicMutex.lock();
            scope(exit) getAtomicMutex.unlock();
            T res = *dest;
            *dest = value;
            return res;
        }
    }
    bool atomicCompareExchangeWeak(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, T* compare, T value) pure nothrow @nogc @trusted
        if (CanCAS!T)
    {
        return atomicCompareExchangeImpl!(succ, fail, true)(dest, compare, value);
    }
    bool atomicCompareExchangeStrong(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, T* compare, T value) pure nothrow @nogc @trusted
        if (CanCAS!T)
    {
        return atomicCompareExchangeImpl!(succ, fail, false)(dest, compare, value);
    }
    bool atomicCompareExchangeStrongNoResult(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, const T compare, T value) pure nothrow @nogc @trusted
        if (CanCAS!T)
    {
        return atomicCompareExchangeImpl!(succ, fail, false)(dest, cast(T*)&compare, value);
    }
    bool atomicCompareExchangeWeakNoResult(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, T)(T* dest, const T compare, T value) pure nothrow @nogc @trusted
        if (CanCAS!T)
    {
        return atomicCompareExchangeImpl!(succ, fail, true)(dest, cast(T*)&compare, value);
    }
    private bool atomicCompareExchangeImpl(MemoryOrder succ = MemoryOrder.seq, MemoryOrder fail = MemoryOrder.seq, bool weak, T)(T* dest, T* compare, T value) pure nothrow @nogc @trusted
        if (CanCAS!T)
    {
        bool res = void;
        static if (GNU_Have_Atomics || GNU_Have_LibAtomic)
        {
            static if (T.sizeof == byte.sizeof)
                res = __atomic_compare_exchange_1(cast(shared)dest, compare, *cast(ubyte*)&value,
                                                  weak, succ, fail);
            else static if (T.sizeof == short.sizeof)
                res = __atomic_compare_exchange_2(cast(shared)dest, compare, *cast(ushort*)&value,
                                                  weak, succ, fail);
            else static if (T.sizeof == int.sizeof)
                res = __atomic_compare_exchange_4(cast(shared)dest, compare, *cast(uint*)&value,
                                                  weak, succ, fail);
            else static if (T.sizeof == long.sizeof && GNU_Have_64Bit_Atomics)
                res = __atomic_compare_exchange_8(cast(shared)dest, compare, *cast(ulong*)&value,
                                                  weak, succ, fail);
            else static if (GNU_Have_LibAtomic)
                res = __atomic_compare_exchange(T.sizeof, cast(shared)dest, compare, cast(void*)&value,
                                                succ, fail);
            else
                static assert(0, "Invalid template type specified.");
        }
        else
        {
            static if (T.sizeof == byte.sizeof)
                alias U = byte;
            else static if (T.sizeof == short.sizeof)
                alias U = short;
            else static if (T.sizeof == int.sizeof)
                alias U = int;
            else static if (T.sizeof == long.sizeof)
                alias U = long;
            else
                static assert(0, "Invalid template type specified.");
            getAtomicMutex.lock();
            scope(exit) getAtomicMutex.unlock();
            if (*cast(U*)dest == *cast(U*)compare)
            {
                *dest = value;
                res = true;
            }
            else
            {
                *compare = *dest;
                res = false;
            }
        }
        return res;
    }
    void atomicFence(MemoryOrder order = MemoryOrder.seq)() pure nothrow @nogc @trusted
    {
        static if (GNU_Have_Atomics || GNU_Have_LibAtomic)
            __atomic_thread_fence(order);
        else
        {
            getAtomicMutex.lock();
            getAtomicMutex.unlock();
        }
    }
    void pause() pure nothrow @nogc @trusted
    {
        version (X86)
        {
            __builtin_ia32_pause();
        }
        else version (X86_64)
        {
            __builtin_ia32_pause();
        }
        else
        {
            // Other architectures? Some sort of nop or barrier.
        }
    }
    static if (!GNU_Have_Atomics && !GNU_Have_LibAtomic)
    {
        // Use system mutex for atomics, faking the purity of the functions so
        // that they can be used in pure/nothrow/@safe code.
        extern (C) private pure @trusted @nogc nothrow
        {
            static if (GNU_Thread_Model == ThreadModel.Posix)
            {
                import core.sys.posix.pthread;
                alias atomicMutexHandle = pthread_mutex_t;
                pragma(mangle, "pthread_mutex_init") int fakePureMutexInit(pthread_mutex_t*, pthread_mutexattr_t*);
                pragma(mangle, "pthread_mutex_lock") int fakePureMutexLock(pthread_mutex_t*);
                pragma(mangle, "pthread_mutex_unlock") int fakePureMutexUnlock(pthread_mutex_t*);
            }
            else static if (GNU_Thread_Model == ThreadModel.Win32)
            {
                import core.sys.windows.winbase;
                alias atomicMutexHandle = CRITICAL_SECTION;
                pragma(mangle, "InitializeCriticalSection") int fakePureMutexInit(CRITICAL_SECTION*);
                pragma(mangle, "EnterCriticalSection") void fakePureMutexLock(CRITICAL_SECTION*);
                pragma(mangle, "LeaveCriticalSection") int fakePureMutexUnlock(CRITICAL_SECTION*);
            }
            else
            {
                alias atomicMutexHandle = int;
            }
        }
        // Implements lock/unlock operations.
        private struct AtomicMutex
        {
            int lock() pure @trusted @nogc nothrow
            {
                static if (GNU_Thread_Model == ThreadModel.Posix)
                {
                    if (!_inited)
                    {
                        fakePureMutexInit(&_handle, null);
                        _inited = true;
                    }
                    return fakePureMutexLock(&_handle);
                }
                else
                {
                    static if (GNU_Thread_Model == ThreadModel.Win32)
                    {
                        if (!_inited)
                        {
                            fakePureMutexInit(&_handle);
                            _inited = true;
                        }
                        fakePureMutexLock(&_handle);
                    }
                    return 0;
                }
            }
            int unlock() pure @trusted @nogc nothrow
            {
                static if (GNU_Thread_Model == ThreadModel.Posix)
                    return fakePureMutexUnlock(&_handle);
                else
                {
                    static if (GNU_Thread_Model == ThreadModel.Win32)
                        fakePureMutexUnlock(&_handle);
                    return 0;
                }
            }
        private:
            atomicMutexHandle _handle;
            bool _inited;
        }
        // Internal static mutex reference.
        private AtomicMutex* _getAtomicMutex() @trusted @nogc nothrow
        {
            __gshared static AtomicMutex mutex;
            return &mutex;
        }
        // Pure alias for _getAtomicMutex.
        pragma(mangle, _getAtomicMutex.mangleof)
        private AtomicMutex* getAtomicMutex() pure @trusted @nogc nothrow @property;
    }
}
private:
version (Windows)
{
    enum RegisterReturn(T) = is(T : U[], U) || is(T : R delegate(A), R, A...);
}
enum CanCAS(T) = is(T : ulong) ||
                 is(T == class) ||
                 is(T == interface) ||
                 is(T : U*, U) ||
                 is(T : U[], U) ||
                 is(T : R delegate(A), R, A...) ||
                 (is(T == struct) && __traits(isPOD, T) &&
                  (T.sizeof <= size_t.sizeof*2 ||       // no more than 2 words
                   (T.sizeof == 16 && has128BitCAS)) && // or supports 128-bit CAS
                  (T.sizeof & (T.sizeof - 1)) == 0      // is power of 2
                 );
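// Illustrative checks of the CanCAS constraint (a hypothetical unittest, not part
// of the runtime's test suite): integral types up to ulong, class/interface
// references, pointers, slices, delegates, and small power-of-two-sized POD structs
// qualify; odd-sized structs do not.
unittest
{
    static struct TwoWords { size_t a, b; }
    static struct ThreeBytes { ubyte a, b, c; }
    static assert( CanCAS!int);
    static assert( CanCAS!(void*));
    static assert( CanCAS!(int[]));
    static assert( CanCAS!TwoWords);
    static assert(!CanCAS!ThreeBytes);
}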
template IntOrLong(T)
{
    static if (T.sizeof > 4)
        alias IntOrLong = long;
    else
        alias IntOrLong = int;
}
// NOTE: x86 loads implicitly have acquire semantics so a memory
//       barrier is only necessary on releases.
template needsLoadBarrier( MemoryOrder ms )
{
    enum bool needsLoadBarrier = ms == MemoryOrder.seq;
}
// NOTE: x86 stores implicitly have release semantics so a memory
//       barrier is only necessary on acquires.
template needsStoreBarrier( MemoryOrder ms )
{
    enum bool needsStoreBarrier = ms == MemoryOrder.seq;
}
// this is a helper to build asm blocks
string simpleFormat(string format, scope string[] args)
{
    string result;
    outer: while (format.length)
    {
        foreach (i; 0 .. format.length)
        {
            if (format[i] == '%' || format[i] == '?')
            {
                bool isQ = format[i] == '?';
                result ~= format[0 .. i++];
                assert (i < format.length, "Invalid format string");
                if (format[i] == '%' || format[i] == '?')
                {
                    assert(!isQ, "Invalid format string");
                    result ~= format[i++];
                }
                else
                {
                    int index = 0;
                    assert (format[i] >= '0' && format[i] <= '9', "Invalid format string");
                    while (i < format.length && format[i] >= '0' && format[i] <= '9')
                        index = index * 10 + (format[i++] - '0');
                    if (!isQ)
                        result ~= args[index];
                    else if (!args[index])
                    {
                        size_t j = i;
                        for (; j < format.length;)
                        {
                            if (format[j++] == '\n')
                                break;
                        }
                        i = j;
                    }
                }
                format = format[i .. $];
                continue outer;
            }
        }
        result ~= format;
        break;
    }
    return result;
}
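// A minimal usage sketch of simpleFormat (a hypothetical unittest, not part of the
// runtime's test suite): `%N` expands to args[N], and a `?N` prefix keeps the rest
// of that line only when args[N] is non-null, which is how the asm mixins above
// omit optional instructions such as result stores.
unittest
{
    // args[1] is null, so the `?1` line is dropped entirely.
    assert(simpleFormat("mov %0, 0;\n?1 setz %1;\nret;", ["EAX", null])
           == "mov EAX, 0;\nret;");
    // args[1] is provided, so the `?1` marker is consumed and the line is kept.
    assert(simpleFormat("mov %0, 0;\n?1 setz %1;\nret;", ["EAX", "AL"])
           == "mov EAX, 0;\n setz AL;\nret;");
}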