/**
 * The Atomic module is intended to provide some basic support for the so called lock-free
 * concurrent programming.
 * The current design replaces the previous Atomic module by Sean and is inspired
 * partly by the llvm atomic operations
 *
 * If no atomic ops are available an (inefficient) fallback solution is provided
 *
 * If you want unique counters or flags to communicate in multithreading settings
 * look at tango.core.sync.Counter that provides them in a better way and handles
 * better the absence of atomic ops
 *
 * Copyright: Copyright (C) 2009. Fawzi Mohamed All rights reserved.
 * License:   BSD style & AFL: $(LICENSE)
 * Authors:   Fawzi Mohamed
 */
module tango.core.sync.Atomic;
import tango.core.Traits;

version( LDC )
{
    import ldc.intrinsics;
}

private extern(C) void thread_yield();

// NOTE: Strictly speaking, the x86 supports atomic operations on
//       unaligned values.  However, this is far slower than the
//       common case, so such behavior should be prohibited.
/// Returns true when addr is an exact multiple of T.sizeof, i.e. when a value
/// of type T located at that address is naturally aligned.
private bool atomicValueIsProperlyAligned( T )( size_t addr )
{
    size_t misalignment = addr % T.sizeof;
    return misalignment == 0;
}

/// a barrier does not allow some kinds of intermixing and out of order execution
/// and ensures that all operations of one kind are executed before the operations of the other type
/// which kinds of mixing are not allowed depends on the template arguments
/// These are global barriers: the whole memory is synchronized (devices excluded if device is false)
///
/// the actual barrier enforced might be stronger than the requested one
///
/// if ll is true loads before the barrier are not allowed to mix with loads after the barrier
/// if ls is true loads before the barrier are not allowed to mix with stores after the barrier
/// if sl is true stores before the barrier are not allowed to mix with loads after the barrier
/// if ss is true stores before the barrier are not allowed to mix with stores after the barrier
/// if device is true also uncached and device memory is synchronized
///
/// barriers are typically paired
/// 
/// for example if you want to ensure that all writes
/// are done before setting a flags that communicates that an objects is initialized you would
/// need memoryBarrier(false,false,false,true) before setting the flag.
/// To read that flag before reading the rest of the object you would need a
/// memoryBarrier(true,false,false,false) after having read the flag
///
/// I believe that these two barriers are called acquire and release, but you find several
/// incompatible definitions around (some obviously wrong), so some care might be in order
/// To be safer memoryBarrier(false,true,false,true) might be used for acquire, and
/// memoryBarrier(true,false,true,false) for release which are slightly stronger.
/// 
/// these barriers are also called write barrier and read barrier respectively.
///
/// A full memory fence is (true,true,true,true) and ensures that stores and loads before the
/// barrier are done before stores and loads after it.
/// Keep in mind even with a full barrier you still normally need two of them, to avoid that the
/// other process reorders loads (for example) and still sees things in the wrong order.
version( LDC )
{
    /// LDC implementation of memoryBarrier: the five compile-time flags
    /// (ll, ls, sl, ss, device) are forwarded unchanged to the
    /// llvm_memory_barrier intrinsic.
    private void memoryBarrier(bool ll, bool ls, bool sl,bool ss,bool device=false)(){
        llvm_memory_barrier(ll,ls,sl,ss,device);
    }
} else version(D_InlineAsm_X86){
    /// x86 inline-asm implementation of memoryBarrier.
    ///
    /// The instruction is selected from the template flags:
    ///  - device set and at least one ordering flag set: cpuid
    ///  - ls, sl, or both ll and ss: mfence
    ///  - only ll: lfence
    ///  - only ss: sfence
    ///  - no flag set: no instruction is emitted
    private void memoryBarrier(bool ll, bool ls, bool sl,bool ss,bool device=false)(){
        static if (device) {
            // the flags are compile-time constants, so this runtime test is constant too
            if (ls || sl || ll || ss){
                // cpuid should sequence even more than mfence
                volatile asm {
                    mov EAX, 0; // model, stepping
                    cpuid;
                }
            }
        } else static if (ls || sl || (ll && ss)){ // use a sequencing operation like cpuid or simply cmpxch instead?
            volatile asm {
                mfence;
            }
            // this is supposedly faster and correct, but let's play it safe and use the specific instruction
            // push rax
            // xchg rax
            // pop rax
        } else static if (ll){
            // only load/load ordering was requested
            volatile asm {
                lfence;
            }
        } else static if( ss ){
            // only store/store ordering was requested
            volatile asm {
                sfence;
            }
        }
    }
} else {
    // variable written by the fallback barrier below
    private int dummy;
    // Fallback used when neither LDC nor 32 bit x86 inline asm is available.
    // acquires a lock... probably you will want to skip this
    // The template flags are ignored: every instantiation does the same thing.
    private synchronized void memoryBarrier(bool ll, bool ls, bool sl,bool ss,bool device=false)(){
        dummy=1;
    }
    // marks that the lock based fallback barrier is the one compiled in
    package enum{LockVersion=true}
}

// LockVersion is declared above only by the lock based fallback barrier;
// when no declaration is visible, define it as false so that
// `static if (LockVersion)` can be used unconditionally.
static if (!is(typeof(LockVersion))) {
    package enum{LockVersion=false}
}

/// atomic swap
/// stores newval into val and returns the previous value of val in one atomic operation
/// barriers are not implied, just atomicity!
version(LDC){
    /// LDC implementation of atomicSwap, built on the llvm_atomic_swap intrinsic.
    ///
    /// Params:
    ///     val    = memory location to exchange
    ///     newval = value stored into val
    /// Returns: the value val held before the exchange (type T, like the
    ///          other atomicSwap implementations in this module)
    T atomicSwap( T )( ref T val, T newval )
    {
        T oldval = void;
        static if (isPointerType!(T))
        {
            // pointers are exchanged as size_t sized integers
            oldval = cast(T)llvm_atomic_swap!(size_t)(cast(size_t*)&val, cast(size_t)newval);
        }
        else static if (is(T == bool))
        {
            // bool is exchanged as a ubyte; a non-zero previous byte means true
            oldval = llvm_atomic_swap!(ubyte)(cast(ubyte*)&val, newval?1:0) != 0;
        }
        else
        {
            oldval = llvm_atomic_swap!(T)(&val, newval);
        }
        return oldval;
    }
} else version(D_InlineAsm_X86) {
    /// 32 bit x86 inline-asm implementation of atomicSwap for 1, 2 and 4 byte
    /// types: newval is exchanged with the memory at &val using xchg.
    /// 8 byte types and any other size are rejected at compile time.
    ///
    /// NOTE(review): there is no return statement; the previous value is left
    /// in AL/AX/EAX by xchg and the function presumably relies on the compiler
    /// treating that register as the return value -- confirm.
    T atomicSwap( T )( ref T val, T newval )
    in {
        // NOTE: 32 bit x86 systems support 8 byte CAS, which only requires
        //       4 byte alignment, so use size_t as the align type here.
        static if( T.sizeof > size_t.sizeof )
            assert( atomicValueIsProperlyAligned!(size_t)( cast(size_t) &val ) );
        else
            assert( atomicValueIsProperlyAligned!(T)( cast(size_t) &val ) );
    } body {
        // address of the target, loaded into ECX inside the asm blocks
        T*posVal=&val;
        static if( T.sizeof == byte.sizeof ) {
            volatile asm {
                mov AL, newval;
                mov ECX, posVal;
                lock; // lock always needed to make this op atomic
                xchg [ECX], AL;
            }
        }
        else static if( T.sizeof == short.sizeof ) {
            volatile asm {
                mov AX, newval;
                mov ECX, posVal;
                lock; // lock always needed to make this op atomic
                xchg [ECX], AX;
            }
        }
        else static if( T.sizeof == int.sizeof ) {
            volatile asm {
                mov EAX, newval;
                mov ECX, posVal;
                lock; // lock always needed to make this op atomic
                xchg [ECX], EAX;
            }
        }
        else static if( T.sizeof == long.sizeof ) {
            // 8 Byte swap on 32-Bit Processor, use CAS?
            static assert( false, "Invalid template type specified, 8bytes in 32 bit mode: "~T.stringof );
        }
        else
        {
            static assert( false, "Invalid template type specified: "~T.stringof );
        }
    }
} else version (D_InlineAsm_X86_64){
    /// x86_64 inline-asm implementation of atomicSwap for 1, 2, 4 and 8 byte
    /// types: newval is exchanged with the memory at &val using xchg.
    /// Any other size is rejected at compile time.
    ///
    /// NOTE(review): there is no return statement; the previous value is left
    /// in AL/AX/EAX/RAX by xchg and the function presumably relies on the
    /// compiler treating that register as the return value -- confirm.
    T atomicSwap( T )( ref T val, T newval )
    in {
        assert( atomicValueIsProperlyAligned!(T)( cast(size_t) &val ) );
    } body {
        // address of the target, loaded into RCX inside the asm blocks
        T*posVal=&val;
        static if( T.sizeof == byte.sizeof ) {
            volatile asm {
                mov AL, newval;
                mov RCX, posVal;
                lock; // lock always needed to make this op atomic
                xchg [RCX], AL;
            }
        }
        else static if( T.sizeof == short.sizeof ) {
            volatile asm {
                mov AX, newval;
                mov RCX, posVal;
                lock; // lock always needed to make this op atomic
                xchg [RCX], AX;
            }
        }
        else static if( T.sizeof == int.sizeof ) {
            volatile asm {
                mov EAX, newval;
                mov RCX, posVal;
                lock; // lock always needed to make this op atomic
                xchg [RCX], EAX;
            }
        }
        else static if( T.sizeof == long.sizeof ) {
            volatile asm {
                mov RAX, newval;
                mov RCX, posVal;
                lock; // lock always needed to make this op atomic
                xchg [RCX], RAX;
            }
        }
        else
        {
            static assert( false, "Invalid template type specified: "~T.stringof );
        }
    }
} else {
    /// Lock based fallback for atomicSwap: while holding the monitor of
    /// typeid(T), remembers the current content of val, overwrites it with
    /// newval and hands back the remembered value.
    T atomicSwap( T )( ref T val, T newval )
    in {
        assert( atomicValueIsProperlyAligned!(T)( cast(size_t) &val ) );
    } body {
        synchronized(typeid(T)){
            T previous=val;
            val=newval;
            return previous;
        }
    }
}

//---------------------
/// atomic compare & exchange (can be used to implement everything else)
/// stores newval into val if val==equalTo in one atomic operation
/// barriers are not implied, just atomicity!
version(LDC){
    /// LDC implementation of atomicCAS, built on the llvm_atomic_cmp_swap
    /// intrinsic.
    ///
    /// Params:
    ///     val     = memory location to update
    ///     newval  = value to store when the comparison succeeds
    ///     equalTo = value val is expected to hold
    /// Returns: true when the value found in val was equal to equalTo
    bool atomicCAS( T )( ref T val, T newval, T equalTo )
    {
        T oldval = void;
        static if (isPointerType!(T))
        {
            // pointers are compared and exchanged as size_t sized integers
            oldval = cast(T)llvm_atomic_cmp_swap!(size_t)(cast(size_t*)&val, cast(size_t)equalTo, cast(size_t)newval);
        }
        else static if (is(T == bool))
        {
            // bool is handled as a ubyte; a non-zero previous byte means true
            oldval = llvm_atomic_cmp_swap!(ubyte)(cast(ubyte*)&val, equalTo?1:0, newval?1:0) != 0;
        }
        else
        {
            oldval = llvm_atomic_cmp_swap!(T)(&val, equalTo, newval);
        }
        // the exchange took place exactly when the previous value matched
        return oldval == equalTo;
    }
} else version(D_InlineAsm_X86) {
    /// 32 bit x86 inline-asm implementation of atomicCAS.
    ///
    /// 1, 2 and 4 byte types use lock cmpxchg with equalTo in AL/AX/EAX and
    /// newval in DL/DX/EDX; setz then writes the zero flag (set when the
    /// comparison matched) into AL.
    /// 8 byte types call OSAtomicCompareAndSwap64 on darwin and use the
    /// 8 byte compare-exchange instruction elsewhere.
    ///
    /// NOTE(review): the asm paths have no return statement; the result is
    /// left in AL and the function presumably relies on the compiler treating
    /// that register as the return value -- confirm.
    bool atomicCAS( T )( ref T val, T newval, T equalTo )
    in {
        // NOTE: 32 bit x86 systems support 8 byte CAS, which only requires
        //       4 byte alignment, so use size_t as the align type here.
        static if( T.sizeof > size_t.sizeof )
            assert( atomicValueIsProperlyAligned!(size_t)( cast(size_t) &val ) );
        else
            assert( atomicValueIsProperlyAligned!(T)( cast(size_t) &val ) );
    } body {
        // address of the target, loaded into ECX by the 1/2/4 byte paths
        T*posVal=&val;
        static if( T.sizeof == byte.sizeof ) {
            volatile asm {
                mov DL, newval;
                mov AL, equalTo;
                mov ECX, posVal;
                lock; // lock always needed to make this op atomic
                cmpxchg [ECX], DL;
                setz AL;
            }
        }
        else static if( T.sizeof == short.sizeof ) {
            volatile asm {
                mov DX, newval;
                mov AX, equalTo;
                mov ECX, posVal;
                lock; // lock always needed to make this op atomic
                cmpxchg [ECX], DX;
                setz AL;
            }
        }
        else static if( T.sizeof == int.sizeof ) {
            volatile asm {
                mov EDX, newval;
                mov EAX, equalTo;
                mov ECX, posVal;
                lock; // lock always needed to make this op atomic
                cmpxchg [ECX], EDX;
                setz AL;
            }
        }
        else static if( T.sizeof == long.sizeof ) {
            // 8 Byte StoreIf on 32-Bit Processor
            version(darwin){
                // NOTE(review): OSAtomicCompareAndSwap64 is not declared in the
                // visible part of this file -- confirm where it comes from
                return OSAtomicCompareAndSwap64(cast(long)equalTo, cast(long)newval,  cast(long*)&val);
            } else {
                volatile asm
                {
                    // EDI and EBX are saved here and restored at the end
                    push EDI;
                    push EBX;
                    // newval: low half in EBX, high half in ECX
                    lea EDI, newval;
                    mov EBX, [EDI];
                    mov ECX, 4[EDI];
                    // equalTo: low half in EAX, high half in EDX
                    lea EDI, equalTo;
                    mov EAX, [EDI];
                    mov EDX, 4[EDI];
                    // val is a ref parameter; this loads it into EDI to be
                    // used as the address operand below
                    mov EDI, val;
                    lock; // lock always needed to make this op atomic
                    cmpxch8b [EDI];
                    setz AL;
                    pop EBX;
                    pop EDI;
                }
            }
        }
        else
        {
            static assert( false, "Invalid template type specified: "~T.stringof );
        }
    }
} else version (D_InlineAsm_X86_64){
    /// x86_64 inline-asm implementation of atomicCAS for 1, 2, 4 and 8 byte
    /// types, using lock cmpxchg with equalTo in AL/AX/EAX/RAX and newval in
    /// DL/DX/EDX/RDX; setz then writes the zero flag (set when the comparison
    /// matched) into AL. Any other size is rejected at compile time.
    ///
    /// NOTE(review): there is no return statement; the result is left in AL
    /// and the function presumably relies on the compiler treating that
    /// register as the return value -- confirm.
    bool atomicCAS( T )( ref T val, T newval, T equalTo )
    in {
        assert( atomicValueIsProperlyAligned!(T)( cast(size_t) &val ) );
    } body {
        // address of the target, loaded into RCX inside the asm blocks
        T*posVal=&val;
        static if( T.sizeof == byte.sizeof ) {
            volatile asm {
                mov DL, newval;
                mov AL, equalTo;
                mov RCX, posVal;
                lock; // lock always needed to make this op atomic
                cmpxchg [RCX], DL;
                setz AL;
            }
        }
        else static if( T.sizeof == short.sizeof ) {
            volatile asm {
                mov DX, newval;
                mov AX, equalTo;
                mov RCX, posVal;
                lock; // lock always needed to make this op atomic
                cmpxchg [RCX], DX;
                setz AL;
            }
        }
        else static if( T.sizeof == int.sizeof ) {
            volatile asm {
                mov EDX, newval;
                mov EAX, equalTo;
                mov RCX, posVal;
                lock; // lock always needed to make this op atomic
                cmpxchg [RCX], EDX;
                setz AL;
            }
        }
        else static if( T.sizeof == long.sizeof ) {
            volatile asm {
                mov RDX, newval;
                mov RAX, equalTo;
                mov RCX, posVal;
                lock; // lock always needed to make this op atomic
                cmpxchg [RCX], RDX;
                setz AL;
            }
        }
        else
        {
            static assert( false, "Invalid template type specified: "~T.stringof );
        }
    }
} else {
    /// Lock based fallback for atomicCAS: while holding the monitor of
    /// typeid(T), overwrites val with newval if val currently equals equalTo.
    /// Returns: true when the store took place, false otherwise
    bool atomicCAS( T )( ref T val, T newval, T equalTo )
    in {
            assert( atomicValueIsProperlyAligned!(T)( cast(size_t) &val ) );
    } body {
        bool swapped=false;
        synchronized(typeid(T)){
            if(val==equalTo) {
                val=newval;
                swapped=true;
            }
        }
        return swapped;
    }
}


/// loads a value from memory
///
/// at the moment it is assumed that all aligned memory accesses are atomic
/// in the sense that all bits are consistent with some store
///
/// Only types no larger than size_t are accepted (checked at compile time)
/// and val must be aligned to T.sizeof (checked by the in contract).
///
/// Params:
///     val = memory location to read
/// Returns: the value read from val
///
/// remove this? I know no actual architecture where this would be different
T atomicLoad(T)(ref T val)
in {
        assert( atomicValueIsProperlyAligned!(T)( cast(size_t) &val ) );
        static assert(T.sizeof<=size_t.sizeof);
} body {
    // `volatile` prefixes a statement here, it does not declare anything,
    // so the result variable has to be declared explicitly first
    T res;
    volatile res=val;
    return res;
}

/// stores a value to memory
///
/// at the moment it is assumed that all aligned memory accesses are atomic
/// in the sense that a load either sees the complete store or the previous value
///
/// Only types no larger than size_t are accepted (checked at compile time)
/// and val must be aligned to T.sizeof (checked by the in contract).
///
/// Params:
///     val    = memory location to write
///     newVal = value to store into val
/// Returns: the value that was stored (newVal)
///
/// remove this? I know no actual architecture where this would be different
T atomicStore(T)(ref T val, T newVal)
in {
        assert( atomicValueIsProperlyAligned!(T)( cast(size_t) &val ) );
        static assert(T.sizeof<=size_t.sizeof);
} body {
    // write the new value to the referenced location (not the other way round)
    volatile val=newVal;
    return newVal;
}

/// increments the given value and returns the previous value with an atomic operation
///
/// some architectures might allow just increments/decrements by 1
///
/// no barriers implied, only atomicity!
version(LDC){
    /// LDC implementation of atomicAdd, built on the llvm_atomic_load_add
    /// intrinsic.
    ///
    /// Params:
    ///     val = memory location to increment
    ///     inc = amount added to val
    /// Returns: the value val held before the addition. The result of the
    ///          intrinsic is returned directly; reading val again afterwards
    ///          would be a separate, non-atomic access that yields the new
    ///          (or an even later) value.
    T atomicAdd(T)(ref T val, T inc){
        static assert( isIntegerType!(T) );
        static if (isPointerType!(T))
        {
            // pointers are handled as size_t sized integers
            return cast(T)llvm_atomic_load_add!(size_t)(cast(size_t*)&val, cast(size_t)inc);
        }
        else
        {
            return llvm_atomic_load_add!(T)(&val, cast(T)inc);
        }
    }
} else version (D_InlineAsm_X86){
    /// 32 bit x86 inline-asm implementation of atomicAdd for 1, 2 and 4 byte
    /// integer types, using lock xadd. xadd leaves the previous content of
    /// the memory operand in its register operand, which is then copied into
    /// res and returned. 8 byte types and other sizes are rejected at compile
    /// time.
    ///
    /// Params:
    ///     val  = memory location to increment
    ///     incV = amount added to val
    /// Returns: the value val held before the addition
    T atomicAdd(T)(ref T val, T incV){
        static assert( isIntegerType!(T) );
        static if (isPointerType!(T))
        {
            // NOTE(review): llvm_atomic_load_add comes from ldc.intrinsics, which
            // is only imported under version(LDC); this branch also declares no
            // res. It looks unreachable because of the static assert above -- confirm.
            llvm_atomic_load_add!(size_t)(cast(size_t*)&val, incV);
        }
        else
        {
            // address of the target, loaded into ECX inside the asm blocks
            T* posVal=&val;
            // receives the previous value from the asm blocks
            T res;
            static if (T.sizeof==1){
                volatile asm {
                    mov BL, incV;
                    mov ECX, posVal;
                    lock;
                    xadd byte ptr [ECX],BL;
                    mov byte ptr res[EBP],BL;
                }
            } else static if (T.sizeof==2){
                volatile asm {
                    mov BX, incV;
                    mov ECX, posVal;
                    lock;
                    xadd short ptr [ECX],BX;
                    mov short ptr res[EBP],BX;
                }
            } else static if (T.sizeof==4){
                volatile asm
                {
                    mov EDX, incV;
                    mov ECX, posVal;
                    lock;
                    xadd int ptr [ECX],EDX;
                    mov int ptr res[EBP],EDX;
                }
            } else static if (T.sizeof==8){
                // try to get it through CAS and a loop? is the load of 8 byte atomic?
                static assert(0,"Unsupported type size 8 in 32 bit mode");
            } else {
                static assert(0,"Unsupported type size");
            }
        }
        return res;
    }
} else version (D_InlineAsm_X86_64){
    /// x86_64 inline-asm implementation of atomicAdd for 1, 2, 4 and 8 byte
    /// integer types, using lock xadd. xadd leaves the previous content of
    /// the memory operand in its register operand, which is then copied into
    /// res and returned. Other sizes are rejected at compile time.
    ///
    /// Params:
    ///     val  = memory location to increment
    ///     incV = amount added to val
    /// Returns: the value val held before the addition
    T atomicAdd(T)(ref T val, T incV){
        static assert( isIntegerType!(T) );
        static if (isPointerType!(T))
        {
            // NOTE(review): llvm_atomic_load_add is only imported under
            // version(LDC); this branch looks unreachable because of the
            // static assert above -- confirm.
            llvm_atomic_load_add!(size_t)(cast(size_t*)&val, incV);
        }
        else
        {
            // address of the target, loaded into RCX inside the asm blocks
            T* posVal=&val;
            // receives the previous value from the asm blocks
            T res;
            // the local res is addressed through the 64 bit frame pointer RBP:
            // the 32 bit EBP would truncate the stack address in 64 bit mode
            static if (T.sizeof==1){
                volatile asm {
                    mov BL, incV;
                    mov RCX, posVal;
                    lock;
                    xadd byte ptr [RCX],BL;
                    mov byte ptr res[RBP],BL;
                }
            } else static if (T.sizeof==2){
                volatile asm {
                    mov BX, incV;
                    mov RCX, posVal;
                    lock;
                    xadd short ptr [RCX],BX;
                    mov short ptr res[RBP],BX;
                }
            } else static if (T.sizeof==4){
                volatile asm
                {
                    mov EDX, incV;
                    mov RCX, posVal;
                    lock;
                    xadd int ptr [RCX],EDX;
                    mov int ptr res[RBP],EDX;
                }
            } else static if (T.sizeof==8){
                volatile asm
                {
                    mov RBX, incV;
                    mov RCX, posVal;
                    lock; // lock always needed to make this op atomic
                    xadd qword ptr [RCX],RBX;
                    // after xadd RBX holds the previous value (the register
                    // holding the address must not be stored as the result)
                    mov qword ptr res[RBP],RBX;
                }
            } else {
                static assert(0,"Unsupported type size for type:"~T.stringof);
            }
        }
        return res;
    }
} else {
    static if (LockVersion){
        /// Lock based fallback for atomicAdd: while holding the monitor of
        /// typeid(T), adds incV to val.
        /// Returns: the value val held before the addition
        T atomicAdd(T)(ref T val, T incV){
            static assert( isIntegerType!(T) );
            T previous;
            synchronized(typeid(T)){
                previous=val;
                val+=incV;
            }
            return previous;
        }
    } else {
        /// Fallback for atomicAdd built on atomicCAS: reads val, computes the
        /// sum and retries until the compare-and-swap succeeds.
        ///
        /// Params:
        ///     val  = memory location to increment
        ///     incV = amount added to val
        /// Returns: the value val held before the successful addition
        T atomicAdd(T)(ref T val, T incV){
            static assert( isIntegerType!(T) );
            synchronized(typeid(T)){
                // newV must be declared under the name used in the loop below
                T oldV,newV;
                do{
                    volatile oldV=val;
                    newV=oldV+incV;
                } while(!atomicCAS!(T)(val,newV,oldV))
                return oldV;
            }
        }
    }
}

/// Atomically replaces val by f(val) and returns the value val held before.
///
/// f has to be pure: it can be invoked several times, once for every attempt
/// to install its result with atomicCAS. It should also be quick to compute,
/// otherwise contention becomes likely; there is no "fair" sharing between
/// fast functions (more likely to succeed) and slow ones, so do not use this
/// under high contention.
///
/// The first 200 attempts are made back to back; after that thread_yield()
/// is called before each further attempt.
T atomicOp(T)(ref T val, T delegate(T) f){
    static assert( isIntegerType!(T) );
    synchronized(typeid(T)){
        T seen,wanted;
        bool done=false;
        // phase 1: a bounded number of immediate attempts
        for (int tries=0; !done && tries<200; ++tries){
            volatile seen=val;
            wanted=f(seen);
            done=atomicCAS!(T)(val,wanted,seen);
        }
        // phase 2: give other threads a chance before every further attempt
        while (!done){
            thread_yield();
            volatile seen=val;
            wanted=f(seen);
            done=atomicCAS!(T)(val,wanted,seen);
        }
        return seen;
    }
}

// use stricter fences
// compile-time switch passed as one of the memoryBarrier template flags by
// flagGet (as sl) and by flagSet, flagOp and flagAdd (as ls)
enum{strictFences=false}

/// Reads flag and then issues a memory barrier, so that accesses following
/// the call can not happen before the flag was read.
/// Returns: the value read from flag
T flagGet(T)(ref T flag){
    T snapshot;
    volatile snapshot=flag;
    // load/load barrier, plus store/load when strictFences is set
    memoryBarrier!(true,false,strictFences,false)();
    return snapshot;
}

/// Issues a memory barrier (so that all pending writes are executed first)
/// and then atomically stores newVal into flag.
/// Returns: the result of atomicSwap, i.e. the original value of flag
T flagSet(T)(ref T flag, T newVal){
    // store/store barrier, plus load/store when strictFences is set
    memoryBarrier!(false,strictFences,false,true)();
    T previous=atomicSwap(flag,newVal);
    return previous;
}

/// Issues a memory barrier (so that all pending writes are executed first)
/// and then updates flag through atomicOp with the function op.
/// Returns: the result of atomicOp, i.e. the original value of flag
T flagOp(T)(ref T flag,T delegate(T) op){
    // store/store barrier, plus load/store when strictFences is set
    memoryBarrier!(false,strictFences,false,true)();
    T previous=atomicOp(flag,op);
    return previous;
}

/// Adds incV (1 by default) to flag through atomicAdd. Unless the lock based
/// implementation is in use (LockVersion), a memory barrier is issued first
/// so that all pending writes are executed before the update.
/// Returns: the result of atomicAdd
T flagAdd(T)(ref T flag,T incV=cast(T)1){
    static if (!LockVersion) {
        // store/store barrier, plus load/store when strictFences is set
        memoryBarrier!(false,strictFences,false,true)();
    }
    T previous=atomicAdd(flag,incV);
    return previous;
}

/// Increments val by one through atomicAdd and returns atomicAdd's result.
/// Useful for counters and for generating unique values quickly.
/// No barriers are implied.
T nextValue(T)(ref T val){
    T step=cast(T)1;
    return atomicAdd(val,step);
}



debug (Atomic)
{
        /// Smoke test compiled only with -debug=Atomic: instantiates every
        /// flag helper and nextValue for int.
        void main()
        {
                int counter;
                flagSet (counter, 1);
                auto seen = flagGet (counter);
                seen = flagOp (counter, (int v){return v;});
                seen = flagAdd (counter, 1);
                seen = flagAdd (counter,-1);
                seen = nextValue(counter);
        }
}
