
/*
 * Copyright (C) Igor Sysoev
 * Copyright (C) NGINX, Inc.
 */

#ifndef _NXT_ATOMIC_H_INCLUDED_
#define _NXT_ATOMIC_H_INCLUDED_


/*
 * nxt_atomic_try_lock() must set an acquire barrier on lock.
 * nxt_atomic_xchg() must set an acquire barrier.
 * nxt_atomic_release() must set a release barrier.
 */
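
/*
 * A minimal usage sketch (illustration only, not part of this header):
 * a test-and-set spin lock built on the primitives described above.
 * The nxt_example_spin_lock() and nxt_example_spin_unlock() names are
 * hypothetical.  nxt_atomic_try_lock() supplies the acquire barrier on
 * success and nxt_atomic_release() supplies the release barrier.
 *
 *     static void
 *     nxt_example_spin_lock(nxt_atomic_t *lock)
 *     {
 *         while (!nxt_atomic_try_lock(lock)) {
 *             nxt_cpu_pause();
 *         }
 *     }
 *
 *     static void
 *     nxt_example_spin_unlock(nxt_atomic_t *lock)
 *     {
 *         nxt_atomic_release(lock);
 *     }
 */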

#if (NXT_HAVE_GCC_ATOMIC) /* GCC 4.1 builtin atomic operations */

typedef intptr_t                    nxt_atomic_int_t;
typedef uintptr_t                   nxt_atomic_uint_t;
typedef volatile nxt_atomic_uint_t  nxt_atomic_t;

/*
 * __sync_bool_compare_and_swap() is a full barrier.
 * __sync_lock_test_and_set() is an acquire barrier.
 * __sync_lock_release() is a release barrier.
 */

#define nxt_atomic_cmp_set(lock, cmp, set)                                    \
    __sync_bool_compare_and_swap(lock, cmp, set)


#define nxt_atomic_xchg(lock, set)                                            \
    __sync_lock_test_and_set(lock, set)


#define nxt_atomic_fetch_add(value, add)                                      \
    __sync_fetch_and_add(value, add)


#define nxt_atomic_try_lock(lock)                                             \
    nxt_atomic_cmp_set(lock, 0, 1)


#define nxt_atomic_release(lock)                                              \
    __sync_lock_release(lock)


#define nxt_atomic_or_fetch(ptr, val)                                         \
    __sync_or_and_fetch(ptr, val)


#define nxt_atomic_and_fetch(ptr, val)                                        \
    __sync_and_and_fetch(ptr, val)


#if (__i386__ || __i386 || __amd64__ || __amd64)
#define nxt_cpu_pause()                                                       \
    __asm__ ("pause")

#else
#define nxt_cpu_pause()
#endif
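

/*
 * A usage sketch (illustration only, not part of this header):
 * nxt_atomic_fetch_add() returns the previous value, so a reference
 * counter can detect when the last reference is dropped.  The
 * nxt_example_ref_put() name is hypothetical.
 *
 *     static nxt_bool_t
 *     nxt_example_ref_put(nxt_atomic_t *refcount)
 *     {
 *         return (nxt_atomic_fetch_add(refcount, -1) == 1);
 *     }
 */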


#elif (NXT_HAVE_SOLARIS_ATOMIC) /* Solaris 10 */

#include <atomic.h>

typedef long                        nxt_atomic_int_t;
typedef ulong_t                     nxt_atomic_uint_t;
typedef volatile nxt_atomic_uint_t  nxt_atomic_t;


#define nxt_atomic_cmp_set(lock, cmp, set)                                    \
    (atomic_cas_ulong(lock, cmp, set) == (ulong_t) cmp)


#define nxt_atomic_xchg(lock, set)                                            \
    atomic_add_swap(lock, set)


#define nxt_atomic_fetch_add(value, add)                                      \
    (atomic_add_long_nv(value, add) - add)


#define nxt_atomic_or_fetch(ptr, val)                                         \
    atomic_or_ulong_nv(ptr, val)


#define nxt_atomic_and_fetch(ptr, val)                                        \
    atomic_and_ulong_nv(ptr, val)


/*
 * Solaris uses the SPARC Total Store Order model.  In this model:
 * 1) Each atomic load-store instruction behaves as if it were followed by
 *    #LoadLoad, #LoadStore, and #StoreStore barriers.
 * 2) Each load instruction behaves as if it were followed by
 *    #LoadLoad and #LoadStore barriers.
 * 3) Each store instruction behaves as if it were followed by
 *    a #StoreStore barrier.
 *
 * On x86_64, atomic instructions set a full barrier and ordinary
 * instructions set implicit #LoadLoad, #LoadStore, and #StoreStore barriers.
 *
 * An acquire barrier requires at least #LoadLoad and #LoadStore barriers,
 * and they are provided by an atomic load-store instruction.
 *
 * A release barrier requires at least #LoadStore and #StoreStore barriers,
 * so a lock release does not require an explicit barrier: all load
 * instructions in the critical section are followed by an implicit
 * #LoadStore barrier and all store instructions are followed by an
 * implicit #StoreStore barrier.
 */

#define nxt_atomic_try_lock(lock)                                             \
    nxt_atomic_cmp_set(lock, 0, 1)


#define nxt_atomic_release(lock)                                              \
    do { *lock = 0; } while (0)
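

/*
 * An ordering sketch (illustration only, not part of this header): under
 * TSO the release store needs no explicit barrier because earlier loads
 * and stores in the critical section cannot be reordered past it.  The
 * "shared" and "v" names are hypothetical.
 *
 *     shared = v;        implicit #StoreStore barrier follows the store
 *     *lock = 0;         the release store becomes visible last
 */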


/*
 * The "rep; nop" is used instead of "pause" to omit the "[ PAUSE ]" hardware
 * capability added by linker since Solaris ld.so.1 does not know about it:
 *
 *   ld.so.1: ...: fatal: hardware capability unsupported: 0x2000 [ PAUSE ]
 */

#if (__i386__ || __i386 || __amd64__ || __amd64)
#define nxt_cpu_pause()                                                       \
    __asm__ ("rep; nop")

#else
#define nxt_cpu_pause()
#endif


/* elif (NXT_HAVE_MACOSX_ATOMIC) */

/*
 * The atomic(3) interface was introduced in MacOS 10.4 (Tiger) and
 * extended in 10.5 (Leopard).  However, its support is omitted because:
 *
 * 1) the interface is still incomplete:
 *    *) there are OSAtomicAdd32Barrier() and OSAtomicAdd64Barrier()
 *       but no OSAtomicAddLongBarrier();
 *    *) there is no interface for the XCHG operation.
 *
 * 2) the interface is tuned for non-SMP systems: it omits the LOCK prefix
 *    on single-CPU systems, but nowadays MacOSX systems are at least
 *    dual-core.  Thus these indirect calls just add overhead compared
 *    with the inlined atomic operations supported by GCC and Clang on
 *    modern MacOSX systems.
 */


#elif (NXT_HAVE_XLC_ATOMIC) /* XL C/C++ V8.0 for AIX */

#if (NXT_64BIT)

typedef long                        nxt_atomic_int_t;
typedef unsigned long               nxt_atomic_uint_t;
typedef volatile nxt_atomic_int_t   nxt_atomic_t;


nxt_inline nxt_bool_t
nxt_atomic_cmp_set(nxt_atomic_t *lock, nxt_atomic_int_t cmp,
    nxt_atomic_int_t set)
{
    nxt_atomic_int_t  old;

    old = cmp;

    return __compare_and_swaplp(lock, &old, set);
}


#define nxt_atomic_xchg(lock, set)                                            \
    __fetch_and_swaplp(lock, set)


#define nxt_atomic_fetch_add(value, add)                                      \
    __fetch_and_addlp(value, add)


#else /* NXT_32BIT */

typedef int                         nxt_atomic_int_t;
typedef unsigned int                nxt_atomic_uint_t;
typedef volatile nxt_atomic_int_t   nxt_atomic_t;


nxt_inline nxt_bool_t
nxt_atomic_cmp_set(nxt_atomic_t *lock, nxt_atomic_int_t cmp,
    nxt_atomic_int_t set)
{
    nxt_atomic_int_t  old;

    old = cmp;

    return __compare_and_swap(lock, &old, set);
}


#define nxt_atomic_xchg(lock, set)                                            \
    __fetch_and_swap(lock, set)


#define nxt_atomic_fetch_add(value, add)                                      \
    __fetch_and_add(value, add)


#endif /* NXT_32BIT */


/*
 * __lwsync() is a "lwsync" instruction that sets #LoadLoad, #LoadStore,
 * and #StoreStore barriers.
 *
 * __compare_and_swap() is a pair of "ldarx" and "stdcx" instructions.
 * A "lwsync" does not set a #StoreLoad barrier, so it cannot be used after
 * this pair, since a subsequent load inside the critical section can be
 * performed after the "ldarx" instruction but before the "stdcx" instruction.
 * However, this subsequent load will read correct data, because otherwise
 * the "ldarx/stdcx" pair would fail and the data would be discarded.
 * Nevertheless, the "isync" instruction is used to be on the safe side.
 *
 * A full barrier can be set with __sync(), a "sync" instruction, but there
 * is also a faster __isync(), an "isync" instruction.  This instruction is
 * not a memory barrier but an instruction barrier.  An "isync" instruction
 * causes the processor to complete execution of all previous instructions
 * and then to discard instructions (which may have begun execution) following
 * the "isync".  After the "isync" is executed, the following instructions
 * then begin execution.  The "isync" is used to ensure that the loads
 * following entry into a critical section are not performed (because of
 * aggressive out-of-order or speculative execution in the processor) until
 * the lock is granted.
 */

nxt_inline nxt_bool_t
nxt_atomic_try_lock(nxt_atomic_t *lock)
{
    if (nxt_atomic_cmp_set(lock, 0, 1)) {
        __isync();
        return 1;
    }

    return 0;
}


#define nxt_atomic_release(lock)                                              \
    do { __lwsync(); *lock = 0; } while (0)
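

/*
 * An instruction-level sketch (illustration only, not part of this header)
 * of the resulting PowerPC sequences:
 *
 *     nxt_atomic_try_lock():   "ldarx"/"stdcx" loop, then "isync" on
 *                              success, so critical section loads cannot
 *                              start before the lock is granted;
 *
 *     nxt_atomic_release():    "lwsync", then a plain store of zero, so
 *                              all prior loads and stores complete before
 *                              the lock is seen as free.
 */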


#define nxt_cpu_pause()


#endif /* NXT_HAVE_XLC_ATOMIC */


#endif /* _NXT_ATOMIC_H_INCLUDED_ */