+ // The inline assembly below was tested on a 64-bit ARMv8 platform on
+ // 2015-02-06 and found to run in 11.8 nanoseconds, whereas
+ // __sync_add_and_fetch( address, 1 ) required 18.8 nanoseconds.
+ // Writing the assembly inline here produced one fewer instruction than
+ // wrapping it in a function and letting the compiler inline it here.
+#if defined __aarch64__
+
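+ // Load-exclusive / store-exclusive retry loop: ldxr loads mCount and marks
+ // the address for exclusive access, add increments the loaded value, and
+ // stxr attempts the store, writing 0 to w2 on success or non-zero if
+ // exclusivity was lost, in which case cbnz branches back to retry.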
+ asm volatile(
+ "1:\tldxr w1, %[address] \n\t"
+ "add w1, w1, %[one] \n\t"
+ "stxr w2, w1, %[address] \n\t"
+ "cbnz w2, 1b \n\t"
+ // Outputs:
+ : // Q = A memory address with no offset
+ // ( https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html#Machine-Constraints )
+ [address] "+Q" (mCount)
+ // Inputs:
+ : [one] "Ir" (1)
+ // Clobbers: w1 holds the loaded/incremented value and w2 holds the
+ // stxr success flag (0 on success), so both are listed explicitly:
+ : "w1", "w2"
+ );
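+
+ // For comparison: on toolchains that provide the newer __atomic builtins
+ // (GCC 4.7 and later), a roughly equivalent barrier-free increment would be
+ // __atomic_fetch_add( &mCount, 1, __ATOMIC_RELAXED ); using __ATOMIC_SEQ_CST
+ // instead would reintroduce the memory barriers this code avoids.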
+
+ // 32-bit ARMv7 version of the above:
+ // Faster than the code GCC emits for __sync_add_and_fetch(), which brackets
+ // the loop with two dmb memory-barrier instructions (one before and one
+ // after). Omitting the barriers keeps the increment atomic but gives up the
+ // memory-ordering guarantees that the builtin provides.
+#elif defined __arm__
+
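+ // Same load-exclusive / store-exclusive retry loop using the 32-bit
+ // ldrex / strex instructions; strex writes 0 to r2 on success, so the
+ // teq / bne pair repeats the loop until the store succeeds.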
+ asm volatile(
+ "1:\tldrex r1, %[address] \n\t"
+ "add r1, r1, %[one] \n\t"
+ "strex r2, r1, %[address] \n\t"
+ "teq r2, %[zero] \n\t"
+ "bne 1b \n\t"
+ // Outputs:
+ : [address] "+Q" (mCount)
+ // Inputs:
+ : [zero] "Ir" (0),
+ [one] "Ir" (1)
+ // Clobbers: r1 holds the loaded/incremented value, r2 holds the strex
+ // success flag, and "cc" is listed because teq updates the condition flags:
+ : "r1", "r2", "cc"
+ );
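+
+ // ( cbnz, used in the 64-bit version above, is only available in the Thumb
+ //   instruction set on ARMv7, so this version branches with teq / bne. )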
+
+#else
+
+ // Fallback: GCC 4.1 and later builtin atomic add and fetch:
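+ // ( __sync_add_and_fetch() also acts as a full memory barrier and returns
+ //   the incremented value. )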
+ __sync_add_and_fetch( &mCount, 1 );
+
+#endif