5d225ec
From a58d0ff4935ef14f32f01d4de362bba242f07e0c Mon Sep 17 00:00:00 2001
5d225ec
From: Larry Gritz <lg@larrygritz.com>
5d225ec
Date: Sat, 4 May 2013 10:22:12 -0700
5d225ec
Subject: [PATCH] spinlock tweaks that finally make it as good or better than
5d225ec
 TBB.
5d225ec
5d225ec
---
5d225ec
 src/include/thread.h                 | 89 ++++++++++++++++--------------------
5d225ec
 src/libOpenImageIO/atomic_test.cpp   |  9 ++--
5d225ec
 src/libOpenImageIO/spinlock_test.cpp | 22 +++++++--
5d225ec
 src/libtexture/imagecache_pvt.h      |  2 +-
5d225ec
 4 files changed, 62 insertions(+), 60 deletions(-)
5d225ec
5d225ec
diff --git a/src/include/thread.h b/src/include/thread.h
5d225ec
index 28645fc..2cd03c1 100644
5d225ec
--- a/src/include/thread.h
5d225ec
+++ b/src/include/thread.h
5d225ec
@@ -78,16 +78,22 @@
5d225ec
 // Some day, we hope this is all replaced by use of std::atomic<>.
5d225ec
 #if USE_TBB
5d225ec
 #  include <tbb/atomic.h>
5d225ec
-   using tbb::atomic;
5d225ec
 #  include <tbb/spin_mutex.h>
5d225ec
+#  define USE_TBB_ATOMIC 1
5d225ec
+#  define USE_TBB_SPINLOCK 1
5d225ec
+#else
5d225ec
+#  define USE_TBB_ATOMIC 0
5d225ec
+#  define USE_TBB_SPINLOCK 0
5d225ec
 #endif
5d225ec
 
5d225ec
+
5d225ec
 #if defined(_MSC_VER) && !USE_TBB
5d225ec
 #  include <windows.h>
5d225ec
 #  include <winbase.h>
5d225ec
 #  pragma intrinsic (_InterlockedExchangeAdd)
5d225ec
 #  pragma intrinsic (_InterlockedCompareExchange)
5d225ec
 #  pragma intrinsic (_InterlockedCompareExchange64)
5d225ec
+#  pragma intrinsic (_ReadWriteBarrier)
5d225ec
 #  if defined(_WIN64)
5d225ec
 #    pragma intrinsic(_InterlockedExchangeAdd64)
5d225ec
 #  endif
5d225ec
@@ -105,10 +111,6 @@
5d225ec
 #  endif
5d225ec
 #endif
5d225ec
 
5d225ec
-#ifdef __APPLE__
5d225ec
-#  include <libkern/OSAtomic.h>
5d225ec
-#endif
5d225ec
-
5d225ec
 #if defined(__GNUC__) && (defined(_GLIBCXX_ATOMIC_BUILTINS) || (__GNUC__ * 100 + __GNUC_MINOR__ >= 401))
5d225ec
 #if !defined(__FreeBSD__) || defined(__x86_64__)
5d225ec
 #define USE_GCC_ATOMICS
5d225ec
@@ -230,9 +232,6 @@ class thread_specific_ptr {
5d225ec
 #elif USE_TBB
5d225ec
     atomic<int> *a = (atomic<int> *)at;
5d225ec
     return a->fetch_and_add (x);
5d225ec
-#elif defined(no__APPLE__)
5d225ec
-    // Apple, not inline for Intel (only PPC?)
5d225ec
-    return OSAtomicAdd32Barrier (x, at) - x;
5d225ec
 #elif defined(_MSC_VER)
5d225ec
     // Windows
5d225ec
     return _InterlockedExchangeAdd ((volatile LONG *)at, x);
5d225ec
@@ -251,9 +250,6 @@ class thread_specific_ptr {
5d225ec
 #elif USE_TBB
5d225ec
     atomic<long long> *a = (atomic<long long> *)at;
5d225ec
     return a->fetch_and_add (x);
5d225ec
-#elif defined(no__APPLE__)
5d225ec
-    // Apple, not inline for Intel (only PPC?)
5d225ec
-    return OSAtomicAdd64Barrier (x, at) - x;
5d225ec
 #elif defined(_MSC_VER)
5d225ec
     // Windows
5d225ec
 #  if defined(_WIN64)
5d225ec
@@ -282,8 +278,6 @@ class thread_specific_ptr {
5d225ec
 #elif USE_TBB
5d225ec
     atomic<int> *a = (atomic<int> *)at;
5d225ec
     return a->compare_and_swap (newval, compareval) == newval;
5d225ec
-#elif defined(no__APPLE__)
5d225ec
-    return OSAtomicCompareAndSwap32Barrier (compareval, newval, at);
5d225ec
 #elif defined(_MSC_VER)
5d225ec
     return (_InterlockedCompareExchange ((volatile LONG *)at, newval, compareval) == compareval);
5d225ec
 #else
5d225ec
@@ -301,8 +295,6 @@ class thread_specific_ptr {
5d225ec
 #elif USE_TBB
5d225ec
     atomic<long long> *a = (atomic<long long> *)at;
5d225ec
     return a->compare_and_swap (newval, compareval) == newval;
5d225ec
-#elif defined(no__APPLE__)
5d225ec
-    return OSAtomicCompareAndSwap64Barrier (compareval, newval, at);
5d225ec
 #elif defined(_MSC_VER)
5d225ec
     return (_InterlockedCompareExchange64 ((volatile LONGLONG *)at, newval, compareval) == compareval);
5d225ec
 #else
5d225ec
@@ -317,9 +309,7 @@ class thread_specific_ptr {
5d225ec
 inline void
5d225ec
 yield ()
5d225ec
 {
5d225ec
-#if USE_TBB
5d225ec
-    __TBB_Yield ();
5d225ec
-#elif defined(__GNUC__)
5d225ec
+#if defined(__GNUC__)
5d225ec
     sched_yield ();
5d225ec
 #elif defined(_MSC_VER)
5d225ec
     SwitchToThread ();
5d225ec
@@ -334,12 +324,12 @@ class thread_specific_ptr {
5d225ec
 inline void
5d225ec
 pause (int delay)
5d225ec
 {
5d225ec
-#if USE_TBB
5d225ec
-    __TBB_Pause(delay);
5d225ec
-#elif defined(__GNUC__)
5d225ec
+#if defined(__GNUC__)
5d225ec
     for (int i = 0; i < delay; ++i) {
5d225ec
         __asm__ __volatile__("pause;");
5d225ec
     }
5d225ec
+#elif USE_TBB
5d225ec
+    __TBB_Pause(delay);
5d225ec
 #elif defined(_MSC_VER)
5d225ec
     for (int i = 0; i < delay; ++i) {
5d225ec
 #if defined (_WIN64)
5d225ec
@@ -369,14 +359,17 @@ class atomic_backoff {
5d225ec
             yield();
5d225ec
         }
5d225ec
     }
5d225ec
+
5d225ec
 private:
5d225ec
     int m_count;
5d225ec
 };
5d225ec
 
5d225ec
 
5d225ec
 
5d225ec
-#if (! USE_TBB)
5d225ec
-// If we're not using TBB, we need to define our own atomic<>.
5d225ec
+#if USE_TBB_ATOMIC
5d225ec
+using tbb::atomic;
5d225ec
+#else
5d225ec
+// If we're not using TBB's atomic, we need to define our own atomic<>.
5d225ec
 
5d225ec
 
5d225ec
 /// Atomic integer.  Increment, decrement, add, and subtract in a
5d225ec
@@ -456,7 +449,7 @@ class atomic {
5d225ec
 };
5d225ec
 
5d225ec
 
5d225ec
-#endif /* ! USE_TBB */
5d225ec
+#endif /* ! USE_TBB_ATOMIC */
5d225ec
 
5d225ec
 
5d225ec
 #ifdef NOTHREADS
5d225ec
@@ -478,7 +471,7 @@ class atomic {
5d225ec
 typedef null_mutex spin_mutex;
5d225ec
 typedef null_lock<spin_mutex> spin_lock;
5d225ec
 
5d225ec
-#elif USE_TBB
5d225ec
+#elif USE_TBB_SPINLOCK
5d225ec
 
5d225ec
 // Use TBB's spin locks
5d225ec
 typedef tbb::spin_mutex spin_mutex;
5d225ec
@@ -529,63 +522,61 @@ class spin_mutex {
5d225ec
     /// Acquire the lock, spin until we have it.
5d225ec
     ///
5d225ec
     void lock () {
5d225ec
-#if defined(no__APPLE__)
5d225ec
-        // OS X has dedicated spin lock routines, may as well use them.
5d225ec
-        OSSpinLockLock ((OSSpinLock *)&m_locked);
5d225ec
-#else
5d225ec
         // To avoid spinning too tightly, we use the atomic_backoff to
5d225ec
         // provide increasingly longer pauses, and if the lock is under
5d225ec
         // lots of contention, eventually yield the timeslice.
5d225ec
         atomic_backoff backoff;
5d225ec
+
5d225ec
         // Try to get ownership of the lock. Though experimentation, we
5d225ec
         // found that OIIO_UNLIKELY makes this just a bit faster on 
5d225ec
         // gcc x86/x86_64 systems.
5d225ec
         while (! OIIO_UNLIKELY(try_lock())) {
5d225ec
             do {
5d225ec
                 backoff();
5d225ec
-            } while (*(volatile int *)&m_locked);
5d225ec
+            } while (m_locked);
5d225ec
+
5d225ec
             // The full try_lock() involves a compare_and_swap, which
5d225ec
             // writes memory, and that will lock the bus.  But a normal
5d225ec
             // read of m_locked will let us spin until the value
5d225ec
             // changes, without locking the bus. So it's faster to
5d225ec
             // check in this manner until the mutex appears to be free.
5d225ec
         }
5d225ec
-#endif
5d225ec
     }
5d225ec
 
5d225ec
     /// Release the lock that we hold.
5d225ec
     ///
5d225ec
     void unlock () {
5d225ec
-#if defined(no__APPLE__)
5d225ec
-        OSSpinLockUnlock ((OSSpinLock *)&m_locked);
5d225ec
-#elif defined(__GNUC__)
5d225ec
-        // GCC gives us an intrinsic that is even better, an atomic
5d225ec
-        // assignment of 0 with "release" barrier semantics.
5d225ec
-        __sync_lock_release ((volatile int *)&m_locked);
5d225ec
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
5d225ec
+        // Fastest way: compiler barrier, then a plain store (a "release" store on x86)
5d225ec
+        __asm__ __volatile__("": : :"memory");
5d225ec
+        m_locked = 0;
5d225ec
+        // N.B. GCC gives us an intrinsic meant for exactly this, an atomic
5d225ec
+        // assignment of 0 with "release" barrier semantics:
5d225ec
+        //  __sync_lock_release (&m_locked);
5d225ec
+        // But empirically we found it not as performant as the above.
5d225ec
+#elif defined(_MSC_VER)
5d225ec
+        _ReadWriteBarrier();
5d225ec
+        m_locked = 0;
5d225ec
 #else
5d225ec
         // Otherwise, just assign zero to the atomic (but that's a full 
5d225ec
         // memory barrier).
5d225ec
-        m_locked = 0;
5d225ec
+        *(atomic_int *)&m_locked = 0;
5d225ec
 #endif
5d225ec
     }
5d225ec
 
5d225ec
     /// Try to acquire the lock.  Return true if we have it, false if
5d225ec
     /// somebody else is holding the lock.
5d225ec
     bool try_lock () {
5d225ec
-#if defined(no__APPLE__)
5d225ec
-        return OSSpinLockTry ((OSSpinLock *)&m_locked);
5d225ec
-#else
5d225ec
-#  if USE_TBB
5d225ec
+#if USE_TBB_ATOMIC
5d225ec
         // TBB's compare_and_swap returns the original value
5d225ec
-        return m_locked.compare_and_swap (0, 1) == 0;
5d225ec
-#  elif defined(__GNUC__)
5d225ec
+        return (*(atomic_int *)&m_locked).compare_and_swap (0, 1) == 0;
5d225ec
+#elif defined(__GNUC__)
5d225ec
         // GCC gives us an intrinsic that is even better -- an atomic
5d225ec
         // exchange with "acquire" barrier semantics.
5d225ec
-        return __sync_lock_test_and_set ((volatile int *)&m_locked, 1) == 0;
5d225ec
-#  else
5d225ec
+        return __sync_lock_test_and_set (&m_locked, 1) == 0;
5d225ec
+#else
5d225ec
         // Our compare_and_swap returns true if it swapped
5d225ec
-        return m_locked.bool_compare_and_swap (0, 1);
5d225ec
-#  endif
5d225ec
+        return atomic_compare_and_exchange (&m_locked, 0, 1);
5d225ec
 #endif
5d225ec
     }
5d225ec
 
5d225ec
@@ -603,7 +594,7 @@ class spin_mutex {
5d225ec
     };
5d225ec
 
5d225ec
 private:
5d225ec
-    atomic_int m_locked;  ///< Atomic counter is zero if nobody holds the lock
5d225ec
+    volatile int m_locked;  ///< Atomic counter is zero if nobody holds the lock
5d225ec
 };
5d225ec
 
5d225ec
 
5d225ec
diff --git a/src/libOpenImageIO/atomic_test.cpp b/src/libOpenImageIO/atomic_test.cpp
5d225ec
index 2c1e807..42d469a 100644
5d225ec
--- a/src/libOpenImageIO/atomic_test.cpp
5d225ec
+++ b/src/libOpenImageIO/atomic_test.cpp
5d225ec
@@ -49,7 +49,7 @@
5d225ec
 // and decrementing the crap out of it, and make sure it has the right
5d225ec
 // value at the end.
5d225ec
 
5d225ec
-static int iterations = 160000000;
5d225ec
+static int iterations = 40000000;
5d225ec
 static int numthreads = 16;
5d225ec
 static int ntrials = 1;
5d225ec
 static bool verbose = false;
5d225ec
@@ -184,16 +184,15 @@ int main (int argc, char *argv[])
5d225ec
 
5d225ec
     static int threadcounts[] = { 1, 2, 4, 8, 12, 16, 20, 24, 28, 32, 64, 128, 1024, 1<<30 };
5d225ec
     for (int i = 0; threadcounts[i] <= numthreads; ++i) {
5d225ec
-        int nt = threadcounts[i];
5d225ec
+        int nt = wedge ? threadcounts[i] : numthreads;
5d225ec
         int its = iterations/nt;
5d225ec
 
5d225ec
         double range;
5d225ec
         double t = time_trial (boost::bind(test_atomics,nt,its),
5d225ec
                                ntrials, &range);
5d225ec
 
5d225ec
-        std::cout << Strutil::format ("%2d\t%s\t%5.1fs, range %.1f\t(%d iters/thread)\n",
5d225ec
-                                      nt, Strutil::timeintervalformat(t),
5d225ec
-                                      t, range, its);
5d225ec
+        std::cout << Strutil::format ("%2d\t%5.1f   range %.2f\t(%d iters/thread)\n",
5d225ec
+                                      nt, t, range, its);
5d225ec
         if (! wedge)
5d225ec
             break;    // don't loop if we're not wedging
5d225ec
     }
5d225ec
diff --git a/src/libOpenImageIO/spinlock_test.cpp b/src/libOpenImageIO/spinlock_test.cpp
5d225ec
index 60c192b..64adbce 100644
5d225ec
--- a/src/libOpenImageIO/spinlock_test.cpp
5d225ec
+++ b/src/libOpenImageIO/spinlock_test.cpp
5d225ec
@@ -50,7 +50,7 @@
5d225ec
 // accumulated value is equal to iterations*threads, then the spin locks
5d225ec
 // worked.
5d225ec
 
5d225ec
-static int iterations = 160000000;
5d225ec
+static int iterations = 40000000;
5d225ec
 static int numthreads = 16;
5d225ec
 static int ntrials = 1;
5d225ec
 static bool verbose = false;
5d225ec
@@ -58,6 +58,7 @@
5d225ec
 
5d225ec
 static spin_mutex print_mutex;  // make the prints not clobber each other
5d225ec
 volatile long long accum = 0;
5d225ec
+float faccum = 0;
5d225ec
 spin_mutex mymutex;
5d225ec
 
5d225ec
 
5d225ec
@@ -71,10 +72,22 @@
5d225ec
         std::cout << "thread " << boost::this_thread::get_id() 
5d225ec
                   << ", accum = " << accum << "\n";
5d225ec
     }
5d225ec
+#if 1
5d225ec
     for (int i = 0;  i < iterations;  ++i) {
5d225ec
         spin_lock lock (mymutex);
5d225ec
         accum += 1;
5d225ec
     }
5d225ec
+#else
5d225ec
+    // Alternate version that mixes in some math to lengthen the lock hold
5d225ec
+    // time and add more work between locks.  Interesting contrast in timings.
5d225ec
+    float last = 0.0f;
5d225ec
+    for (int i = 0;  i < iterations;  ++i) {
5d225ec
+        last = fmodf (sinf(last), 1.0f);
5d225ec
+        spin_lock lock (mymutex);
5d225ec
+        accum += 1;
5d225ec
+        faccum = fmod (sinf(faccum+last), 1.0f);
5d225ec
+    }
5d225ec
+#endif
5d225ec
 }
5d225ec
 
5d225ec
 
5d225ec
@@ -134,16 +147,15 @@ int main (int argc, char *argv[])
5d225ec
 
5d225ec
     static int threadcounts[] = { 1, 2, 4, 8, 12, 16, 20, 24, 28, 32, 64, 128, 1024, 1<<30 };
5d225ec
     for (int i = 0; threadcounts[i] <= numthreads; ++i) {
5d225ec
-        int nt = threadcounts[i];
5d225ec
+        int nt = wedge ? threadcounts[i] : numthreads;
5d225ec
         int its = iterations/nt;
5d225ec
 
5d225ec
         double range;
5d225ec
         double t = time_trial (boost::bind(test_spinlock,nt,its),
5d225ec
                                ntrials, &range);
5d225ec
 
5d225ec
-        std::cout << Strutil::format ("%2d\t%s\t%5.1fs, range %.1f\t(%d iters/thread)\n",
5d225ec
-                                      nt, Strutil::timeintervalformat(t),
5d225ec
-                                      t, range, its);
5d225ec
+        std::cout << Strutil::format ("%2d\t%5.1f   range %.2f\t(%d iters/thread)\n",
5d225ec
+                                      nt, t, range, its);
5d225ec
         if (! wedge)
5d225ec
             break;    // don't loop if we're not wedging
5d225ec
     }
5d225ec
diff --git a/src/libtexture/imagecache_pvt.h b/src/libtexture/imagecache_pvt.h
5d225ec
index 5d29782..3a49616 100644
5d225ec
--- a/src/libtexture/imagecache_pvt.h
5d225ec
+++ b/src/libtexture/imagecache_pvt.h
5d225ec
@@ -1003,7 +1003,7 @@ class ImageCacheImpl : public ImageCache {
5d225ec
             newval = oldval + incr;
5d225ec
             // Now try to atomically swap it, and repeat until we've
5d225ec
             // done it with nobody else interfering.
5d225ec
-#  if USE_TBB
5d225ec
+#  if USE_TBB_ATOMIC
5d225ec
         } while (llstat->compare_and_swap (*llnewval,*lloldval) != *lloldval);
5d225ec
 #  else
5d225ec
         } while (llstat->bool_compare_and_swap (*llnewval,*lloldval));
5d225ec
-- 
5d225ec
1.8.1.6
5d225ec