cda8f0
switch to write-biased r/w locks

This is to improve fairness: A permanent flow of read acquires can
otherwise lock out eventual writers indefinitely.

This is XSA-114 / CVE-2014-9065.

Signed-off-by: Keir Fraser <keir@xen.org>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
Tested-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/common/spinlock.c
+++ b/xen/common/spinlock.c
@@ -271,112 +271,151 @@ void _spin_unlock_recursive(spinlock_t *
 
 void _read_lock(rwlock_t *lock)
 {
+    uint32_t x;
+
     check_lock(&lock->debug);
-    while ( unlikely(!_raw_read_trylock(&lock->raw)) )
-    {
-        while ( likely(_raw_rw_is_write_locked(&lock->raw)) )
+    do {
+        while ( (x = lock->lock) & RW_WRITE_FLAG )
             cpu_relax();
-    }
+    } while ( cmpxchg(&lock->lock, x, x+1) != x );
     preempt_disable();
 }
 
 void _read_lock_irq(rwlock_t *lock)
 {
+    uint32_t x;
+
     ASSERT(local_irq_is_enabled());
     local_irq_disable();
     check_lock(&lock->debug);
-    while ( unlikely(!_raw_read_trylock(&lock->raw)) )
-    {
-        local_irq_enable();
-        while ( likely(_raw_rw_is_write_locked(&lock->raw)) )
-            cpu_relax();
-        local_irq_disable();
-    }
+    do {
+        if ( (x = lock->lock) & RW_WRITE_FLAG )
+        {
+            local_irq_enable();
+            while ( (x = lock->lock) & RW_WRITE_FLAG )
+                cpu_relax();
+            local_irq_disable();
+        }
+    } while ( cmpxchg(&lock->lock, x, x+1) != x );
     preempt_disable();
 }
 
 unsigned long _read_lock_irqsave(rwlock_t *lock)
 {
+    uint32_t x;
     unsigned long flags;
+
     local_irq_save(flags);
     check_lock(&lock->debug);
-    while ( unlikely(!_raw_read_trylock(&lock->raw)) )
-    {
-        local_irq_restore(flags);
-        while ( likely(_raw_rw_is_write_locked(&lock->raw)) )
-            cpu_relax();
-        local_irq_save(flags);
-    }
+    do {
+        if ( (x = lock->lock) & RW_WRITE_FLAG )
+        {
+            local_irq_restore(flags);
+            while ( (x = lock->lock) & RW_WRITE_FLAG )
+                cpu_relax();
+            local_irq_save(flags);
+        }
+    } while ( cmpxchg(&lock->lock, x, x+1) != x );
     preempt_disable();
     return flags;
 }
 
 int _read_trylock(rwlock_t *lock)
 {
+    uint32_t x;
+
     check_lock(&lock->debug);
-    if ( !_raw_read_trylock(&lock->raw) )
-        return 0;
+    do {
+        if ( (x = lock->lock) & RW_WRITE_FLAG )
+            return 0;
+    } while ( cmpxchg(&lock->lock, x, x+1) != x );
     preempt_disable();
     return 1;
 }
 
 void _read_unlock(rwlock_t *lock)
 {
+    uint32_t x, y;
+
     preempt_enable();
-    _raw_read_unlock(&lock->raw);
+    x = lock->lock;
+    while ( (y = cmpxchg(&lock->lock, x, x-1)) != x )
+        x = y;
 }
 
 void _read_unlock_irq(rwlock_t *lock)
 {
-    preempt_enable();
-    _raw_read_unlock(&lock->raw);
+    _read_unlock(lock);
     local_irq_enable();
 }
 
 void _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
 {
-    preempt_enable();
-    _raw_read_unlock(&lock->raw);
+    _read_unlock(lock);
     local_irq_restore(flags);
 }
 
 void _write_lock(rwlock_t *lock)
 {
+    uint32_t x;
+
     check_lock(&lock->debug);
-    while ( unlikely(!_raw_write_trylock(&lock->raw)) )
-    {
-        while ( likely(_raw_rw_is_locked(&lock->raw)) )
+    do {
+        while ( (x = lock->lock) & RW_WRITE_FLAG )
            cpu_relax();
+    } while ( cmpxchg(&lock->lock, x, x|RW_WRITE_FLAG) != x );
+    while ( x != 0 )
+    {
+        cpu_relax();
+        x = lock->lock & ~RW_WRITE_FLAG;
    }
    preempt_disable();
 }
 
 void _write_lock_irq(rwlock_t *lock)
 {
+    uint32_t x;
+
     ASSERT(local_irq_is_enabled());
     local_irq_disable();
     check_lock(&lock->debug);
-    while ( unlikely(!_raw_write_trylock(&lock->raw)) )
+    do {
+        if ( (x = lock->lock) & RW_WRITE_FLAG )
+        {
+            local_irq_enable();
+            while ( (x = lock->lock) & RW_WRITE_FLAG )
+                cpu_relax();
+            local_irq_disable();
+        }
+    } while ( cmpxchg(&lock->lock, x, x|RW_WRITE_FLAG) != x );
+    while ( x != 0 )
    {
-        local_irq_enable();
-        while ( likely(_raw_rw_is_locked(&lock->raw)) )
-            cpu_relax();
-        local_irq_disable();
+        cpu_relax();
+        x = lock->lock & ~RW_WRITE_FLAG;
    }
    preempt_disable();
 }
 
 unsigned long _write_lock_irqsave(rwlock_t *lock)
 {
+    uint32_t x;
     unsigned long flags;
+
     local_irq_save(flags);
     check_lock(&lock->debug);
-    while ( unlikely(!_raw_write_trylock(&lock->raw)) )
+    do {
+        if ( (x = lock->lock) & RW_WRITE_FLAG )
+        {
+            local_irq_restore(flags);
+            while ( (x = lock->lock) & RW_WRITE_FLAG )
+                cpu_relax();
+            local_irq_save(flags);
+        }
+    } while ( cmpxchg(&lock->lock, x, x|RW_WRITE_FLAG) != x );
+    while ( x != 0 )
    {
-        local_irq_restore(flags);
-        while ( likely(_raw_rw_is_locked(&lock->raw)) )
-            cpu_relax();
-        local_irq_save(flags);
+        cpu_relax();
+        x = lock->lock & ~RW_WRITE_FLAG;
    }
    preempt_disable();
    return flags;
@@ -384,9 +423,13 @@ unsigned long _write_lock_irqsave(rwlock
 
 int _write_trylock(rwlock_t *lock)
 {
+    uint32_t x;
+
     check_lock(&lock->debug);
-    if ( !_raw_write_trylock(&lock->raw) )
-        return 0;
+    do {
+        if ( (x = lock->lock) != 0 )
+            return 0;
+    } while ( cmpxchg(&lock->lock, x, x|RW_WRITE_FLAG) != x );
     preempt_disable();
     return 1;
 }
@@ -394,33 +437,32 @@ int _write_trylock(rwlock_t *lock)
 void _write_unlock(rwlock_t *lock)
 {
     preempt_enable();
-    _raw_write_unlock(&lock->raw);
+    if ( cmpxchg(&lock->lock, RW_WRITE_FLAG, 0) != RW_WRITE_FLAG )
+        BUG();
 }
 
 void _write_unlock_irq(rwlock_t *lock)
 {
-    preempt_enable();
-    _raw_write_unlock(&lock->raw);
+    _write_unlock(lock);
     local_irq_enable();
 }
 
 void _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
 {
-    preempt_enable();
-    _raw_write_unlock(&lock->raw);
+    _write_unlock(lock);
     local_irq_restore(flags);
 }
 
 int _rw_is_locked(rwlock_t *lock)
 {
     check_lock(&lock->debug);
-    return _raw_rw_is_locked(&lock->raw);
+    return (lock->lock != 0); /* anyone in critical section? */
 }
 
 int _rw_is_write_locked(rwlock_t *lock)
 {
     check_lock(&lock->debug);
-    return _raw_rw_is_write_locked(&lock->raw);
+    return (lock->lock == RW_WRITE_FLAG); /* writer in critical section? */
 }
 
 #ifdef LOCK_PROFILE
--- a/xen/include/asm-arm/spinlock.h
+++ b/xen/include/asm-arm/spinlock.h
@@ -55,84 +55,6 @@ static always_inline int _raw_spin_trylo
     }
 }
 
-typedef struct {
-    volatile unsigned int lock;
-} raw_rwlock_t;
-
-#define _RAW_RW_LOCK_UNLOCKED { 0 }
-
-static always_inline int _raw_read_trylock(raw_rwlock_t *rw)
-{
-    unsigned long tmp, tmp2 = 1;
-
-    __asm__ __volatile__(
-"1: ldrex   %0, [%2]\n"
-"   adds    %0, %0, #1\n"
-"   strexpl %1, %0, [%2]\n"
-    : "=&r" (tmp), "+r" (tmp2)
-    : "r" (&rw->lock)
-    : "cc");
-
-    smp_mb();
-    return tmp2 == 0;
-}
-
-static always_inline int _raw_write_trylock(raw_rwlock_t *rw)
-{
-    unsigned long tmp;
-
-    __asm__ __volatile__(
-"1: ldrex   %0, [%1]\n"
-"   teq     %0, #0\n"
-"   strexeq %0, %2, [%1]"
-    : "=&r" (tmp)
-    : "r" (&rw->lock), "r" (0x80000000)
-    : "cc");
-
-    if (tmp == 0) {
-        smp_mb();
-        return 1;
-    } else {
-        return 0;
-    }
-}
-
-static inline void _raw_read_unlock(raw_rwlock_t *rw)
-{
-    unsigned long tmp, tmp2;
-
-    smp_mb();
-
-    __asm__ __volatile__(
-"1: ldrex   %0, [%2]\n"
-"   sub     %0, %0, #1\n"
-"   strex   %1, %0, [%2]\n"
-"   teq     %1, #0\n"
-"   bne     1b"
-    : "=&r" (tmp), "=&r" (tmp2)
-    : "r" (&rw->lock)
-    : "cc");
-
-    if (tmp == 0)
-        dsb_sev();
-}
-
-static inline void _raw_write_unlock(raw_rwlock_t *rw)
-{
-    smp_mb();
-
-    __asm__ __volatile__(
-    "str    %1, [%0]\n"
-    :
-    : "r" (&rw->lock), "r" (0)
-    : "cc");
-
-    dsb_sev();
-}
-
-#define _raw_rw_is_locked(x) ((x)->lock != 0)
-#define _raw_rw_is_write_locked(x) ((x)->lock == 0x80000000)
-
 #endif /* __ASM_SPINLOCK_H */
 /*
  * Local variables:
--- a/xen/include/asm-x86/spinlock.h
+++ b/xen/include/asm-x86/spinlock.h
@@ -31,58 +31,4 @@ static always_inline int _raw_spin_trylo
     return (oldval > 0);
 }
 
-typedef struct {
-    volatile int lock;
-} raw_rwlock_t;
-
-#define RW_WRITE_BIAS 0x7fffffff
-#define _RAW_RW_LOCK_UNLOCKED /*(raw_rwlock_t)*/ { 0 }
-
-static always_inline int _raw_read_trylock(raw_rwlock_t *rw)
-{
-    int acquired;
-
-    asm volatile (
-        "    lock; decl %0         \n"
-        "    jns 2f                \n"
-#ifdef __clang__ /* clang's builtin assember can't do .subsection */
-        "1:  .pushsection .fixup,\"ax\"\n"
-#else
-        "1:  .subsection 1         \n"
-#endif
-        "2:  lock; incl %0         \n"
-        "    decl %1               \n"
-        "    jmp 1b                \n"
-#ifdef __clang__
-        "    .popsection           \n"
-#else
-        "    .subsection 0         \n"
-#endif
-        : "=m" (rw->lock), "=r" (acquired) : "1" (1) : "memory" );
-
-    return acquired;
-}
-
-static always_inline int _raw_write_trylock(raw_rwlock_t *rw)
-{
-    return (cmpxchg(&rw->lock, 0, RW_WRITE_BIAS) == 0);
-}
-
-static always_inline void _raw_read_unlock(raw_rwlock_t *rw)
-{
-    asm volatile (
-        "lock ; incl %0"
-        : "=m" ((rw)->lock) : : "memory" );
-}
-
-static always_inline void _raw_write_unlock(raw_rwlock_t *rw)
-{
-    asm volatile (
-        "lock ; subl %1,%0"
-        : "=m" ((rw)->lock) : "i" (RW_WRITE_BIAS) : "memory" );
-}
-
-#define _raw_rw_is_locked(x) ((x)->lock != 0)
-#define _raw_rw_is_write_locked(x) ((x)->lock > 0)
-
 #endif /* __ASM_SPINLOCK_H */
--- a/xen/include/xen/spinlock.h
+++ b/xen/include/xen/spinlock.h
@@ -141,11 +141,13 @@ typedef struct spinlock {
 #define spin_lock_init(l) (*(l) = (spinlock_t)SPIN_LOCK_UNLOCKED)
 
 typedef struct {
-    raw_rwlock_t raw;
+    volatile uint32_t lock;
     struct lock_debug debug;
 } rwlock_t;
 
-#define RW_LOCK_UNLOCKED { _RAW_RW_LOCK_UNLOCKED, _LOCK_DEBUG }
+#define RW_WRITE_FLAG (1u<<31)
+
+#define RW_LOCK_UNLOCKED { 0, _LOCK_DEBUG }
 #define DEFINE_RWLOCK(l) rwlock_t l = RW_LOCK_UNLOCKED
 #define rwlock_init(l) (*(l) = (rwlock_t)RW_LOCK_UNLOCKED)