carlwgeorge / rpms / qemu

Forked from rpms/qemu a year ago
Clone
a7b9285
From efb8ea48a730325b4bb4cfd1b624e697b1522536 Mon Sep 17 00:00:00 2001
Alon Levy 408bdb5
From: Paolo Bonzini <pbonzini@redhat.com>
Alon Levy 408bdb5
Date: Fri, 22 Feb 2013 17:36:27 +0100
a7b9285
Subject: [PATCH] migration: run pending/iterate callbacks out of big lock
Alon Levy 408bdb5
Alon Levy 408bdb5
This makes it possible to do blocking writes directly to the socket,
Alon Levy 408bdb5
with no buffer in the middle.  For RAM, only the migration_bitmap_sync()
Alon Levy 408bdb5
call needs the iothread lock.  For block migration, it is needed by
Alon Levy 408bdb5
the block layer (including bdrv_drain_all and dirty bitmap access),
Alon Levy 408bdb5
but because some code is shared between iterate and complete, all of
Alon Levy 408bdb5
mig_save_device_dirty is run with the lock taken.
Alon Levy 408bdb5
Alon Levy 408bdb5
In the savevm case, the iterate callback runs within the big lock.
Alon Levy 408bdb5
This is annoying because it complicates the rules.  Luckily we do not
Alon Levy 408bdb5
need to do anything about it: the RAM iterate callback does not need
Alon Levy 408bdb5
the iothread lock, and block migration never runs during savevm.
Alon Levy 408bdb5
Alon Levy 408bdb5
Reviewed-by: Orit Wasserman <owasserm@redhat.com>
Alon Levy 408bdb5
Reviewed-by: Juan Quintela <quintela@redhat.com>
Alon Levy 408bdb5
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Alon Levy 408bdb5
Signed-off-by: Juan Quintela <quintela@redhat.com>
a7b9285
(cherry picked from commit 32c835ba3984728c22d4e73cdb595090a60f437e)
Alon Levy 408bdb5
---
Alon Levy 408bdb5
 arch_init.c                 |  4 ++++
Alon Levy 408bdb5
 block-migration.c           | 37 +++++++++++++++++++++++++++++++++++--
Alon Levy 408bdb5
 include/migration/vmstate.h | 11 +++++++++++
Alon Levy 408bdb5
 migration.c                 |  4 ++--
Alon Levy 408bdb5
 4 files changed, 52 insertions(+), 4 deletions(-)
Alon Levy 408bdb5
Alon Levy 408bdb5
diff --git a/arch_init.c b/arch_init.c
Alon Levy 408bdb5
index 8daeafa..32b4378 100644
Alon Levy 408bdb5
--- a/arch_init.c
Alon Levy 408bdb5
+++ b/arch_init.c
Alon Levy 408bdb5
@@ -379,6 +379,8 @@ static inline bool migration_bitmap_set_dirty(MemoryRegion *mr,
Alon Levy 408bdb5
     return ret;
Alon Levy 408bdb5
 }
Alon Levy 408bdb5
 
Alon Levy 408bdb5
+/* Needs iothread lock! */
Alon Levy 408bdb5
+
Alon Levy 408bdb5
 static void migration_bitmap_sync(void)
Alon Levy 408bdb5
 {
Alon Levy 408bdb5
     RAMBlock *block;
Alon Levy 408bdb5
@@ -690,7 +692,9 @@ static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
Alon Levy 408bdb5
     remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
Alon Levy 408bdb5
 
Alon Levy 408bdb5
     if (remaining_size < max_size) {
Alon Levy 408bdb5
+        qemu_mutex_lock_iothread();
Alon Levy 408bdb5
         migration_bitmap_sync();
Alon Levy 408bdb5
+        qemu_mutex_unlock_iothread();
Alon Levy 408bdb5
         remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
Alon Levy 408bdb5
     }
Alon Levy 408bdb5
     return remaining_size;
Alon Levy 408bdb5
diff --git a/block-migration.c b/block-migration.c
Alon Levy 408bdb5
index b726c6c..8da5f86 100644
Alon Levy 408bdb5
--- a/block-migration.c
Alon Levy 408bdb5
+++ b/block-migration.c
Alon Levy 408bdb5
@@ -107,6 +107,10 @@ static void blk_mig_unlock(void)
Alon Levy 408bdb5
     qemu_mutex_unlock(&block_mig_state.lock);
Alon Levy 408bdb5
 }
Alon Levy 408bdb5
 
Alon Levy 408bdb5
+/* Must run outside of the iothread lock during the bulk phase,
Alon Levy 408bdb5
+ * or the VM will stall.
Alon Levy 408bdb5
+ */
Alon Levy 408bdb5
+
Alon Levy 408bdb5
 static void blk_send(QEMUFile *f, BlkMigBlock * blk)
Alon Levy 408bdb5
 {
Alon Levy 408bdb5
     int len;
Alon Levy 408bdb5
@@ -226,6 +230,8 @@ static void blk_mig_read_cb(void *opaque, int ret)
Alon Levy 408bdb5
     blk_mig_unlock();
Alon Levy 408bdb5
 }
Alon Levy 408bdb5
 
Alon Levy 408bdb5
+/* Called with no lock taken.  */
Alon Levy 408bdb5
+
Alon Levy 408bdb5
 static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
Alon Levy 408bdb5
 {
Alon Levy 408bdb5
     int64_t total_sectors = bmds->total_sectors;
Alon Levy 408bdb5
@@ -235,11 +241,13 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
Alon Levy 408bdb5
     int nr_sectors;
Alon Levy 408bdb5
 
Alon Levy 408bdb5
     if (bmds->shared_base) {
Alon Levy 408bdb5
+        qemu_mutex_lock_iothread();
Alon Levy 408bdb5
         while (cur_sector < total_sectors &&
Alon Levy 408bdb5
                !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH,
Alon Levy 408bdb5
                                   &nr_sectors)) {
Alon Levy 408bdb5
             cur_sector += nr_sectors;
Alon Levy 408bdb5
         }
Alon Levy 408bdb5
+        qemu_mutex_unlock_iothread();
Alon Levy 408bdb5
     }
Alon Levy 408bdb5
 
Alon Levy 408bdb5
     if (cur_sector >= total_sectors) {
Alon Levy 408bdb5
@@ -272,15 +280,19 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
Alon Levy 408bdb5
     block_mig_state.submitted++;
Alon Levy 408bdb5
     blk_mig_unlock();
Alon Levy 408bdb5
 
Alon Levy 408bdb5
+    qemu_mutex_lock_iothread();
Alon Levy 408bdb5
     blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
Alon Levy 408bdb5
                                 nr_sectors, blk_mig_read_cb, blk);
Alon Levy 408bdb5
 
Alon Levy 408bdb5
     bdrv_reset_dirty(bs, cur_sector, nr_sectors);
Alon Levy 408bdb5
-    bmds->cur_sector = cur_sector + nr_sectors;
Alon Levy 408bdb5
+    qemu_mutex_unlock_iothread();
Alon Levy 408bdb5
 
Alon Levy 408bdb5
+    bmds->cur_sector = cur_sector + nr_sectors;
Alon Levy 408bdb5
     return (bmds->cur_sector >= total_sectors);
Alon Levy 408bdb5
 }
Alon Levy 408bdb5
 
Alon Levy 408bdb5
+/* Called with iothread lock taken.  */
Alon Levy 408bdb5
+
Alon Levy 408bdb5
 static void set_dirty_tracking(int enable)
Alon Levy 408bdb5
 {
Alon Levy 408bdb5
     BlkMigDevState *bmds;
Alon Levy 408bdb5
@@ -336,6 +348,8 @@ static void init_blk_migration(QEMUFile *f)
Alon Levy 408bdb5
     bdrv_iterate(init_blk_migration_it, NULL);
Alon Levy 408bdb5
 }
Alon Levy 408bdb5
 
Alon Levy 408bdb5
+/* Called with no lock taken.  */
Alon Levy 408bdb5
+
Alon Levy 408bdb5
 static int blk_mig_save_bulked_block(QEMUFile *f)
Alon Levy 408bdb5
 {
Alon Levy 408bdb5
     int64_t completed_sector_sum = 0;
Alon Levy 408bdb5
@@ -382,6 +396,8 @@ static void blk_mig_reset_dirty_cursor(void)
Alon Levy 408bdb5
     }
Alon Levy 408bdb5
 }
Alon Levy 408bdb5
 
Alon Levy 408bdb5
+/* Called with iothread lock taken.  */
Alon Levy 408bdb5
+
Alon Levy 408bdb5
 static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
Alon Levy 408bdb5
                                  int is_async)
Alon Levy 408bdb5
 {
Alon Levy 408bdb5
@@ -451,7 +467,9 @@ error:
Alon Levy 408bdb5
     return ret;
Alon Levy 408bdb5
 }
Alon Levy 408bdb5
 
Alon Levy 408bdb5
-/* return value:
Alon Levy 408bdb5
+/* Called with iothread lock taken.
Alon Levy 408bdb5
+ *
Alon Levy 408bdb5
+ * return value:
Alon Levy 408bdb5
  * 0: too much data for max_downtime
Alon Levy 408bdb5
  * 1: few enough data for max_downtime
Alon Levy 408bdb5
 */
Alon Levy 408bdb5
@@ -470,6 +488,8 @@ static int blk_mig_save_dirty_block(QEMUFile *f, int is_async)
Alon Levy 408bdb5
     return ret;
Alon Levy 408bdb5
 }
Alon Levy 408bdb5
 
Alon Levy 408bdb5
+/* Called with no locks taken.  */
Alon Levy 408bdb5
+
Alon Levy 408bdb5
 static int flush_blks(QEMUFile *f)
Alon Levy 408bdb5
 {
Alon Levy 408bdb5
     BlkMigBlock *blk;
Alon Levy 408bdb5
@@ -509,6 +529,8 @@ static int flush_blks(QEMUFile *f)
Alon Levy 408bdb5
     return ret;
Alon Levy 408bdb5
 }
Alon Levy 408bdb5
 
Alon Levy 408bdb5
+/* Called with iothread lock taken.  */
Alon Levy 408bdb5
+
Alon Levy 408bdb5
 static int64_t get_remaining_dirty(void)
Alon Levy 408bdb5
 {
Alon Levy 408bdb5
     BlkMigDevState *bmds;
Alon Levy 408bdb5
@@ -521,6 +543,8 @@ static int64_t get_remaining_dirty(void)
Alon Levy 408bdb5
     return dirty << BDRV_SECTOR_BITS;
Alon Levy 408bdb5
 }
Alon Levy 408bdb5
 
Alon Levy 408bdb5
+/* Called with iothread lock taken.  */
Alon Levy 408bdb5
+
Alon Levy 408bdb5
 static void blk_mig_cleanup(void)
Alon Levy 408bdb5
 {
Alon Levy 408bdb5
     BlkMigDevState *bmds;
Alon Levy 408bdb5
@@ -600,7 +624,12 @@ static int block_save_iterate(QEMUFile *f, void *opaque)
Alon Levy 408bdb5
             }
Alon Levy 408bdb5
             ret = 0;
Alon Levy 408bdb5
         } else {
Alon Levy 408bdb5
+            /* Always called with the iothread lock taken for
Alon Levy 408bdb5
+             * simplicity, because block_save_complete also calls it.
Alon Levy 408bdb5
+             */
Alon Levy 408bdb5
+            qemu_mutex_lock_iothread();
Alon Levy 408bdb5
             ret = blk_mig_save_dirty_block(f, 1);
Alon Levy 408bdb5
+            qemu_mutex_unlock_iothread();
Alon Levy 408bdb5
         }
Alon Levy 408bdb5
         if (ret < 0) {
Alon Levy 408bdb5
             return ret;
Alon Levy 408bdb5
@@ -622,6 +651,8 @@ static int block_save_iterate(QEMUFile *f, void *opaque)
Alon Levy 408bdb5
     return qemu_ftell(f) - last_ftell;
Alon Levy 408bdb5
 }
Alon Levy 408bdb5
 
Alon Levy 408bdb5
+/* Called with iothread lock taken.  */
Alon Levy 408bdb5
+
Alon Levy 408bdb5
 static int block_save_complete(QEMUFile *f, void *opaque)
Alon Levy 408bdb5
 {
Alon Levy 408bdb5
     int ret;
Alon Levy 408bdb5
@@ -665,6 +696,7 @@ static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
Alon Levy 408bdb5
     /* Estimate pending number of bytes to send */
Alon Levy 408bdb5
     uint64_t pending;
Alon Levy 408bdb5
 
Alon Levy 408bdb5
+    qemu_mutex_lock_iothread();
Alon Levy 408bdb5
     blk_mig_lock();
Alon Levy 408bdb5
     pending = get_remaining_dirty() +
Alon Levy 408bdb5
                        block_mig_state.submitted * BLOCK_SIZE +
Alon Levy 408bdb5
@@ -675,6 +707,7 @@ static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
Alon Levy 408bdb5
         pending = BLOCK_SIZE;
Alon Levy 408bdb5
     }
Alon Levy 408bdb5
     blk_mig_unlock();
Alon Levy 408bdb5
+    qemu_mutex_unlock_iothread();
Alon Levy 408bdb5
 
Alon Levy 408bdb5
     DPRINTF("Enter save live pending  %" PRIu64 "\n", pending);
Alon Levy 408bdb5
     return pending;
Alon Levy 408bdb5
diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
Alon Levy 408bdb5
index 6229569..5f803f5 100644
Alon Levy 408bdb5
--- a/include/migration/vmstate.h
Alon Levy 408bdb5
+++ b/include/migration/vmstate.h
Alon Levy 408bdb5
@@ -30,14 +30,25 @@ typedef void SaveStateHandler(QEMUFile *f, void *opaque);
Alon Levy 408bdb5
 typedef int LoadStateHandler(QEMUFile *f, void *opaque, int version_id);
Alon Levy 408bdb5
 
Alon Levy 408bdb5
 typedef struct SaveVMHandlers {
Alon Levy 408bdb5
+    /* This runs inside the iothread lock.  */
Alon Levy 408bdb5
     void (*set_params)(const MigrationParams *params, void * opaque);
Alon Levy 408bdb5
     SaveStateHandler *save_state;
Alon Levy 408bdb5
 
Alon Levy 408bdb5
     int (*save_live_setup)(QEMUFile *f, void *opaque);
Alon Levy 408bdb5
     void (*cancel)(void *opaque);
Alon Levy 408bdb5
     int (*save_live_complete)(QEMUFile *f, void *opaque);
Alon Levy 408bdb5
+
Alon Levy 408bdb5
+    /* This runs both outside and inside the iothread lock.  */
Alon Levy 408bdb5
     bool (*is_active)(void *opaque);
Alon Levy 408bdb5
+
Alon Levy 408bdb5
+    /* This runs outside the iothread lock in the migration case, and
Alon Levy 408bdb5
+     * within the lock in the savevm case.  The callback had better only
Alon Levy 408bdb5
+     * use data that is local to the migration thread or protected
Alon Levy 408bdb5
+     * by other locks.
Alon Levy 408bdb5
+     */
Alon Levy 408bdb5
     int (*save_live_iterate)(QEMUFile *f, void *opaque);
Alon Levy 408bdb5
+
Alon Levy 408bdb5
+    /* This runs outside the iothread lock!  */
Alon Levy 408bdb5
     uint64_t (*save_live_pending)(QEMUFile *f, void *opaque, uint64_t max_size);
Alon Levy 408bdb5
 
Alon Levy 408bdb5
     LoadStateHandler *load_state;
Alon Levy 408bdb5
diff --git a/migration.c b/migration.c
Alon Levy 408bdb5
index 437475b..87b5009 100644
Alon Levy 408bdb5
--- a/migration.c
Alon Levy 408bdb5
+++ b/migration.c
Alon Levy 408bdb5
@@ -670,7 +670,6 @@ static void *buffered_file_thread(void *opaque)
Alon Levy 408bdb5
         uint64_t pending_size;
Alon Levy 408bdb5
 
Alon Levy 408bdb5
         if (s->bytes_xfer < s->xfer_limit) {
Alon Levy 408bdb5
-            qemu_mutex_lock_iothread();
Alon Levy 408bdb5
             DPRINTF("iterate\n");
Alon Levy 408bdb5
             pending_size = qemu_savevm_state_pending(s->file, max_size);
Alon Levy 408bdb5
             DPRINTF("pending size %lu max %lu\n", pending_size, max_size);
Alon Levy 408bdb5
@@ -678,6 +677,7 @@ static void *buffered_file_thread(void *opaque)
Alon Levy 408bdb5
                 qemu_savevm_state_iterate(s->file);
Alon Levy 408bdb5
             } else {
Alon Levy 408bdb5
                 DPRINTF("done iterating\n");
Alon Levy 408bdb5
+                qemu_mutex_lock_iothread();
Alon Levy 408bdb5
                 start_time = qemu_get_clock_ms(rt_clock);
Alon Levy 408bdb5
                 qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
Alon Levy 408bdb5
                 old_vm_running = runstate_is_running();
Alon Levy 408bdb5
@@ -685,8 +685,8 @@ static void *buffered_file_thread(void *opaque)
Alon Levy 408bdb5
                 s->xfer_limit = INT_MAX;
Alon Levy 408bdb5
                 qemu_savevm_state_complete(s->file);
Alon Levy 408bdb5
                 last_round = true;
Alon Levy 408bdb5
+                qemu_mutex_unlock_iothread();
Alon Levy 408bdb5
             }
Alon Levy 408bdb5
-            qemu_mutex_unlock_iothread();
Alon Levy 408bdb5
         }
Alon Levy 408bdb5
 
Alon Levy 408bdb5
         current_time = qemu_get_clock_ms(rt_clock);