diff --git a/ChangeLog b/ChangeLog
index d70edbd..e445890 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,4 +1,218 @@
 
+* Wed Jun 24 2015 Andrew Beekhof <andrew@beekhof.net> Pacemaker-1.1.13-1
+- Update source tarball to revision: 2a1847e
+- Changesets: 750
+- Diff:       156 files changed, 11323 insertions(+), 3725 deletions(-)
+
+- Features added since Pacemaker-1.1.12
+  + Allow fail-counts to be removed en masse when the new attrd is in operation
+  + attrd supports private attributes (not written to CIB)
+  + crmd: Ensure a watchdog device is in use if stonith-watchdog-timeout is configured
+  + crmd: If configured, trigger the watchdog immediately if we lose quorum and no-quorum-policy=suicide
+  + crm_diff: Support generating a difference without version details if --no-version/-u is supplied
+  + crm_resource: Implement an intelligent restart capability
+  + Fencing: Advertise the watchdog device for fencing operations
+  + Fencing: Allow the cluster to recover resources if the watchdog is in use
+  + fencing: cl#5134 - Support random fencing delay to avoid double fencing
+  + mcp: Allow orphan children to initiate node panic via SIGQUIT
+  + mcp: Turn on sbd integration if pacemakerd finds it running
+  + mcp: Two new error codes that result in machine reset or power off
+  + Officially support the resource-discovery attribute for location constraints
+  + PE: Allow natural ordering of colocation sets
+  + PE: Support non-actionable degraded mode for OCF
+  + pengine: cl#5207 - Display "UNCLEAN" for resources running on unclean offline nodes
+  + remote: pcmk remote client tool for use with container wrapper script
+  + Support machine panics for some kinds of errors (via sbd if available)
+  + tools: add crm_resource --wait option
+  + tools: attrd_updater supports --query and --all options
+  + tools: attrd_updater: Allow attributes to be set for other nodes
+
+- Changes since Pacemaker-1.1.12
+  + pengine: exclusive discovery implies rsc is only allowed on exclusive subset of nodes
+  + acl: Correctly implement the 'reference' acl directive
+  + acl: Do not delay evaluation of added nodes in some situations
+  + attrd: b22b1fe did uuid test too early
+  + attrd: Clean out the node cache when requested by the admin
+  + attrd: fixes double free in attrd legacy
+  + attrd: properly write attributes for peers once uuid is discovered
+  + attrd: refresh should force an immediate write-out of all attributes
+  + attrd: Simplify how node deletions happen
+  + Bug rhbz#1067544 - Tools: Correctly handle --ban, --move and --locate for master/slave groups
+  + Bug rhbz#1181824 - Ensure the DC can be reliably fenced
+  + cib: Ability to upgrade cib validation schema in legacy mode
+  + cib: Always generate digests for cib diffs in legacy mode
+  + cib: assignment where comparison intended
+  + cib: Avoid nodeid conflicts we don't care about
+  + cib: Correctly add "update-origin", "update-client" and "update-user" attributes for cib
+  + cib: Correctly set up signal handlers
+  + cib: Correctly track node state
+  + cib: Do not update on disk backups if we're just querying them
+  + cib: Enable cib legacy mode for plugin-based clusters
+  + cib: Ensure file-based backends treat '-o section' consistently with the native backend
+  + cib: Ensure upgrade operations from a non-DC get an acknowledgement
+  + cib: No need to enforce cib digests for v2 diffs in legacy mode
+  + cib: Revert d153b86 to instantly get cib synchronized in legacy mode
+  + cib: tls sock cleanup for remote cib connections
+  + cli: Ensure subsequent unknown long options are correctly detected
+  + cluster: Invoke crm_remove_conflicting_peer() only when the new node's uname is being assigned in the node cache
+  + common: Increment current and age for lib common as a result of APIs being added
+  + corosync:  Bug cl#5232 - Somewhat gracefully handle nodes with invalid UUIDs
+  + corosync: Avoid unnecessary repeated CMAP API calls
+  + crmd/pengine: handle on-fail=ignore properly
+  + crmd: Add "on_node" attribute for *_last_failure_0 lrm resource operations
+  + crmd: All peers need to track node shutdown requests
+  + crmd: Cached copies of transient attributes cease to be valid once a node leaves the membership
+  + crmd: Correctly add the local option that validates against schema for pengine to calculate
+  + crmd: Disable debug logging that results in significant overhead
+  + crmd: do not remove connection resources during re-probe
+  + crmd: don't update fail count twice for same failure
+  + crmd: Ensure remote connection resources timeout properly during 'migrate_from' action
+  + crmd: Ensure throttle_mode() does something on Linux
+  + crmd: Fixes crash when remote connection migration fails
+  + crmd: gracefully handle remote node disconnects during op execution
+  + crmd: Handle remote connection failures while executing ops on remote connection
+  + crmd: include remote nodes when forcing cluster wide resource reprobe
+  + crmd: never stop recurring monitor ops for pcmk remote during incomplete migration
+  + crmd: Prevent the old version of DC from being fenced when it shuts down for rolling-upgrade
+  + crmd: Prevent use-of-NULL during reprobe
+  + crmd: properly update job limit for baremetal remote-nodes
+  + crmd: Remote-node throttle jobs count towards cluster-node hosting connection rsc
+  + crmd: Reset stonith failcount to recover transitioner when the node rejoins
+  + crmd: resolves memory leak in crmd.
+  + crmd: respect start-failure-is-fatal even for artificially injected events
+  + crmd: Wait for all pending operations to complete before poking the policy engine
+  + crmd: When container's host is fenced, cancel in-flight operations
+  + crm_attribute: Correctly update config options when -o crm_config is specified
+  + crm_failcount: Better error reporting when no resource is specified
+  + crm_mon: add exit reason to resource failure output
+  + crm_mon: Fill CRM_notify_node in traps with node's uname rather than node's id if possible
+  + crm_mon: Repair notification delivery when the v2 patch format is in use
+  + crm_node: Correctly remove nodes from the CIB by nodeid
+  + crm_report: More patterns for finding logs on non-DC nodes
+  + crm_resource: Allow resource restart operations to be node specific
+  + crm_resource: avoid deletion of lrm cache on node with resource discovery disabled.
+  + crm_resource: Calculate how long to wait for a restart based on the resource timeouts
+  + crm_resource: Clean up memory in --restart error paths
+  + crm_resource: Display the locations of all anonymous clone children when supplying the children's common ID
+  + crm_resource: Ensure --restart sets/clears meta attributes
+  + crm_resource: Ensure fail-counts are purged when we redetect the state of all resources
+  + crm_resource: Implement --timeout for resource restart operations
+  + crm_resource: Include group members when calculating the next timeout
+  + crm_resource: Memory leak in error paths
+  + crm_resource: Prevent use-after-free
+  + crm_resource: Repair regression test outputs
+  + crm_resource: Use-after-free when restarting a resource
+  + dbus: ref count leaks
+  + dbus: Ensure both the read and write queues get dispatched
+  + dbus: Fail gracefully if malloc fails
+  + dbus: handle dispatch queue when multiple replies need to be processed
+  + dbus: Notice when dbus connections get disabled
+  + dbus: Remove double-free introduced while trying to make coverity shut up
+  + ensure if B is colocated with A, B can never run without A
+  + fence_legacy: Avoid passing 'port' to cluster-glue agents
+  + fencing: Allow nodes to be purged from the member cache
+  + fencing: Correctly make args for fencing agents
+  + fencing: Correctly wait for self-fencing to occur when the watchdog is in use
+  + fencing: Ensure the hostlist parameter is set for watchdog agents
+  + fencing: Force 'stonith-ng' as the system name
+  + fencing: Gracefully handle invalid metadata from agents
+  + fencing: If configured, wait stonith-watchdog-timer seconds for self-fencing to complete
+  + fencing: Reject actions for devices that haven't been explicitly registered yet
+  + ipc: properly allocate server enforced buffer size on client
+  + ipc: use server enforced buffer during ipc client send
+  + lrmd, services: interpret LSB status codes properly
+  + lrmd: add back support for class heartbeat agents
+  + lrmd: cancel pending async connection during disconnect
+  + lrmd: enable ipc proxy for docker-wrapper privileged mode
+  + lrmd: fix rescheduling of systemd monitor op during start
+  + lrmd: Handle systemd reporting 'done' before a resource is actually stopped
+  + lrmd: Hint to child processes that using sd_notify is not required
+  + lrmd: Log with the correct personality
+  + lrmd: Prevent glib assert triggered by timers being removed from mainloop more than once
+  + lrmd: report original timeout when systemd operation completes
+  + lrmd: store failed operation exit reason in cib
+  + mainloop: resolves race condition mainloop poll involving modification of ipc connections
+  + make targeted reprobe for remote node work, crm_resource -C -N <remote node>
+  + mcp: Allow a configurable delay when debugging shutdown issues
+  + mcp: Avoid requiring 'export' for SYS-V sysconfig options
+  + Membership: Detect and resolve nodes that change their ID
+  + pacemakerd: resolves memory leak of xml structure in pacemakerd
+  + pengine: ability to launch resources in isolated containers
+  + pengine: add #kind=remote for baremetal remote-nodes
+  + pengine: allow baremetal remote-nodes to recover without requiring fencing when cluster-node fails
+  + pengine: allow remote-nodes to be placed in maintenance mode
+  + pengine: Avoid trailing whitespaces when printing resource state
+  + pengine: cl#5130 - Choose nodes capable of running all the colocated utilization resources
+  + pengine: cl#5130 - Only check the capacities of the nodes that are allowed to run the resource
+  + pengine: Correctly compare feature set to determine how to unpack meta attributes
+  + pengine: disable migrations for resources with isolation containers
+  + pengine: disable reloading of resources within isolated container wrappers
+  + pengine: Do not aggregate children in a pending state into the started/stopped/etc lists
+  + pengine: Do not record duplicate copies of the failed actions
+  + pengine: Do not reschedule monitors that are no longer needed while resource definitions have changed
+  + pengine: Fence baremetal remote when recurring monitor op fails
+  + pengine: Fix colocation with unmanaged resources
+  + pengine: Fix the behaviors of multi-state resources with asymmetrical ordering
+  + pengine: fixes pengine crash with orphaned remote node connection resource
+  + pengine: fixes segfault caused by malformed log warning
+  + pengine: handle cloned isolated resources in a sane way
+  + pengine: handle isolated resource scenario, cloned group of isolated resources
+  + pengine: Handle ordering between stateful and migratable resources
+  + pengine: imply stop in container node resources when host node is fenced
+  + pengine: only fence baremetal remote when connection fails or cannot be recovered
+  + pengine: only kill process group on timeout when on-fail does not equal block.
+  + pengine: per-node control over resource discovery
+  + pengine: prefer migration target for remote node connections
+  + pengine: prevent disabling rsc discovery per node in certain situations
+  + pengine: Prevent use-after-free in sort_rsc_process_order()
+  + pengine: properly handle ordering during remote connection partial migration
+  + pengine: properly recover remote-nodes when cluster-node proxy goes offline
+  + pengine: remove unnecessary whitespace from notify environment variables
+  + pengine: require-all feature for ordered clones
+  + pengine: Resolve memory leaks
+  + pengine: resource discovery mode for location constraints
+  + pengine: restart master instances on instance attribute changes
+  + pengine: Turn off legacy unpacking of resource options into the meta hashtable
+  + pengine: Watchdog integration is sufficient for fencing
+  + Perform systemd reloads asynchronously
+  + ping: Correctly advertise multiplier default
+  + Prefer to inherit the watchdog timeout from SBD
+  + properly record stop args after reload
+  + provide fake meta data for ra class heartbeat
+  + remote: report timestamps for remote connection resource operations
+  + remote: Treat recv msg timeout as a disconnect
+  + service: Prevent potential use-of-NULL in metadata lookups
+  + solaris: Allow compilation when dirent.d_type is not available
+  + solaris: Correctly replace the linux swab functions
+  + solaris: Disable throttling since /proc doesn't exist
+  + stonith-ng: Correctly observe the watchdog completion timeout
+  + stonith-ng: Correctly track node state
+  + stonith-ng: Reset mainloop source IDs after removing them
+  + systemd: Correctly handle long running stop actions
+  + systemd: Ensure failed monitor operations always return
+  + systemd: Ensure we don't call dbus_message_unref() with NULL
+  + systemd: fix crash caused when canceling in-flight operation
+  + systemd: Kindly ask dbus NOT to kill the process if the dbus connection fails
+  + systemd: Perform actions asynchronously
+  + systemd: Perform monitor operations without blocking
+  + systemd: Tell systemd not to take DBus down from underneath us
+  + systemd: Trick systemd into not stopping our services before us during shutdown
+  + tools: Improve crm_mon output with certain option combinations
+  + upstart: Monitor actions always return 'ok' or 'not running'
+  + upstart: Perform more parts of monitor operations without blocking
+  + xml: add 'require-all' to xml schema for constraints
+  + xml: cl#5231 - Unset the deleted attributes in the resulting diffs
+  + xml: Clone the latest constraint schema in preparation for changes
+  + xml: Correctly create v1 patchsets when deleting attributes
+  + xml: Do not change the ordering of properties when applying v1 cib diffs
+  + xml: Do not dump deleted attributes
+  + xml: Do not prune leaves from v1 cib diffs that are being created with digests
+  + xml: Ensure ACLs are reapplied before calculating what a replace operation changed
+  + xml: Fix upgrade-1.3.xsl to correctly transform ACL rules with "attribute"
+  + xml: Prevent assert errors in crm_element_value() on applying a patch without version information
+  + xml: Prevent potential use-of-NULL
+
+
 * Tue Jul 22 2014 Andrew Beekhof <andrew@beekhof.net> Pacemaker-1.1.12-1
 - Update source tarball to revision: 93a037d
 - Changesets: 795
diff --git a/attrd/commands.c b/attrd/commands.c
index 442c5f8..18c0523 100644
--- a/attrd/commands.c
+++ b/attrd/commands.c
@@ -289,6 +289,9 @@ attrd_client_update(xmlNode *xml)
 
             crm_info("Expanded %s=%s to %d", attr, value, int_value);
             crm_xml_add_int(xml, F_ATTRD_VALUE, int_value);
+
+            /* Replacing the value frees the previous memory, so re-query it */
+            value = crm_element_value(xml, F_ATTRD_VALUE);
         }
     }
 
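The hunk above addresses a pointer-lifetime hazard: `crm_xml_add_int()` replaces the existing `F_ATTRD_VALUE` property and frees its old string, so a `value` pointer obtained earlier from `crm_element_value()` would dangle. A minimal sketch of the safe pattern, assuming the usual libcrmcommon helpers; the helper name is illustrative and this block is editorial, not part of the patch:

```c
#include <crm/common/xml.h>
#include <crm/common/util.h>

/* Illustrative only: bump an integer-valued XML attribute and keep a
 * valid pointer to its string form afterwards. */
static const char *bump_int_attr(xmlNode *xml, const char *field)
{
    const char *value = crm_element_value(xml, field);
    int int_value = crm_parse_int(value, "0") + 1;

    /* Replacing the property frees the old string that 'value' pointed to, */
    crm_xml_add_int(xml, field, int_value);

    /* so re-query it before any further use. */
    return crm_element_value(xml, field);
}
```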
diff --git a/cib/callbacks.c b/cib/callbacks.c
index 71c487e..1452ded 100644
--- a/cib/callbacks.c
+++ b/cib/callbacks.c
@@ -40,6 +40,8 @@
 #include <notify.h>
 #include "common.h"
 
+static unsigned long cib_local_bcast_num = 0;
+
 typedef struct cib_local_notify_s {
     xmlNode *notify_src;
     char *client_id;
@@ -48,7 +50,13 @@ typedef struct cib_local_notify_s {
 } cib_local_notify_t;
 
 int next_client_id = 0;
+
+#if SUPPORT_PLUGIN
+gboolean legacy_mode = TRUE;
+#else
 gboolean legacy_mode = FALSE;
+#endif
+
 qb_ipcs_service_t *ipcs_ro = NULL;
 qb_ipcs_service_t *ipcs_rw = NULL;
 qb_ipcs_service_t *ipcs_shm = NULL;
@@ -82,8 +90,12 @@ static gboolean cib_read_legacy_mode(void)
     return legacy;
 }
 
-static gboolean cib_legacy_mode(void)
+gboolean cib_legacy_mode(void)
 {
+#if SUPPORT_PLUGIN
+    return TRUE;
+#endif
+
     if(cib_read_legacy_mode()) {
         return TRUE;
     }
@@ -442,6 +454,54 @@ do_local_notify(xmlNode * notify_src, const char *client_id,
 }
 
 static void
+local_notify_destroy_callback(gpointer data)
+{
+    cib_local_notify_t *notify = data;
+
+    free_xml(notify->notify_src);
+    free(notify->client_id);
+    free(notify);
+}
+
+static void
+check_local_notify(int bcast_id)
+{
+    cib_local_notify_t *notify = NULL;
+
+    if (!local_notify_queue) {
+        return;
+    }
+
+    notify = g_hash_table_lookup(local_notify_queue, GINT_TO_POINTER(bcast_id));
+
+    if (notify) {
+        do_local_notify(notify->notify_src, notify->client_id, notify->sync_reply,
+                        notify->from_peer);
+        g_hash_table_remove(local_notify_queue, GINT_TO_POINTER(bcast_id));
+    }
+}
+
+static void
+queue_local_notify(xmlNode * notify_src, const char *client_id, gboolean sync_reply,
+                   gboolean from_peer)
+{
+    cib_local_notify_t *notify = calloc(1, sizeof(cib_local_notify_t));
+
+    notify->notify_src = notify_src;
+    notify->client_id = strdup(client_id);
+    notify->sync_reply = sync_reply;
+    notify->from_peer = from_peer;
+
+    if (!local_notify_queue) {
+        local_notify_queue = g_hash_table_new_full(g_direct_hash,
+                                                   g_direct_equal, NULL,
+                                                   local_notify_destroy_callback);
+    }
+
+    g_hash_table_insert(local_notify_queue, GINT_TO_POINTER(cib_local_bcast_num), notify);
+}
+
+static void
 parse_local_options_v1(crm_client_t * cib_client, int call_type, int call_options, const char *host,
                     const char *op, gboolean * local_notify, gboolean * needs_reply,
                     gboolean * process, gboolean * needs_forward)
@@ -814,9 +874,12 @@ send_peer_reply(xmlNode * msg, xmlNode * result_diff, const char *originator, gb
         int diff_del_admin_epoch = 0;
 
         const char *digest = NULL;
+        int format = 1;
 
         CRM_LOG_ASSERT(result_diff != NULL);
         digest = crm_element_value(result_diff, XML_ATTR_DIGEST);
+        crm_element_value_int(result_diff, "format", &format);
+
         cib_diff_version_details(result_diff,
                                  &diff_add_admin_epoch, &diff_add_epoch, &diff_add_updates,
                                  &diff_del_admin_epoch, &diff_del_epoch, &diff_del_updates);
@@ -829,7 +892,9 @@ send_peer_reply(xmlNode * msg, xmlNode * result_diff, const char *originator, gb
         crm_xml_add(msg, F_CIB_GLOBAL_UPDATE, XML_BOOLEAN_TRUE);
         crm_xml_add(msg, F_CIB_OPERATION, CIB_OP_APPLY_DIFF);
 
-        CRM_ASSERT(digest != NULL);
+        if (format == 1) {
+            CRM_ASSERT(digest != NULL);
+        }
 
         add_message_xml(msg, F_CIB_UPDATE_DIFF, result_diff);
         crm_log_xml_explicit(msg, "copy");
@@ -1039,6 +1104,27 @@ cib_process_request(xmlNode * request, gboolean force_synchronous, gboolean priv
          */
         crm_trace("Completed slave update");
 
+    } else if (cib_legacy_mode() &&
+               rc == pcmk_ok && result_diff != NULL && !(call_options & cib_inhibit_bcast)) {
+        gboolean broadcast = FALSE;
+
+        cib_local_bcast_num++;
+        crm_xml_add_int(request, F_CIB_LOCAL_NOTIFY_ID, cib_local_bcast_num);
+        broadcast = send_peer_reply(request, result_diff, originator, TRUE);
+
+        if (broadcast && client_id && local_notify && op_reply) {
+
+            /* If we have been asked to sync the reply,
+             * and a bcast msg has gone out, we queue the local notify
+             * until we know the bcast message has been received */
+            local_notify = FALSE;
+            crm_trace("Queuing local %ssync notification for %s",
+                      (call_options & cib_sync_call) ? "" : "a-", client_id);
+
+            queue_local_notify(op_reply, client_id, (call_options & cib_sync_call), from_peer);
+            op_reply = NULL;    /* the reply is queued, so don't free here */
+        }
+
     } else if (call_options & cib_discard_reply) {
         crm_trace("Caller isn't interested in reply");
 
@@ -1322,6 +1408,11 @@ cib_peer_callback(xmlNode * msg, void *private_data)
 
     if (cib_legacy_mode() && (originator == NULL || crm_str_eq(originator, cib_our_uname, TRUE))) {
         /* message is from ourselves */
+        int bcast_id = 0;
+
+        if (!(crm_element_value_int(msg, F_CIB_LOCAL_NOTIFY_ID, &bcast_id))) {
+            check_local_notify(bcast_id);
+        }
         return;
 
     } else if (crm_peer_cache == NULL) {
diff --git a/cib/callbacks.h b/cib/callbacks.h
index 7549a6c..bca9992 100644
--- a/cib/callbacks.h
+++ b/cib/callbacks.h
@@ -73,6 +73,8 @@ void cib_shutdown(int nsig);
 void initiate_exit(void);
 void terminate_cib(const char *caller, gboolean fast);
 
+extern gboolean cib_legacy_mode(void);
+
 #if SUPPORT_HEARTBEAT
 extern void cib_ha_peer_callback(HA_Message * msg, void *private_data);
 extern int cib_ccm_dispatch(gpointer user_data);
diff --git a/cib/main.c b/cib/main.c
index 2a48054..e20a2b6 100644
--- a/cib/main.c
+++ b/cib/main.c
@@ -438,6 +438,13 @@ cib_peer_update_callback(enum crm_status_type type, crm_node_t * node, const voi
 
     if (cib_shutdown_flag && crm_active_peers() < 2 && crm_hash_table_size(client_connections) == 0) {
         crm_info("No more peers");
+        /* @TODO
+         * terminate_cib() calls crm_cluster_disconnect() which calls
+         * crm_peer_destroy() which destroys the peer caches, which a peer
+         * status callback shouldn't do. For now, there is a workaround in
+         * crm_update_peer_proc(), but CIB should be refactored to avoid
+         * destroying the peer caches here.
+         */
         terminate_cib(__FUNCTION__, FALSE);
     }
 }
diff --git a/cib/messages.c b/cib/messages.c
index 9c66349..363562c 100644
--- a/cib/messages.c
+++ b/cib/messages.c
@@ -297,7 +297,14 @@ cib_process_upgrade_server(const char *op, int options, const char *section, xml
             crm_xml_add(up, F_CIB_CALLOPTS, crm_element_value(req, F_CIB_CALLOPTS));
             crm_xml_add(up, F_CIB_CALLID, crm_element_value(req, F_CIB_CALLID));
 
-            send_cluster_message(NULL, crm_msg_cib, up, FALSE);
+            if (cib_legacy_mode() && cib_is_master) {
+                rc = cib_process_upgrade(
+                    op, options, section, up, input, existing_cib, result_cib, answer);
+
+            } else {
+                send_cluster_message(NULL, crm_msg_cib, up, FALSE);
+            }
+
             free_xml(up);
 
         } else if(rc == pcmk_ok) {
diff --git a/crmd/lrm.c b/crmd/lrm.c
index 74fede4..062f769 100644
--- a/crmd/lrm.c
+++ b/crmd/lrm.c
@@ -454,8 +454,6 @@ get_rsc_metadata(const char *type, const char *rclass, const char *provider, boo
 
     snprintf(key, len, "%s::%s:%s", type, rclass, provider);
     if(force == FALSE) {
-        snprintf(key, len, "%s::%s:%s", type, rclass, provider);
-
         crm_trace("Retreiving cached metadata for %s", key);
         metadata = g_hash_table_lookup(metadata_hash, key);
     }
@@ -581,7 +579,7 @@ resource_supports_action(xmlNode *metadata, const char *name)
     for (action = __xml_first_child(actions); action != NULL; action = __xml_next(action)) {
         if (crm_str_eq((const char *)action->name, "action", TRUE)) {
             value = crm_element_value(action, "name");
-            if (safe_str_eq("reload", value)) {
+            if (safe_str_eq(name, value)) {
                 return TRUE;
             }
         }
@@ -606,16 +604,18 @@ append_restart_list(lrmd_event_data_t *op, xmlNode *metadata, xmlNode * update,
 
     if(resource_supports_action(metadata, "reload")) {
         restart = create_xml_node(NULL, XML_TAG_PARAMS);
-        list = build_parameter_list(op, metadata, restart, "unique", FALSE, FALSE);
-    }
+        /* Any parameters with unique="1" should be added into the "op-force-restart" list. */
+        list = build_parameter_list(op, metadata, restart, "unique", TRUE, FALSE);
 
-    if (list == NULL) {
+    } else {
         /* Resource does not support reloads */
         return;
     }
 
     digest = calculate_operation_digest(restart, version);
-    crm_xml_add(update, XML_LRM_ATTR_OP_RESTART, list);
+    /* Add "op-force-restart" and "op-restart-digest" to indicate the resource supports reload,
+     * whether or not it actually supports any parameters with unique="1". */
+    crm_xml_add(update, XML_LRM_ATTR_OP_RESTART, list? list: "");
     crm_xml_add(update, XML_LRM_ATTR_RESTART_DIGEST, digest);
 
     crm_trace("%s: %s, %s", op->rsc_id, digest, list);
diff --git a/crmd/throttle.c b/crmd/throttle.c
index 165050c..169594b 100644
--- a/crmd/throttle.c
+++ b/crmd/throttle.c
@@ -92,41 +92,60 @@ int throttle_num_cores(void)
     return cores;
 }
 
+/*
+ * \internal
+ * \brief Return name of /proc file containing the CIB daemon's load statistics
+ *
+ * \return Newly allocated memory with file name on success, NULL otherwise
+ *
+ * \note It is the caller's responsibility to free the return value.
+ *       This will return NULL if the daemon is being run via valgrind.
+ *       This should be called only on Linux systems.
+ */
 static char *find_cib_loadfile(void) 
 {
     DIR *dp;
     struct dirent *entry;
     struct stat statbuf;
     char *match = NULL;
+    char procpath[128];
+    char value[64];
+    char key[16];
 
     dp = opendir("/proc");
     if (!dp) {
         /* no proc directory to search through */
         crm_notice("Can not read /proc directory to track existing components");
-        return FALSE;
+        return NULL;
     }
 
+    /* Iterate through contents of /proc */
     while ((entry = readdir(dp)) != NULL) {
-        char procpath[128];
-        char value[64];
-        char key[16];
         FILE *file;
         int pid;
 
-        strcpy(procpath, "/proc/");
-        /* strlen("/proc/") + strlen("/status") + 1 = 14
-         * 128 - 14 = 114 */
-        strncat(procpath, entry->d_name, 114);
-
-        if (lstat(procpath, &statbuf)) {
+        /* We're only interested in entries whose name is a PID,
+         * so skip anything non-numeric or that is too long.
+         *
+         * 114 = 128 - strlen("/proc/") - strlen("/status") - 1
+         */
+        pid = atoi(entry->d_name);
+        if ((pid <= 0) || (strlen(entry->d_name) > 114)) {
             continue;
         }
-        if (!S_ISDIR(statbuf.st_mode) || !isdigit(entry->d_name[0])) {
+
+        /* We're only interested in subdirectories */
+        strcpy(procpath, "/proc/");
+        strcat(procpath, entry->d_name);
+        if (lstat(procpath, &statbuf) || !S_ISDIR(statbuf.st_mode)) {
             continue;
         }
 
+        /* Read the first entry ("Name:") from the process's status file.
+         * We could handle the valgrind case if we parsed the cmdline file
+         * instead, but that's more of a pain than it's worth.
+         */
         strcat(procpath, "/status");
-
         file = fopen(procpath, "r");
         if (!file) {
             continue;
@@ -137,17 +156,11 @@ static char *find_cib_loadfile(void)
         }
         fclose(file);
 
-        if (safe_str_neq("cib", value)) {
-            continue;
-        }
-
-        pid = atoi(entry->d_name);
-        if (pid <= 0) {
-            continue;
+        if (safe_str_eq("cib", value)) {
+            /* We found the CIB! */
+            match = crm_strdup_printf("/proc/%d/stat", pid);
+            break;
         }
-
-        match = crm_strdup_printf("/proc/%d/stat", pid);
-        break;
     }
 
     closedir(dp);
@@ -214,6 +227,10 @@ static bool throttle_cib_load(float *load)
         last_utime = 0;
         last_stime = 0;
         loadfile = find_cib_loadfile();
+        if (loadfile == NULL) {
+            crm_warn("Couldn't find CIB load file");
+            return FALSE;
+        }
         ticks_per_s = sysconf(_SC_CLK_TCK);
         crm_trace("Found %s", loadfile);
     }
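`find_cib_loadfile()` returns a path of the form `/proc/<pid>/stat`, which `throttle_cib_load()` then samples to estimate the CIB daemon's CPU usage. For orientation, a hedged standalone sketch of how such a file can be parsed on Linux; field positions follow proc(5), the function name is illustrative, and this block is editorial rather than part of the patch:

```c
#include <stdio.h>

/* Illustrative only: read the utime/stime counters (14th and 15th fields
 * of /proc/<pid>/stat per proc(5)) so a caller can compute CPU deltas
 * between samples.  The simple parse assumes the comm field contains no
 * spaces, which holds for a process named "cib". */
static int read_proc_stat_times(const char *loadfile,
                                unsigned long *utime, unsigned long *stime)
{
    char buffer[4096];
    FILE *stream = fopen(loadfile, "r");
    int rc = -1;

    if (stream == NULL) {
        return rc;
    }
    if ((fgets(buffer, sizeof(buffer), stream) != NULL)
        && (sscanf(buffer,
                   "%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %lu %lu",
                   utime, stime) == 2)) {
        rc = 0;
    }
    fclose(stream);
    return rc;
}
```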
diff --git a/cts/CIB.py b/cts/CIB.py
index cdfc7ca..82d02d7 100644
--- a/cts/CIB.py
+++ b/cts/CIB.py
@@ -312,7 +312,7 @@ Description=Dummy resource that takes a while to start
 Type=notify
 ExecStart=/usr/bin/python -c 'import time, systemd.daemon; time.sleep(10); systemd.daemon.notify("READY=1"); time.sleep(86400)'
 ExecStop=/bin/sleep 10
-ExecStop=/bin/kill -s KILL $MAINPID
+ExecStop=/bin/kill -s KILL \$MAINPID
 """
 
             os.system("cat <<-END >/tmp/DummySD.service\n%s\nEND" % (dummy_service_file))
diff --git a/cts/CTStests.py b/cts/CTStests.py
index 14ab4bf..f817004 100644
--- a/cts/CTStests.py
+++ b/cts/CTStests.py
@@ -1105,7 +1105,7 @@ class MaintenanceMode(CTSTest):
         # fail the resource right after turning Maintenance mode on
         # verify it is not recovered until maintenance mode is turned off
         if action == "On":
-            pats.append("pengine.*: warning: Processing failed op %s for %s on" % (self.action, self.rid))
+            pats.append("pengine.*: warning:.* Processing failed op %s for %s on" % (self.action, self.rid))
         else:
             pats.append(self.templates["Pat:RscOpOK"] % (self.rid, "stop_0"))
             pats.append(self.templates["Pat:RscOpOK"] % (self.rid, "start_0"))
@@ -1314,7 +1314,8 @@ class ResourceRecover(CTSTest):
         self.debug("Shooting %s aka. %s" % (rsc.clone_id, rsc.id))
 
         pats = []
-        pats.append("pengine.*: warning: Processing failed op %s for %s on" % (self.action, self.rid))
+        pats.append(r"pengine.*: warning:.* Processing failed op %s for (%s|%s) on" % (self.action,
+            rsc.id, rsc.clone_id))
 
         if rsc.managed():
             pats.append(self.templates["Pat:RscOpOK"] % (self.rid, "stop_0"))
@@ -2647,32 +2648,31 @@ class RemoteDriver(CTSTest):
         self.remote_node_added = 0
         self.remote_rsc_added = 0
         self.remote_rsc = "remote-rsc"
+        self.remote_use_reconnect_interval = self.Env.RandomGen.choice(["true","false"])
         self.cib_cmd = """cibadmin -C -o %s -X '%s' """
 
-    def del_rsc(self, node, rsc):
-
+    def get_othernode(self, node):
         for othernode in self.Env["nodes"]:
             if othernode == node:
                 # we don't want to try and use the cib that we just shutdown.
                 # find a cluster node that is not our soon to be remote-node.
                 continue
-            rc = self.rsh(othernode, "crm_resource -D -r %s -t primitive" % (rsc))
-            if rc != 0:
-                self.fail_string = ("Removal of resource '%s' failed" % (rsc))
-                self.failed = 1
-            return
+            else:
+                return othernode
+
+    def del_rsc(self, node, rsc):
+        othernode = self.get_othernode(node)
+        rc = self.rsh(othernode, "crm_resource -D -r %s -t primitive" % (rsc))
+        if rc != 0:
+            self.fail_string = ("Removal of resource '%s' failed" % (rsc))
+            self.failed = 1
 
     def add_rsc(self, node, rsc_xml):
-        for othernode in self.CM.Env["nodes"]:
-            if othernode == node:
-                # we don't want to try and use the cib that we just shutdown.
-                # find a cluster node that is not our soon to be remote-node.
-                continue
-            rc = self.rsh(othernode, self.cib_cmd % ("resources", rsc_xml))
-            if rc != 0:
-                self.fail_string = "resource creation failed"
-                self.failed = 1
-            return
+        othernode = self.get_othernode(node)
+        rc = self.rsh(othernode, self.cib_cmd % ("resources", rsc_xml))
+        if rc != 0:
+            self.fail_string = "resource creation failed"
+            self.failed = 1
 
     def add_primitive_rsc(self, node):
         rsc_xml = """
@@ -2687,7 +2687,24 @@ class RemoteDriver(CTSTest):
             self.remote_rsc_added = 1
 
     def add_connection_rsc(self, node):
-        rsc_xml = """
+        if self.remote_use_reconnect_interval == "true":
+            # use reconnect interval and make sure to set cluster-recheck-interval as well.
+            rsc_xml = """
+<primitive class="ocf" id="%s" provider="pacemaker" type="remote">
+    <instance_attributes id="remote-instance_attributes"/>
+        <instance_attributes id="remote-instance_attributes">
+          <nvpair id="remote-instance_attributes-server" name="server" value="%s"/>
+          <nvpair id="remote-instance_attributes-reconnect_interval" name="reconnect_interval" value="60s"/>
+        </instance_attributes>
+    <operations>
+      <op id="remote-monitor-interval-60s" interval="60s" name="monitor"/>
+      <op id="remote-name-start-interval-0-timeout-120" interval="0" name="start" timeout="60"/>
+    </operations>
+</primitive>""" % (self.remote_node, node)
+            self.rsh(self.get_othernode(node), self.templates["SetCheckInterval"] % ("45s"))
+        else:
+            # not using reconnect interval
+            rsc_xml = """
 <primitive class="ocf" id="%s" provider="pacemaker" type="remote">
     <instance_attributes id="remote-instance_attributes"/>
         <instance_attributes id="remote-instance_attributes">
@@ -2698,6 +2715,7 @@ class RemoteDriver(CTSTest):
       <op id="remote-name-start-interval-0-timeout-120" interval="0" name="start" timeout="120"/>
     </operations>
 </primitive>""" % (self.remote_node, node)
+
         self.add_rsc(node, rsc_xml)
         if self.failed == 0:
             self.remote_node_added = 1
@@ -2836,7 +2854,7 @@ class RemoteDriver(CTSTest):
         self.CM.ns.WaitForNodeToComeUp(node, 120);
 
         pats = [ ]
-        watch = self.create_watch(pats, 120)
+        watch = self.create_watch(pats, 200)
         watch.setwatch()
         pats.append(self.templates["Pat:RscOpOK"] % (self.remote_node, "start"))
         if self.remote_rsc_added == 1:
@@ -2927,12 +2945,19 @@ class RemoteDriver(CTSTest):
             pats.append(self.templates["Pat:RscOpOK"] % (self.remote_node, "stop"))
 
         self.set_timer("remoteMetalCleanup")
+
+        if self.remote_use_reconnect_interval == "true":
+            self.debug("Cleaning up re-check interval")
+            self.rsh(self.get_othernode(node), self.templates["ClearCheckInterval"])
         if self.remote_rsc_added == 1:
+            self.debug("Cleaning up dummy rsc put on remote node")
             self.rsh(node, "crm_resource -U -r %s -N %s" % (self.remote_rsc, self.remote_node))
             self.del_rsc(node, self.remote_rsc)
         if self.remote_node_added == 1:
+            self.debug("Cleaning up remote node connection resource")
             self.rsh(node, "crm_resource -U -r %s" % (self.remote_node))
             self.del_rsc(node, self.remote_node)
+
         watch.lookforall()
         self.log_timer("remoteMetalCleanup")
 
diff --git a/cts/environment.py b/cts/environment.py
index 6edf331..a3399c3 100644
--- a/cts/environment.py
+++ b/cts/environment.py
@@ -160,7 +160,7 @@ class Environment:
             self.data["Stack"] = "heartbeat"
 
         elif name == "openais" or name == "ais"  or name == "whitetank":
-            self.data["Stack"] = "openais (whitetank)"
+            self.data["Stack"] = "corosync (plugin v0)"
 
         elif name == "corosync" or name == "cs" or name == "mcp":
             self.data["Stack"] = "corosync 2.x"
@@ -351,6 +351,10 @@ class Environment:
                     self["DoFencing"]=1
                 elif args[i+1] == "0" or args[i+1] == "no":
                     self["DoFencing"]=0
+                elif args[i+1] == "phd":
+                    self["DoStonith"]=1
+                    self["stonith-type"] = "fence_phd_kvm"
+                    self["stonith-params"] = "pcmk_arg_map=domain:uname,delay=0"
                 elif args[i+1] == "rhcs" or args[i+1] == "xvm" or args[i+1] == "virt":
                     self["DoStonith"]=1
                     self["stonith-type"] = "fence_xvm"
diff --git a/cts/patterns.py b/cts/patterns.py
index 8398c7e..1bc05a6 100644
--- a/cts/patterns.py
+++ b/cts/patterns.py
@@ -32,6 +32,9 @@ class BasePatterns:
 
             "UUIDQueryCmd"    : "crmadmin -N",
 
+            "SetCheckInterval"    : "cibadmin --modify -c --xml-text '<cluster_property_set id=\"cib-bootstrap-options\"><nvpair id=\"cts-recheck-interval-setting\" name=\"cluster-recheck-interval\" value=\"%s\"/></cluster_property_set>'",
+            "ClearCheckInterval"    : "cibadmin --delete --xpath \"//nvpair[@name='cluster-recheck-interval']\"",
+
             "MaintenanceModeOn"    : "cibadmin --modify -c --xml-text '<cluster_property_set id=\"cib-bootstrap-options\"><nvpair id=\"cts-maintenance-mode-setting\" name=\"maintenance-mode\" value=\"true\"/></cluster_property_set>'",
             "MaintenanceModeOff"    : "cibadmin --delete --xpath \"//nvpair[@name='maintenance-mode']\"",
 
@@ -291,6 +294,9 @@ class crm_cs_v0(BasePatterns):
             r"error:.*Connection to cib_shm failed",
             r"error:.*Connection to cib_shm.* closed",
             r"error:.*STONITH connection failed",
+            r"error: Connection to stonith-ng failed",
+            r"crit: Fencing daemon connection failed",
+            r"error: Connection to stonith-ng.* closed",
             ]
 
         self.components["corosync"] = [
diff --git a/doc/Pacemaker_Explained/en-US/Ch-Stonith.txt b/doc/Pacemaker_Explained/en-US/Ch-Stonith.txt
index 02525d6..a3c02cb 100644
--- a/doc/Pacemaker_Explained/en-US/Ch-Stonith.txt
+++ b/doc/Pacemaker_Explained/en-US/Ch-Stonith.txt
@@ -343,7 +343,7 @@ http://www.clusterlabs.org/doc/[Clusters from Scratch] guide for those details.
 # cibadmin -C -o resources --xml-file stonith.xml
 ----
 
-. Set stonith-enabled to true:
+. Set +stonith-enabled+ to true:
 +
 ----
 # crm_attribute -t crm_config -n stonith-enabled -v true
@@ -831,3 +831,29 @@ Put together, the configuration looks like this:
   </configuration>
 </cib>
 ----
+
+== Remapping Reboots ==
+
+When the cluster needs to reboot a node, whether because +stonith-action+ is +reboot+ or because
+a reboot was manually requested (such as by `stonith_admin --reboot`), it will remap that to
+other commands in two cases:
+
+. If the chosen fencing device does not support the +reboot+ command, the cluster
+  will ask it to perform +off+ instead.
+
+. If a fencing topology level with multiple devices must be executed, the cluster
+  will ask all the devices to perform +off+, then ask the devices to perform +on+.
+
+To understand the second case, consider the example of a node with redundant
+power supplies connected to intelligent power switches. Rebooting one switch
+and then the other would have no effect on the node. Turning both switches off,
+and then on, actually reboots the node.
+
+In such a case, the fencing operation will be treated as successful as long as
+the +off+ commands succeed, because then it is safe for the cluster to recover
+any resources that were on the node. Timeouts and errors in the +on+ phase will
+be logged but ignored.
+
+When a reboot operation is remapped, any action-specific timeout for the
+remapped action will be used (for example, +pcmk_off_timeout+ will be used when
+executing the +off+ command, not +pcmk_reboot_timeout+).
diff --git a/doc/asciidoc.reference b/doc/asciidoc.reference
index a9a171b..9323864 100644
--- a/doc/asciidoc.reference
+++ b/doc/asciidoc.reference
@@ -1,31 +1,49 @@
+= Single-chapter part of the documentation =
+
+== Go-to reference chapter for how we use AsciiDoc on this project ==
+
+[NOTE]
+======
+This is *not* an attempt at a fully self-hosted AsciiDoc document;
+consider it a plaintext file full of AsciiDoc samples (it's up to the reader
+to recognize the borderline) at documentation writers' disposal
+to somewhat standardize the style{empty}footnote:[
+  style of both source notation and final visual appearance
+].
+
 See also:
    http://powerman.name/doc/asciidoc
+======
 
-Commands:    `some-tool --with option`
-Files:       '/tmp/file.name' 
-Italic:	     _some text_
+Emphasis:    _some text_
 Mono:        +some text+
-Bold:        *some text*
-Super:	     ^some text^
-Sub:	     ~some text~
+Strong:      *some text*
+Super:       ^some text^
+Sub:         ~some text~
 Quotes:
              ``double quoted''
               `single quoted'
 
-Tool:        command
+Command:     `some-tool --with option`
+Newly introduced term:
+             'some text' (another form of emphasis as of this edit)
+
+File:        mono
 Literal:     mono
+Tool:        command
+Option:      mono
+Replaceable: emphasis mono
 Varname:     mono
-Option:      italic
-Emphasis:    italic bold
-Replaceable: italic mono 
+Term encountered on system (e.g., menu choice, hostname):
+             strong
 
 
-.Title for Eaxmple
+.Title for Example
 =====
 Some text
 =====
 
-.Title for Eaxmple with XML Listing
+.Title for Example with XML Listing
 =====
 [source,XML]
 -----
@@ -49,4 +67,4 @@ Section anchors:
 
 References to section anchors:
 
-<<s-name>> or <<s-name,Alternate Text>>
\ No newline at end of file
+<<s-name>> or <<s-name,Alternate Text>>
diff --git a/doc/shared/en-US/pacemaker-intro.txt b/doc/shared/en-US/pacemaker-intro.txt
index bf432fc..6b898c9 100644
--- a/doc/shared/en-US/pacemaker-intro.txt
+++ b/doc/shared/en-US/pacemaker-intro.txt
@@ -1,41 +1,62 @@
 
-== What Is Pacemaker? ==
+== What Is 'Pacemaker'? ==
 
-Pacemaker is a cluster resource manager.
+Pacemaker is a 'cluster resource manager', that is, logic responsible
+for the life-cycle of deployed software -- indirectly perhaps even whole
+systems or their interconnections -- under its control within a set of
+computers (a.k.a. 'cluster nodes', 'nodes' for short) and driven by
+prescribed rules.
 
 It achieves maximum availability for your cluster services
-(aka. resources) by detecting and recovering from node- and
+(a.k.a. 'resources') by detecting and recovering from node- and
 resource-level failures by making use of the messaging and membership
 capabilities provided by your preferred cluster infrastructure (either
 http://www.corosync.org/[Corosync] or
-http://linux-ha.org/wiki/Heartbeat[Heartbeat]).
+http://linux-ha.org/wiki/Heartbeat[Heartbeat]), and possibly by
+utilizing other parts of the overall cluster stack.
+
+.High Availability Clusters
+[NOTE]
+For *the goal of minimal downtime*, the term 'high availability' was coined
+and, together with its acronym 'HA', is well established in the sector.
+To differentiate this sort of cluster from high performance computing
+('HPC') clusters, should the context require it (not the case in
+this document), 'HA cluster' can be used.
 
 Pacemaker's key features include:
 
  * Detection and recovery of node and service-level failures
  * Storage agnostic, no requirement for shared storage
  * Resource agnostic, anything that can be scripted can be clustered
- * Supports fencing (aka. STONITH) for ensuring data integrity
+ * Supports 'fencing' (also referred to by the acronym 'STONITH',
+   <<s-intro-stonith,deciphered>> later on) for ensuring data integrity
  * Supports large and small clusters
  * Supports both quorate and resource-driven clusters
  * Supports practically any redundancy configuration
- * Automatically replicated configuration that can be updated from any node
- * Ability to specify cluster-wide service ordering, colocation and anti-colocation
+ * Automatically replicated configuration that can be updated
+   from any node
+ * Ability to specify cluster-wide service ordering,
+   colocation and anti-colocation
  * Support for advanced service types
  ** Clones: for services which need to be active on multiple nodes
- ** Multi-state: for services with multiple modes (eg. master/slave, primary/secondary)
- * Unified, scriptable, cluster management tools.
+ ** Multi-state: for services with multiple modes
+    (e.g. master/slave, primary/secondary)
+ * Unified, scriptable cluster management tools
 
 == Pacemaker Architecture ==
 
 At the highest level, the cluster is made up of three pieces:
 
- * Non-cluster-aware components. These pieces
+ * *Non-cluster-aware components*. These pieces
    include the resources themselves; scripts that start, stop and
    monitor them; and a local daemon that masks the differences
    between the different standards these scripts implement.
+   Even though interactions of these resources when run as multiple
+   instances can resemble a distributed system, they still lack
+   the proper HA mechanisms and/or autonomous cluster-wide governance
+   as subsumed in the following item.
 
- * Resource management. Pacemaker provides the brain that processes
+ * *Resource management*. Pacemaker provides the brain that processes
    and reacts to events regarding the cluster.  These events include
    nodes joining or leaving the cluster; resource events caused by
    failures, maintenance and scheduled activities; and other
@@ -44,21 +65,24 @@ At the highest level, the cluster is made up of three pieces:
    events. This may include moving resources, stopping nodes and even
    forcing them offline with remote power switches.
 
- * Low-level infrastructure. Projects like Corosync, CMAN and
-   Heartbeat provide reliable messaging, membership and quorum
+ * *Low-level infrastructure*. Projects like 'Corosync', 'CMAN' and
+   'Heartbeat' provide reliable messaging, membership and quorum
    information about the cluster.
 
 When combined with Corosync, Pacemaker also supports popular open
-source cluster filesystems.
-footnote:[Even though Pacemaker also supports Heartbeat, the filesystems need
-to use the stack for messaging and membership, and Corosync seems to be
-what they're standardizing on. Technically, it would be possible for them to
-support Heartbeat as well, but there seems little interest in this.]
+source cluster filesystems.{empty}footnote:[
+  Even though Pacemaker also supports Heartbeat, the filesystems need to
+  use the stack for messaging and membership, and Corosync seems to be
+  what they're standardizing on.  Technically, it would be possible for
+  them to support Heartbeat as well, but there seems little interest
+  in this.
+]
 
 Due to past standardization within the cluster filesystem community,
-cluster filesystems make use of a common distributed lock manager, which makes
-use of Corosync for its messaging and membership capabilities (which nodes
-are up/down) and Pacemaker for fencing services.
+cluster filesystems make use of a common 'distributed lock manager',
+which makes use of Corosync for its messaging and membership
+capabilities (which nodes are up/down) and Pacemaker for fencing
+services.
 
 .The Pacemaker Stack
 image::images/pcmk-stack.png["The Pacemaker stack",width="10cm",height="7.5cm",align="center"]
@@ -67,75 +91,79 @@ image::images/pcmk-stack.png["The Pacemaker stack",width="10cm",height="7.5cm",a
 
 Pacemaker itself is composed of five key components:
 
- * Cluster Information Base (CIB)
- * Cluster Resource Management daemon (CRMd)
- * Local Resource Management daemon (LRMd)
- * Policy Engine (PEngine or PE)
- * Fencing daemon (STONITHd)
+ * 'Cluster Information Base' ('CIB')
+ * 'Cluster Resource Management daemon' ('CRMd')
+ * 'Local Resource Management daemon' ('LRMd')
+ * 'Policy Engine' ('PEngine' or 'PE')
+ * Fencing daemon ('STONITHd')
 
 .Internal Components
 image::images/pcmk-internals.png["Subsystems of a Pacemaker cluster",align="center",scaledwidth="65%"]
 
 The CIB uses XML to represent both the cluster's configuration and
 current state of all resources in the cluster. The contents of the CIB
-are automatically kept in sync across the entire cluster and are used
-by the PEngine to compute the ideal state of the cluster and how it
-should be achieved.
+are automatically kept in sync across the entire cluster and are used by
+the PEngine to compute the ideal state of the cluster and how it should
+be achieved.
 
-This list of instructions is then fed to the Designated
-Controller (DC).  Pacemaker centralizes all cluster decision making by
-electing one of the CRMd instances to act as a master. Should the
-elected CRMd process (or the node it is on) fail, a new one is
-quickly established.
+This list of instructions is then fed to the 'Designated Controller'
+('DC').  Pacemaker centralizes all cluster decision making by electing
+one of the CRMd instances to act as a master. Should the elected CRMd
+process (or the node it is on) fail, a new one is quickly established.
 
 The DC carries out the PEngine's instructions in the required order by
 passing them to either the Local Resource Management daemon (LRMd) or
 CRMd peers on other nodes via the cluster messaging infrastructure
 (which in turn passes them on to their LRMd process).
 
-The peer nodes all report the results of their operations back to the
-DC and, based on the expected and actual results, will either execute
-any actions that needed to wait for the previous one to complete, or
-abort processing and ask the PEngine to recalculate the ideal cluster
-state based on the unexpected results.
+The peer nodes all report the results of their operations back to the DC
+and, based on the expected and actual results, will either execute any
+actions that needed to wait for the previous one to complete, or abort
+processing and ask the PEngine to recalculate the ideal cluster state
+based on the unexpected results.
 
 In some cases, it may be necessary to power off nodes in order to
 protect shared data or complete resource recovery. For this, Pacemaker
 comes with STONITHd. 
 
-STONITH is an acronym for Shoot-The-Other-Node-In-The-Head and is
-usually implemented with a remote power switch.
+[[s-intro-stonith]]
+.STONITH
+[NOTE]
+*STONITH* is an acronym for 'Shoot-The-Other-Node-In-The-Head',
+a recommended practice whereby a misbehaving node is promptly
+'fenced' (shut off, cut from shared resources or otherwise immobilized),
+and is usually implemented with a remote power switch.
 
 In Pacemaker, STONITH devices are modeled as resources (and configured
 in the CIB) to enable them to be easily monitored for failure, however
-STONITHd takes care of understanding the STONITH topology such that
-its clients simply request a node be fenced, and it does the rest.
+STONITHd takes care of understanding the STONITH topology such that its
+clients simply request a node be fenced, and it does the rest.
 
 == Types of Pacemaker Clusters ==
 
 Pacemaker makes no assumptions about your environment. This allows it
 to support practically any
 http://en.wikipedia.org/wiki/High-availability_cluster#Node_configurations[redundancy
-configuration] including Active/Active, Active/Passive, N+1, N+M,
-N-to-1 and N-to-N.
+configuration] including 'Active/Active', 'Active/Passive', 'N+1',
+'N+M', 'N-to-1' and 'N-to-N'.
 
 .Active/Passive Redundancy
 image::images/pcmk-active-passive.png["Active/Passive Redundancy",width="10cm",height="7.5cm",align="center"]
 
-Two-node Active/Passive clusters using Pacemaker and DRBD are a
-cost-effective solution for many High Availability situations.
+Two-node Active/Passive clusters using Pacemaker and 'DRBD' are
+a cost-effective solution for many High Availability situations.
 
 .Shared Failover
 image::images/pcmk-shared-failover.png["Shared Failover",width="10cm",height="7.5cm",align="center"]
 
 By supporting many nodes, Pacemaker can dramatically reduce hardware
 costs by allowing several active/passive clusters to be combined and
-share a common backup node
+share a common backup node.
 
 .N to N Redundancy
 image::images/pcmk-active-active.png["N to N Redundancy",width="10cm",height="7.5cm",align="center"]
 
-When shared storage is available, every node can potentially be used
-for failover.  Pacemaker can even run multiple copies of services to
-spread out the workload.
+When shared storage is available, every node can potentially be used for
+failover.  Pacemaker can even run multiple copies of services to spread
+out the workload.
 
diff --git a/extra/resources/Dummy b/extra/resources/Dummy
index aec2a0c..8a38ef5 100644
--- a/extra/resources/Dummy
+++ b/extra/resources/Dummy
@@ -137,7 +137,7 @@ dummy_stop() {
     if [ $? =  $OCF_SUCCESS ]; then
 	rm ${OCF_RESKEY_state}
     fi
-    rm ${VERIFY_SERIALIZED_FILE}
+    rm -f ${VERIFY_SERIALIZED_FILE}
     return $OCF_SUCCESS
 }
 
diff --git a/extra/resources/ping b/extra/resources/ping
index e7b9973..ca9db75 100755
--- a/extra/resources/ping
+++ b/extra/resources/ping
@@ -43,8 +43,7 @@ meta_data() {
 <version>1.0</version>
 
 <longdesc lang="en">
-Every time the monitor action is run, this resource agent records (in the CIB) the current number of ping nodes the host can connect to.
-It is essentially the same as pingd except that it uses the system ping tool to obtain the results.
+Every time the monitor action is run, this resource agent records (in the CIB) the current number of nodes the host can connect to using the system fping (preferred) or ping tool.
 </longdesc>
 <shortdesc lang="en">node connectivity</shortdesc>
 
diff --git a/fencing/README.md b/fencing/README.md
new file mode 100644
index 0000000..a50c69b
--- /dev/null
+++ b/fencing/README.md
@@ -0,0 +1,145 @@
+# Directory contents
+
+* `admin.c`, `stonith_admin.8`: `stonith_admin` command-line tool and its man
+  page
+* `commands.c`, `internal.h`, `main.c`, `remote.c`, `stonithd.7`: stonithd and
+  its man page
+* `fence_dummy`, `fence_legacy`, `fence_legacy.8`, `fence_pcmk`,
+  `fence_pcmk.8`: Pacemaker-supplied fence agents and their man pages
+* `regression.py(.in)`: regression tests for `stonithd`
+* `standalone_config.c`, `standalone_config.h`: abandoned project
+* `test.c`: `stonith-test` command-line tool
+
+# How fencing requests are handled
+
+## Bird's eye view
+
+In the broadest terms, stonith works like this:
+
+1. The initiator (an external program such as `stonith_admin`, or the cluster
+   itself via the `crmd`) asks the local `stonithd`, "Hey, can you fence this
+   node?"
+1. The local `stonithd` asks all the `stonithd's` in the cluster (including
+   itself), "Hey, what fencing devices do you have access to that can fence
+   this node?"
+1. Each `stonithd` in the cluster replies with a list of available devices that
+   it knows about.
+1. Once the original `stonithd` gets all the replies, it asks the most
+   appropriate `stonithd` peer to actually carry out the fencing. It may send
+   out more than one such request if the target node must be fenced with
+   multiple devices.
+1. The chosen `stonithd(s)` call the appropriate fencing resource agent(s) to
+   do the fencing, then reply to the original `stonithd` with the result.
+1. The original `stonithd` broadcasts the result to all `stonithd's`.
+1. Each `stonithd` sends the result to each of its local clients (including, at
+   some point, the initiator).
+
+## Detailed view
+
+### Initiating a fencing request
+
+A fencing request can be initiated by the cluster or externally, using the
+libfencing API.
+
+* The cluster always initiates fencing via `crmd/te_actions.c:te_fence_node()`
+  (which calls the `fence()` API). This occurs when a graph synapse contains a
+  `CRM_OP_FENCE` XML operation.
+* The main external clients are `stonith_admin` and `stonith-test`.
+
+Highlights of the fencing API:
+* `stonith_api_new()` creates and returns a new `stonith_t` object, whose
+  `cmds` member has methods for connect, disconnect, fence, etc.
+* the `fence()` method creates and sends a `STONITH_OP_FENCE` XML request with
+  the desired action and target node. Callers do not have to choose or even
+  have any knowledge about particular fencing devices.
+
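For orientation, a hedged sketch of a minimal external client using the API highlighted above. The entry points named here (`stonith_api_new()`, `cmds->connect()`, `cmds->fence()`, `stonith_api_delete()`) are the ones the README refers to, but exact signatures can vary between releases; this block is editorial and not part of the patch:

```c
#include <errno.h>
#include <crm/error.h>
#include <crm/stonith-ng.h>

/* Minimal fencing client sketch: connect to the local stonithd and ask it
 * to reboot a node, without naming any particular device. */
int fence_node_example(const char *target)
{
    stonith_t *st = stonith_api_new();
    int rc;

    if (st == NULL) {
        return -ENOMEM;
    }

    rc = st->cmds->connect(st, "example-client", NULL);
    if (rc == pcmk_ok) {
        /* action "reboot", 120-second timeout, no tolerance window */
        rc = st->cmds->fence(st, st_opt_sync_call, target, "reboot", 120, 0);
        st->cmds->disconnect(st);
    }
    stonith_api_delete(st);
    return rc;
}
```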
+### Fencing queries
+
+The function calls for a stonith request go something like this as of this writing:
+
+The local `stonithd` receives the client's request via an IPC or messaging
+layer callback, which calls
+* `stonith_command()`, which (for requests) calls
+  * `handle_request()`, which (for `STONITH_OP_FENCE` from a client) calls
+    * `initiate_remote_stonith_op()`, which creates a `STONITH_OP_QUERY` XML
+      request with the target, desired action, timeout, etc., then broadcasts
+      the operation to the cluster group (i.e. all `stonithd` instances) and
+      starts a timer. The query is broadcast because (1) location constraints
+      might prevent the local node from accessing the stonith device directly,
+      and (2) even if the local node does have direct access, another node
+      might be preferred to carry out the fencing.
+
+Each `stonithd` receives the original `stonithd's STONITH_OP_QUERY` broadcast
+request via IPC or messaging layer callback, which calls:
+* `stonith_command()`, which (for requests) calls
+  * `handle_request()`, which (for `STONITH_OP_QUERY` from a peer) calls
+    * `stonith_query()`, which calls
+      * `get_capable_devices()` with `stonith_query_capable_device_cb()` to add
+        device information to an XML reply and send it. (A message is
+        considered a reply if it contains `T_STONITH_REPLY`, which is only set
+        by `stonithd` peers, not clients.)
+
+The original `stonithd` receives all peers' `STONITH_OP_QUERY` replies via IPC
+or messaging layer callback, which calls:
+* `stonith_command()`, which (for replies) calls
+  * `handle_reply()` which (for `STONITH_OP_QUERY`) calls
+    * `process_remote_stonith_query()`, which allocates a new query result
+      structure, parses device information into it, and adds it to the
+      operation object. It increments the number of replies received for this
+      operation and compares it against the expected number of replies (i.e.
+      the number of active peers); if this is the last expected reply, it calls
+      * `call_remote_stonith()`, which calculates the timeout and sends
+        `STONITH_OP_FENCE` request(s) to carry out the fencing. If the target
+        node has a fencing "topology" (which allows specifications such as
+        "this node can be fenced either with device A, or devices B and C in
+        combination"), it will choose the device(s) and send out as many
+        requests as needed. For each chosen device, it also chooses a peer; a
+        peer is preferred if it has "verified" access to that device, meaning
+        that it has the device "running" on it and thus has a monitor
+        operation ensuring reachability.
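+
+The following is a hypothetical sketch (not the real stonithd data structures)
+of the "verified access" preference just described: among peers that reported
+a given device in their query replies, one with verified access wins, and any
+capable peer is the fallback.
+
+```c
+#include <stddef.h>
+
+/* Illustrative only: peer_t and its fields are invented for this sketch. */
+typedef struct {
+    const char *name;
+    int has_device;   /* device appeared in this peer's query reply */
+    int verified;     /* device is "running" on this peer (monitor confirms access) */
+} peer_t;
+
+static const peer_t *
+choose_peer(const peer_t *peers, int n_peers)
+{
+    const peer_t *fallback = NULL;
+
+    for (int i = 0; i < n_peers; i++) {
+        if (!peers[i].has_device) {
+            continue;
+        }
+        if (peers[i].verified) {
+            return &peers[i];   /* prefer verified access */
+        }
+        if (fallback == NULL) {
+            fallback = &peers[i];
+        }
+    }
+    return fallback;            /* otherwise, any capable peer (or NULL) */
+}
+```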
+
+### Fencing operations
+
+As of this writing, each `STONITH_OP_FENCE` request goes roughly like this:
+
+The chosen peer `stonithd` receives the `STONITH_OP_FENCE` request via IPC or
+messaging layer callback, which calls:
+* `stonith_command()`, which (for requests) calls
+  * `handle_request()`, which (for `STONITH_OP_FENCE` from a peer) calls
+    * `stonith_fence()`, which calls
+      * `schedule_stonith_command()` (using the supplied device if
+        `F_STONITH_DEVICE` was set, otherwise the highest-priority capable
+        device obtained via `get_capable_devices()` with
+        `stonith_fence_get_devices_cb()`), which adds the operation to the
+        device's pending operations list and triggers processing.
+
+The chosen peer `stonithd`'s mainloop is triggered and calls
+* `stonith_device_dispatch()`, which calls
+  * `stonith_device_execute()`, which pops the next item off the device's
+    pending operations list. If acting as the (internally implemented) watchdog
+    agent, it panics the node; otherwise it calls
+    * `stonith_action_create()` and `stonith_action_execute_async()` to call the fencing agent.
+
+The chosen peer `stonithd`'s mainloop is triggered again once the fencing agent
+returns, and calls
+* `stonith_action_async_done()`, which adds the results to an action object,
+  then calls its
+  * done callback (`st_child_done()`), which calls `schedule_stonith_command()`
+    for a new device if there are further required actions to execute or if the
+    original action failed, then builds and sends an XML reply to the original
+    `stonithd` (via `stonith_send_async_reply()`), then checks whether any
+    pending actions are the same as the one just executed and merges them if so.
+
+### Fencing replies
+
+The original `stonithd` receives the `STONITH_OP_FENCE` reply via IPC or
+messaging layer callback, which calls:
+* `stonith_command()`, which (for replies) calls
+  * `handle_reply()`, which calls
+    * `process_remote_stonith_exec()`, which calls either
+      `call_remote_stonith()` (to retry a failed operation, or to try the next
+      device in a topology if appropriate, which issues a new
+      `STONITH_OP_FENCE` request, proceeding as before) or `remote_op_done()`
+      (if the operation has definitively failed or succeeded).
+      * `remote_op_done()` broadcasts the result to all peers.
+
+Finally, all peers receive the broadcast result and call
+* `remote_op_done()`, which sends the result to all local clients.
diff --git a/fencing/commands.c b/fencing/commands.c
index c9975d3..0d2d614 100644
--- a/fencing/commands.c
+++ b/fencing/commands.c
@@ -53,15 +53,24 @@ GHashTable *topology = NULL;
 GList *cmd_list = NULL;
 
 struct device_search_s {
+    /* target of fence action */
     char *host;
+    /* requested fence action */
     char *action;
+    /* timeout to use if a device is queried dynamically for possible targets */
     int per_device_timeout;
+    /* number of registered fencing devices at time of request */
     int replies_needed;
+    /* number of device replies received so far */
     int replies_received;
+    /* whether the target is eligible to perform requested action (or off) */
     bool allow_suicide;
 
+    /* private data to pass to search callback function */
     void *user_data;
+    /* function to call when all replies have been received */
     void (*callback) (GList * devices, void *user_data);
+    /* devices capable of performing requested action (or off if remapping) */
     GListPtr capable;
 };
 
@@ -173,6 +182,17 @@ get_action_timeout(stonith_device_t * device, const char *action, int default_ti
         char buffer[64] = { 0, };
         const char *value = NULL;
 
+        /* If "reboot" was requested but the device does not support it,
+         * we will remap to "off", so check timeout for "off" instead
+         */
+        if (safe_str_eq(action, "reboot")
+            && is_not_set(device->flags, st_device_supports_reboot)) {
+            crm_trace("%s doesn't support reboot, using timeout for off instead",
+                      device->id);
+            action = "off";
+        }
+
+        /* If the device config specified an action-specific timeout, use it */
         snprintf(buffer, sizeof(buffer) - 1, "pcmk_%s_timeout", action);
         value = g_hash_table_lookup(device->params, buffer);
         if (value) {
@@ -1241,6 +1261,38 @@ search_devices_record_result(struct device_search_s *search, const char *device,
     }
 }
 
+/*
+ * \internal
+ * \brief Check whether the local host is allowed to execute a fencing action
+ *
+ * \param[in] device         Fence device to check
+ * \param[in] action         Fence action to check
+ * \param[in] target         Hostname of fence target
+ * \param[in] allow_suicide  Whether self-fencing is allowed for this operation
+ *
+ * \return TRUE if local host is allowed to execute action, FALSE otherwise
+ */
+static gboolean
+localhost_is_eligible(const stonith_device_t *device, const char *action,
+                      const char *target, gboolean allow_suicide)
+{
+    gboolean localhost_is_target = safe_str_eq(target, stonith_our_uname);
+
+    if (device && action && device->on_target_actions
+        && strstr(device->on_target_actions, action)) {
+        if (!localhost_is_target) {
+            crm_trace("%s operation with %s can only be executed for localhost not %s",
+                      action, device->id, target);
+            return FALSE;
+        }
+
+    } else if (localhost_is_target && !allow_suicide) {
+        crm_trace("%s operation does not support self-fencing", action);
+        return FALSE;
+    }
+    return TRUE;
+}
+
 static void
 can_fence_host_with_device(stonith_device_t * dev, struct device_search_s *search)
 {
@@ -1258,19 +1310,20 @@ can_fence_host_with_device(stonith_device_t * dev, struct device_search_s *searc
         goto search_report_results;
     }
 
-    if (dev->on_target_actions &&
-        search->action &&
-        strstr(dev->on_target_actions, search->action)) {
-        /* this device can only execute this action on the target node */
-
-        if(safe_str_neq(host, stonith_our_uname)) {
-            crm_trace("%s operation with %s can only be executed for localhost not %s",
-                      search->action, dev->id, host);
+    /* Short-circuit query if this host is not allowed to perform the action */
+    if (safe_str_eq(search->action, "reboot")) {
+        /* A "reboot" *might* get remapped to "off" then "on", so short-circuit
+         * only if all three are disallowed. If only one or two are disallowed,
+         * we'll report that with the results. We never allow suicide for
+         * remapped "on" operations because the host is off at that point.
+         */
+        if (!localhost_is_eligible(dev, "reboot", host, search->allow_suicide)
+            && !localhost_is_eligible(dev, "off", host, search->allow_suicide)
+            && !localhost_is_eligible(dev, "on", host, FALSE)) {
             goto search_report_results;
         }
-
-    } else if(safe_str_eq(host, stonith_our_uname) && search->allow_suicide == FALSE) {
-        crm_trace("%s operation does not support self-fencing", search->action);
+    } else if (!localhost_is_eligible(dev, search->action, host,
+                                      search->allow_suicide)) {
         goto search_report_results;
     }
 
@@ -1423,6 +1476,85 @@ struct st_query_data {
     int call_options;
 };
 
+/*
+ * \internal
+ * \brief Add action-specific attributes to query reply XML
+ *
+ * \param[in,out] xml     XML to add attributes to
+ * \param[in]     action  Fence action
+ * \param[in]     device  Fence device
+ */
+static void
+add_action_specific_attributes(xmlNode *xml, const char *action,
+                               stonith_device_t *device)
+{
+    int action_specific_timeout;
+    int delay_max;
+
+    CRM_CHECK(xml && action && device, return);
+
+    if (is_action_required(action, device)) {
+        crm_trace("Action %s is required on %s", action, device->id);
+        crm_xml_add_int(xml, F_STONITH_DEVICE_REQUIRED, 1);
+    }
+
+    action_specific_timeout = get_action_timeout(device, action, 0);
+    if (action_specific_timeout) {
+        crm_trace("Action %s has timeout %dms on %s",
+                  action, action_specific_timeout, device->id);
+        crm_xml_add_int(xml, F_STONITH_ACTION_TIMEOUT, action_specific_timeout);
+    }
+
+    delay_max = get_action_delay_max(device, action);
+    if (delay_max > 0) {
+        crm_trace("Action %s has maximum random delay %dms on %s",
+                  action, delay_max, device->id);
+        crm_xml_add_int(xml, F_STONITH_DELAY_MAX, delay_max / 1000);
+    }
+}
+
+/*
+ * \internal
+ * \brief Add "disallowed" attribute to query reply XML if appropriate
+ *
+ * \param[in,out] xml            XML to add attribute to
+ * \param[in]     action         Fence action
+ * \param[in]     device         Fence device
+ * \param[in]     target         Fence target
+ * \param[in]     allow_suicide  Whether self-fencing is allowed
+ */
+static void
+add_disallowed(xmlNode *xml, const char *action, stonith_device_t *device,
+               const char *target, gboolean allow_suicide)
+{
+    if (!localhost_is_eligible(device, action, target, allow_suicide)) {
+        crm_trace("Action %s on %s is disallowed for local host",
+                  action, device->id);
+        crm_xml_add(xml, F_STONITH_ACTION_DISALLOWED, XML_BOOLEAN_TRUE);
+    }
+}
+
+/*
+ * \internal
+ * \brief Add child element with action-specific values to query reply XML
+ *
+ * \param[in,out] xml            XML to add attribute to
+ * \param[in]     action         Fence action
+ * \param[in]     device         Fence device
+ * \param[in]     target         Fence target
+ * \param[in]     allow_suicide  Whether self-fencing is allowed
+ */
+static void
+add_action_reply(xmlNode *xml, const char *action, stonith_device_t *device,
+               const char *target, gboolean allow_suicide)
+{
+    xmlNode *child = create_xml_node(xml, F_STONITH_ACTION);
+
+    crm_xml_add(child, XML_ATTR_ID, action);
+    add_action_specific_attributes(child, action, device);
+    add_disallowed(child, action, device, target, allow_suicide);
+}
+
 static void
 stonith_query_capable_device_cb(GList * devices, void *user_data)
 {
@@ -1432,13 +1564,12 @@ stonith_query_capable_device_cb(GList * devices, void *user_data)
     xmlNode *list = NULL;
     GListPtr lpc = NULL;
 
-    /* Pack the results into data */
+    /* Pack the results into XML */
     list = create_xml_node(NULL, __FUNCTION__);
     crm_xml_add(list, F_STONITH_TARGET, query->target);
     for (lpc = devices; lpc != NULL; lpc = lpc->next) {
         stonith_device_t *device = g_hash_table_lookup(device_list, lpc->data);
-        int action_specific_timeout;
-        int delay_max;
+        const char *action = query->action;
 
         if (!device) {
             /* It is possible the device got unregistered while
@@ -1448,24 +1579,44 @@ stonith_query_capable_device_cb(GList * devices, void *user_data)
 
         available_devices++;
 
-        action_specific_timeout = get_action_timeout(device, query->action, 0);
         dev = create_xml_node(list, F_STONITH_DEVICE);
         crm_xml_add(dev, XML_ATTR_ID, device->id);
         crm_xml_add(dev, "namespace", device->namespace);
         crm_xml_add(dev, "agent", device->agent);
         crm_xml_add_int(dev, F_STONITH_DEVICE_VERIFIED, device->verified);
-        if (is_action_required(query->action, device)) {
-            crm_xml_add_int(dev, F_STONITH_DEVICE_REQUIRED, 1);
-        }
-        if (action_specific_timeout) {
-            crm_xml_add_int(dev, F_STONITH_ACTION_TIMEOUT, action_specific_timeout);
+
+        /* If the originating stonithd wants to reboot the node, and we have a
+         * capable device that doesn't support "reboot", remap to "off" instead.
+         */
+        if (is_not_set(device->flags, st_device_supports_reboot)
+            && safe_str_eq(query->action, "reboot")) {
+            crm_trace("%s doesn't support reboot, using values for off instead",
+                      device->id);
+            action = "off";
         }
 
-        delay_max = get_action_delay_max(device, query->action);
-        if (delay_max > 0) {
-            crm_xml_add_int(dev, F_STONITH_DELAY_MAX, delay_max / 1000);
+        /* Add action-specific values if available */
+        add_action_specific_attributes(dev, action, device);
+        if (safe_str_eq(query->action, "reboot")) {
+            /* A "reboot" *might* get remapped to "off" then "on", so after
+             * sending the "reboot"-specific values in the main element, we add
+             * sub-elements for "off" and "on" values.
+             *
+             * We short-circuited earlier if "reboot", "off" and "on" are all
+             * disallowed for the local host. However if only one or two are
+             * disallowed, we send back the results and mark which ones are
+             * disallowed. If "reboot" is disallowed, this might cause problems
+             * with older stonithd versions, which won't check for it. Older
+             * versions will ignore "off" and "on", so they are not a problem.
+             */
+            add_disallowed(dev, action, device, query->target,
+                           is_set(query->call_options, st_opt_allow_suicide));
+            add_action_reply(dev, "off", device, query->target,
+                             is_set(query->call_options, st_opt_allow_suicide));
+            add_action_reply(dev, "on", device, query->target, FALSE);
         }
 
+        /* A query without a target wants device parameters */
         if (query->target == NULL) {
             xmlNode *attrs = create_xml_node(dev, XML_TAG_ATTRS);
 
@@ -1481,7 +1632,7 @@ stonith_query_capable_device_cb(GList * devices, void *user_data)
     }
 
     if (list != NULL) {
-        crm_trace("Attaching query list output");
+        crm_log_xml_trace(list, "Add query results");
         add_message_xml(query->reply, F_STONITH_CALLDATA, list);
     }
     stonith_send_reply(query->reply, query->call_options, query->remote_peer, query->client_id);
@@ -1766,6 +1917,14 @@ st_child_done(GPid pid, int rc, const char *output, gpointer user_data)
             continue;
         }
 
+        /* Duplicate merging will do the right thing for either type of remapped
+         * reboot. If the executing stonithd remapped an unsupported reboot to
+         * off, then cmd->action will be reboot and will be merged with any
+         * other reboot requests. If the originating stonithd remapped a
+         * topology reboot to off then on, we will get here once with
+         * cmd->action "off" and once with "on", and they will be merged
+         * separately with similar requests.
+         */
         crm_notice
             ("Merging stonith action %s for node %s originating from client %s with identical stonith request from client %s",
              cmd_other->action, cmd_other->victim, cmd_other->client_name, cmd->client_name);
diff --git a/fencing/internal.h b/fencing/internal.h
index 46bd3bf..5fb8f9c 100644
--- a/fencing/internal.h
+++ b/fencing/internal.h
@@ -51,6 +51,17 @@ typedef struct stonith_device_s {
     gboolean api_registered;
 } stonith_device_t;
 
+/* These values are used to index certain arrays by "phase". Usually an
+ * operation has only one "phase", so phase is always zero. However, some
+ * reboots are remapped to "off" then "on", in which case "reboot" will be
+ * phase 0, "off" will be phase 1 and "on" will be phase 2.
+ */
+enum st_remap_phase {
+    st_phase_requested = 0,
+    st_phase_off = 1,
+    st_phase_on = 2
+};
+
 typedef struct remote_fencing_op_s {
     /* The unique id associated with this operation */
     char *id;
@@ -97,7 +108,7 @@ typedef struct remote_fencing_op_s {
     long long call_options;
 
     /*! The current state of the remote operation. This indicates
-     * what phase the op is in, query, exec, done, duplicate, failed. */
+     * what stage the op is in, query, exec, done, duplicate, failed. */
     enum op_state state;
     /*! The node that owns the remote operation */
     char *originator;
@@ -114,10 +125,17 @@ typedef struct remote_fencing_op_s {
 
     /*! The current topology level being executed */
     guint level;
-
-    /*! List of required devices the topology must execute regardless of what
-     * topology level they exist at. */
-    GListPtr required_list;
+    /*! The current operation phase being executed */
+    enum st_remap_phase phase;
+
+    /* For phase 0 or 1 (requested action or a remapped "off"), required devices
+     * will be executed regardless of what topology level is being executed
+     * currently. For phase 2 (remapped "on"), required devices will not be
+     * attempted, because the cluster will execute them automatically when the
+     * node next joins the cluster.
+     */
+    /*! Lists of devices marked as required for each phase */
+    GListPtr required_list[3];
     /*! The device list of all the devices at the current executing topology level. */
     GListPtr devices_list;
     /*! Current entry in the topology device list */
@@ -129,6 +147,20 @@ typedef struct remote_fencing_op_s {
 
 } remote_fencing_op_t;
 
+/*
+ * Complex fencing requirements are specified via fencing topologies.
+ * A topology consists of levels; each level is a list of fencing devices.
+ * Topologies are stored in a hash table by node name. When a node needs to be
+ * fenced, if it has an entry in the topology table, the levels are tried
+ * sequentially, and the devices in each level are tried sequentially.
+ * Fencing is considered successful as soon as any level succeeds;
+ * a level is considered successful if all its devices succeed.
+ * Essentially, all devices at a given level are "and-ed" and the
+ * levels are "or-ed".
+ *
+ * This structure is used for the topology table entries.
+ * Topology levels start from 1, so levels[0] is unused and always NULL.
+ */
 typedef struct stonith_topology_s {
     char *node;
     GListPtr levels[ST_LEVEL_MAX];
diff --git a/fencing/main.c b/fencing/main.c
index a499175..46d7352 100644
--- a/fencing/main.c
+++ b/fencing/main.c
@@ -1234,7 +1234,7 @@ struct qb_ipcs_service_handlers ipc_callbacks = {
 static void
 st_peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *data)
 {
-    if (type == crm_status_uname) {
+    if (type != crm_status_processes) {
         /*
          * This is a hack until we can send to a nodeid and/or we fix node name lookups
          * These messages are ignored in stonith_peer_callback()
diff --git a/fencing/regression.py.in b/fencing/regression.py.in
index fe6d418..b4e6f08 100644
--- a/fencing/regression.py.in
+++ b/fencing/regression.py.in
@@ -23,861 +23,937 @@ import shlex
 import time
 
 def output_from_command(command):
-	test = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-	test.wait()
+    test = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    test.wait()
 
-	return test.communicate()[0].split("\n")
+    return test.communicate()[0].split("\n")
 
 class Test:
-	def __init__(self, name, description, verbose = 0, with_cpg = 0):
-		self.name = name
-		self.description = description
-		self.cmds = []
-		self.verbose = verbose
+    def __init__(self, name, description, verbose = 0, with_cpg = 0):
+        self.name = name
+        self.description = description
+        self.cmds = []
+        self.verbose = verbose
 
-		self.result_txt = ""
-		self.cmd_tool_output = ""
-		self.result_exitcode = 0;
+        self.result_txt = ""
+        self.cmd_tool_output = ""
+        self.result_exitcode = 0;
 
-		self.stonith_options = "-s"
-		self.enable_corosync = 0
+        self.stonith_options = "-s"
+        self.enable_corosync = 0
 
-		if with_cpg:
-			self.stonith_options = "-c"
-			self.enable_corosync = 1
+        if with_cpg:
+            self.stonith_options = "-c"
+            self.enable_corosync = 1
 
-		self.stonith_process = None
-		self.stonith_output = ""
-		self.stonith_patterns = []
-		self.negative_stonith_patterns = []
+        self.stonith_process = None
+        self.stonith_output = ""
+        self.stonith_patterns = []
+        self.negative_stonith_patterns = []
 
-		self.executed = 0
+        self.executed = 0
 
-		rsc_classes = output_from_command("crm_resource --list-standards")
+        rsc_classes = output_from_command("crm_resource --list-standards")
 
-	def __new_cmd(self, cmd, args, exitcode, stdout_match = "", no_wait = 0, stdout_negative_match = "", kill=None):
-		self.cmds.append(
-			{
-				"cmd" : cmd,
-				"kill" : kill,
-				"args" : args,
-				"expected_exitcode" : exitcode,
-				"stdout_match" : stdout_match,
-				"stdout_negative_match" : stdout_negative_match,
-				"no_wait" : no_wait,
-			}
-		)
+    def __new_cmd(self, cmd, args, exitcode, stdout_match = "", no_wait = 0, stdout_negative_match = "", kill=None):
+        self.cmds.append(
+            {
+                "cmd" : cmd,
+                "kill" : kill,
+                "args" : args,
+                "expected_exitcode" : exitcode,
+                "stdout_match" : stdout_match,
+                "stdout_negative_match" : stdout_negative_match,
+                "no_wait" : no_wait,
+            }
+        )
 
-	def stop_pacemaker(self):
-		cmd = shlex.split("killall -9 -q pacemakerd")
-		test = subprocess.Popen(cmd, stdout=subprocess.PIPE)
-		test.wait()
+    def stop_pacemaker(self):
+        cmd = shlex.split("killall -9 -q pacemakerd")
+        test = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+        test.wait()
 
-	def start_environment(self):
-		### make sure we are in full control here ###
-		self.stop_pacemaker()
+    def start_environment(self):
+        ### make sure we are in full control here ###
+        self.stop_pacemaker()
 
-		cmd = shlex.split("killall -9 -q stonithd")
-		test = subprocess.Popen(cmd, stdout=subprocess.PIPE)
-		test.wait()
+        cmd = shlex.split("killall -9 -q stonithd")
+        test = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+        test.wait()
 
-		if self.verbose:
-			self.stonith_options = self.stonith_options + " -V"
-			print "Starting stonithd with %s" % self.stonith_options
+        if self.verbose:
+            self.stonith_options = self.stonith_options + " -V"
+            print "Starting stonithd with %s" % self.stonith_options
 
-		if os.path.exists("/tmp/stonith-regression.log"):
-			os.remove('/tmp/stonith-regression.log')
+        if os.path.exists("/tmp/stonith-regression.log"):
+            os.remove('/tmp/stonith-regression.log')
 
-		self.stonith_process = subprocess.Popen(
-			shlex.split("@CRM_DAEMON_DIR@/stonithd %s -l /tmp/stonith-regression.log" % self.stonith_options))
+        self.stonith_process = subprocess.Popen(
+            shlex.split("@CRM_DAEMON_DIR@/stonithd %s -l /tmp/stonith-regression.log" % self.stonith_options))
 
-		time.sleep(1)
-
-	def clean_environment(self):
-		if self.stonith_process:
-			self.stonith_process.terminate()
-			self.stonith_process.wait()
-
-		self.stonith_output = ""
-		self.stonith_process = None
-
-		f = open('/tmp/stonith-regression.log', 'r')
-		for line in f.readlines():
-			self.stonith_output = self.stonith_output + line
-
-		if self.verbose:
-			print "Daemon Output Start"
-			print self.stonith_output
-			print "Daemon Output End"
-		os.remove('/tmp/stonith-regression.log')
-
-	def add_stonith_log_pattern(self, pattern):
-		self.stonith_patterns.append(pattern)
-
-	def add_stonith_negative_log_pattern(self, pattern):
-		self.negative_stonith_patterns.append(pattern)
-
-	def add_cmd(self, cmd, args):
-		self.__new_cmd(cmd, args, 0, "")
-
-	def add_cmd_no_wait(self, cmd, args):
-		self.__new_cmd(cmd, args, 0, "", 1)
-
-	def add_cmd_check_stdout(self, cmd, args, match, no_match = ""):
-		self.__new_cmd(cmd, args, 0, match, 0, no_match)
-
-	def add_expected_fail_cmd(self, cmd, args, exitcode = 255):
-		self.__new_cmd(cmd, args, exitcode, "")
-
-	def get_exitcode(self):
-		return self.result_exitcode
-
-	def print_result(self, filler):
-		print "%s%s" % (filler, self.result_txt)
-
-	def run_cmd(self, args):
-		cmd = shlex.split(args['args'])
-		cmd.insert(0, args['cmd'])
-
-		if self.verbose:
-			print "\n\nRunning: "+" ".join(cmd)
-		test = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-
-		if args['kill']:
-			if self.verbose:
-				print "Also running: "+args['kill']
-			subprocess.Popen(shlex.split(args['kill']))
-
-		if args['no_wait'] == 0:
-			test.wait()
-		else:
-			return 0
-
-		output_res = test.communicate()
-		output = output_res[0] + output_res[1]
-
-		if self.verbose:
-			print output
-
-		if args['stdout_match'] != "" and output.count(args['stdout_match']) == 0:
-			test.returncode = -2
-			print "STDOUT string '%s' was not found in cmd output: %s" % (args['stdout_match'], output)
-
-		if args['stdout_negative_match'] != "" and output.count(args['stdout_negative_match']) != 0:
-			test.returncode = -2
-			print "STDOUT string '%s' was found in cmd output: %s" % (args['stdout_negative_match'], output)
-
-		return test.returncode;
-
-
-	def count_negative_matches(self, outline):
-		count = 0
-		for line in self.negative_stonith_patterns:
-			if outline.count(line):
-				count = 1
-				if self.verbose:
-					print "This pattern should not have matched = '%s" % (line)
-		return count
-
-	def match_stonith_patterns(self):
-		negative_matches = 0
-		cur = 0
-		pats = self.stonith_patterns
-		total_patterns = len(self.stonith_patterns)
-
-		if len(self.stonith_patterns) == 0:
-			return
-
-		for line in self.stonith_output.split("\n"):
-			negative_matches = negative_matches + self.count_negative_matches(line)
-			if len(pats) == 0:
-				continue
-			cur = -1
-			for p in pats:
-				cur = cur + 1
-				if line.count(pats[cur]):
-					del pats[cur]
-					break
-
-		if len(pats) > 0 or negative_matches:
-			if self.verbose:
-				for p in pats:
-					print "Pattern Not Matched = '%s'" % p
-
-			self.result_txt = "FAILURE - '%s' failed. %d patterns out of %d not matched. %d negative matches." % (self.name, len(pats), total_patterns, negative_matches)
-			self.result_exitcode = -1
-
-	def run(self):
-		res = 0
-		i = 1
-		self.start_environment()
-
-		if self.verbose:
-			print "\n--- START TEST - %s" % self.name
-
-		self.result_txt = "SUCCESS - '%s'" % (self.name)
-		self.result_exitcode = 0
-		for cmd in self.cmds:
-			res = self.run_cmd(cmd)
-			if res != cmd['expected_exitcode']:
-				print "Step %d FAILED - command returned %d, expected %d" % (i, res, cmd['expected_exitcode'])
-				self.result_txt = "FAILURE - '%s' failed at step %d. Command: %s %s" % (self.name, i, cmd['cmd'], cmd['args'])
-				self.result_exitcode = -1
-				break
-			else:
-				if self.verbose:
-					print "Step %d SUCCESS" % (i)
-			i = i + 1
-		self.clean_environment()
-
-		if self.result_exitcode == 0:
-			self.match_stonith_patterns()
-
-		print self.result_txt
-		if self.verbose:
-			print "--- END TEST - %s\n" % self.name
-
-		self.executed = 1
-		return res
+        time.sleep(1)
+
+    def clean_environment(self):
+        if self.stonith_process:
+            self.stonith_process.terminate()
+            self.stonith_process.wait()
+
+        self.stonith_output = ""
+        self.stonith_process = None
+
+        f = open('/tmp/stonith-regression.log', 'r')
+        for line in f.readlines():
+            self.stonith_output = self.stonith_output + line
+
+        if self.verbose:
+            print "Daemon Output Start"
+            print self.stonith_output
+            print "Daemon Output End"
+        os.remove('/tmp/stonith-regression.log')
+
+    def add_stonith_log_pattern(self, pattern):
+        self.stonith_patterns.append(pattern)
+
+    def add_stonith_negative_log_pattern(self, pattern):
+        self.negative_stonith_patterns.append(pattern)
+
+    def add_cmd(self, cmd, args):
+        self.__new_cmd(cmd, args, 0, "")
+
+    def add_cmd_no_wait(self, cmd, args):
+        self.__new_cmd(cmd, args, 0, "", 1)
+
+    def add_cmd_check_stdout(self, cmd, args, match, no_match = ""):
+        self.__new_cmd(cmd, args, 0, match, 0, no_match)
+
+    def add_expected_fail_cmd(self, cmd, args, exitcode = 255):
+        self.__new_cmd(cmd, args, exitcode, "")
+
+    def get_exitcode(self):
+        return self.result_exitcode
+
+    def print_result(self, filler):
+        print "%s%s" % (filler, self.result_txt)
+
+    def run_cmd(self, args):
+        cmd = shlex.split(args['args'])
+        cmd.insert(0, args['cmd'])
+
+        if self.verbose:
+            print "\n\nRunning: "+" ".join(cmd)
+        test = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+        if args['kill']:
+            if self.verbose:
+                print "Also running: "+args['kill']
+            subprocess.Popen(shlex.split(args['kill']))
+
+        if args['no_wait'] == 0:
+            test.wait()
+        else:
+            return 0
+
+        output_res = test.communicate()
+        output = output_res[0] + output_res[1]
+
+        if self.verbose:
+            print output
+
+        if args['stdout_match'] != "" and output.count(args['stdout_match']) == 0:
+            test.returncode = -2
+            print "STDOUT string '%s' was not found in cmd output: %s" % (args['stdout_match'], output)
+
+        if args['stdout_negative_match'] != "" and output.count(args['stdout_negative_match']) != 0:
+            test.returncode = -2
+            print "STDOUT string '%s' was found in cmd output: %s" % (args['stdout_negative_match'], output)
+
+        return test.returncode;
+
+
+    def count_negative_matches(self, outline):
+        count = 0
+        for line in self.negative_stonith_patterns:
+            if outline.count(line):
+                count = 1
+                if self.verbose:
+                    print "This pattern should not have matched = '%s" % (line)
+        return count
+
+    def match_stonith_patterns(self):
+        negative_matches = 0
+        cur = 0
+        pats = self.stonith_patterns
+        total_patterns = len(self.stonith_patterns)
+
+        if len(self.stonith_patterns) == 0:
+            return
+
+        for line in self.stonith_output.split("\n"):
+            negative_matches = negative_matches + self.count_negative_matches(line)
+            if len(pats) == 0:
+                continue
+            cur = -1
+            for p in pats:
+                cur = cur + 1
+                if line.count(pats[cur]):
+                    del pats[cur]
+                    break
+
+        if len(pats) > 0 or negative_matches:
+            if self.verbose:
+                for p in pats:
+                    print "Pattern Not Matched = '%s'" % p
+
+            self.result_txt = "FAILURE - '%s' failed. %d patterns out of %d not matched. %d negative matches." % (self.name, len(pats), total_patterns, negative_matches)
+            self.result_exitcode = -1
+
+    def run(self):
+        res = 0
+        i = 1
+        self.start_environment()
+
+        if self.verbose:
+            print "\n--- START TEST - %s" % self.name
+
+        self.result_txt = "SUCCESS - '%s'" % (self.name)
+        self.result_exitcode = 0
+        for cmd in self.cmds:
+            res = self.run_cmd(cmd)
+            if res != cmd['expected_exitcode']:
+                print "Step %d FAILED - command returned %d, expected %d" % (i, res, cmd['expected_exitcode'])
+                self.result_txt = "FAILURE - '%s' failed at step %d. Command: %s %s" % (self.name, i, cmd['cmd'], cmd['args'])
+                self.result_exitcode = -1
+                break
+            else:
+                if self.verbose:
+                    print "Step %d SUCCESS" % (i)
+            i = i + 1
+        self.clean_environment()
+
+        if self.result_exitcode == 0:
+            self.match_stonith_patterns()
+
+        print self.result_txt
+        if self.verbose:
+            print "--- END TEST - %s\n" % self.name
+
+        self.executed = 1
+        return res
 
 class Tests:
-	def __init__(self, verbose = 0):
-		self.tests = []
-		self.verbose = verbose
-		self.autogen_corosync_cfg = 0
-		if not os.path.exists("/etc/corosync/corosync.conf"):
-			self.autogen_corosync_cfg = 1
-
-	def new_test(self, name, description, with_cpg = 0):
-		test = Test(name, description, self.verbose, with_cpg)
-		self.tests.append(test)
-		return test
-
-	def print_list(self):
-		print "\n==== %d TESTS FOUND ====" % (len(self.tests))
-		print "%35s - %s" % ("TEST NAME", "TEST DESCRIPTION")
-		print "%35s - %s" % ("--------------------", "--------------------")
-		for test in self.tests:
-			print "%35s - %s" % (test.name, test.description)
-		print "==== END OF LIST ====\n"
-
-
-	def start_corosync(self):
-		if self.verbose:
-			print "Starting corosync"
-
-		test = subprocess.Popen("corosync", stdout=subprocess.PIPE)
-		test.wait()
-		time.sleep(10)
-
-	def stop_corosync(self):
-		cmd = shlex.split("killall -9 -q corosync")
-		test = subprocess.Popen(cmd, stdout=subprocess.PIPE)
-		test.wait()
-
-	def run_single(self, name):
-		for test in self.tests:
-			if test.name == name:
-				test.run()
-				break;
-
-	def run_tests_matching(self, pattern):
-		for test in self.tests:
-			if test.name.count(pattern) != 0:
-				test.run()
-
-	def run_cpg_only(self):
-		for test in self.tests:
-			if test.enable_corosync:
-				test.run()
-
-	def run_no_cpg(self):
-		for test in self.tests:
-			if not test.enable_corosync:
-				test.run()
-
-	def run_tests(self):
-		for test in self.tests:
-			test.run()
-
-	def exit(self):
-		for test in self.tests:
-			if test.executed == 0:
-				continue
-
-			if test.get_exitcode() != 0:
-				sys.exit(-1)
-
-		sys.exit(0)
-
-	def print_results(self):
-		failures = 0;
-		success = 0;
-		print "\n\n======= FINAL RESULTS =========="
-		print "\n--- FAILURE RESULTS:"
-		for test in self.tests:
-			if test.executed == 0:
-				continue
-
-			if test.get_exitcode() != 0:
-				failures = failures + 1
-				test.print_result("    ")
-			else:
-				success = success + 1
-
-		if failures == 0:
-			print "    None"
-
-		print "\n--- TOTALS\n    Pass:%d\n    Fail:%d\n" % (success, failures)
-	def build_api_sanity_tests(self):
-		verbose_arg = ""
-		if self.verbose:
-			verbose_arg = "-V"
-
-		test = self.new_test("standalone_low_level_api_test", "Sanity test client api in standalone mode.")
-		test.add_cmd("@CRM_DAEMON_DIR@/stonith-test", "-t %s" % (verbose_arg))
-
-		test = self.new_test("cpg_low_level_api_test", "Sanity test client api using mainloop and cpg.", 1)
-		test.add_cmd("@CRM_DAEMON_DIR@/stonith-test", "-m %s" % (verbose_arg))
-
-	def build_custom_timeout_tests(self):
-		# custom timeout without topology
-		test = self.new_test("cpg_custom_timeout_1",
-				"Verify per device timeouts work as expected without using topology.", 1)
-		test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
-		test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\" -o \"pcmk_off_timeout=1\"")
-		test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\" -o \"pcmk_off_timeout=4\"")
-		test.add_cmd("stonith_admin", "-F node3 -t 2")
-		# timeout is 2+1+4 = 7
-		test.add_stonith_log_pattern("remote op timeout set to 7")
-
-		# custom timeout _WITH_ topology
-		test = self.new_test("cpg_custom_timeout_2",
-				"Verify per device timeouts work as expected _WITH_ topology.", 1)
-		test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
-		test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\" -o \"pcmk_off_timeout=1\"")
-		test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\" -o \"pcmk_off_timeout=4000\"")
-		test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1")
-		test.add_cmd("stonith_admin", "-r node3 -i 2 -v true1")
-		test.add_cmd("stonith_admin", "-r node3 -i 3 -v false2")
-		test.add_cmd("stonith_admin", "-F node3 -t 2")
-		# timeout is 2+1+4000 = 4003
-		test.add_stonith_log_pattern("remote op timeout set to 4003")
-
-	def build_fence_merge_tests(self):
-
-		### Simple test that overlapping fencing operations get merged
-		test = self.new_test("cpg_custom_merge_single",
-				"Verify overlapping identical fencing operations are merged, no fencing levels used.", 1)
-		test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
-		test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\" ")
-		test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
-		test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10")
-		test.add_cmd("stonith_admin", "-F node3 -t 10")
-		### one merger will happen
-		test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client")
-		### the pattern below signifies that both the original and duplicate operation completed
-		test.add_stonith_log_pattern("Operation off of node3 by")
-		test.add_stonith_log_pattern("Operation off of node3 by")
-
-		### Test that multiple mergers occur
-		test = self.new_test("cpg_custom_merge_multiple",
-				"Verify multiple overlapping identical fencing operations are merged", 1)
-		test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
-		test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\" ")
-		test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
-		test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10")
-		test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10")
-		test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10")
-		test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10")
-		test.add_cmd("stonith_admin", "-F node3 -t 10")
-		### 4 mergers should occur
-		test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client")
-		test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client")
-		test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client")
-		test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client")
-		### the pattern below signifies that both the original and duplicate operation completed
-		test.add_stonith_log_pattern("Operation off of node3 by")
-		test.add_stonith_log_pattern("Operation off of node3 by")
-		test.add_stonith_log_pattern("Operation off of node3 by")
-		test.add_stonith_log_pattern("Operation off of node3 by")
-		test.add_stonith_log_pattern("Operation off of node3 by")
-
-		### Test that multiple mergers occur with topologies used
-		test = self.new_test("cpg_custom_merge_with_topology",
-				"Verify multiple overlapping identical fencing operations are merged with fencing levels.", 1)
-		test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
-		test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\" ")
-		test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
-		test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1")
-		test.add_cmd("stonith_admin", "-r node3 -i 1 -v false2")
-		test.add_cmd("stonith_admin", "-r node3 -i 2 -v true1")
-		test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10")
-		test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10")
-		test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10")
-		test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10")
-		test.add_cmd("stonith_admin", "-F node3 -t 10")
-		### 4 mergers should occur
-		test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client")
-		test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client")
-		test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client")
-		test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client")
-		### the pattern below signifies that both the original and duplicate operation completed
-		test.add_stonith_log_pattern("Operation off of node3 by")
-		test.add_stonith_log_pattern("Operation off of node3 by")
-		test.add_stonith_log_pattern("Operation off of node3 by")
-		test.add_stonith_log_pattern("Operation off of node3 by")
-		test.add_stonith_log_pattern("Operation off of node3 by")
-
-
-		test = self.new_test("cpg_custom_no_merge",
-				"Verify differing fencing operations are not merged", 1)
-		test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3 node2\"")
-		test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3 node2\" ")
-		test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3 node2\"")
-		test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1")
-		test.add_cmd("stonith_admin", "-r node3 -i 1 -v false2")
-		test.add_cmd("stonith_admin", "-r node3 -i 2 -v true1")
-		test.add_cmd_no_wait("stonith_admin", "-F node2 -t 10")
-		test.add_cmd("stonith_admin", "-F node3 -t 10")
-		test.add_stonith_negative_log_pattern("Merging stonith action off for node node3 originating from client")
-
-	def build_standalone_tests(self):
-		test_types = [
-			{
-				"prefix" : "standalone" ,
-				"use_cpg" : 0,
-			},
-			{
-				"prefix" : "cpg" ,
-				"use_cpg" : 1,
-			},
-		]
-
-		# test what happens when all devices timeout
-		for test_type in test_types:
-			test = self.new_test("%s_fence_multi_device_failure" % test_type["prefix"],
-					"Verify that all devices timeout, a fencing failure is returned.", test_type["use_cpg"])
-			test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
-			test.add_cmd("stonith_admin", "-R false2  -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
-			test.add_cmd("stonith_admin", "-R false3 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
-			if test_type["use_cpg"] == 1:
-				test.add_expected_fail_cmd("stonith_admin", "-F node3 -t 2", 194)
-				test.add_stonith_log_pattern("remote op timeout set to 6")
-			else:
-				test.add_expected_fail_cmd("stonith_admin", "-F node3 -t 2", 55)
-
-			test.add_stonith_log_pattern("for host 'node3' with device 'false1' returned: ")
-			test.add_stonith_log_pattern("for host 'node3' with device 'false2' returned: ")
-			test.add_stonith_log_pattern("for host 'node3' with device 'false3' returned: ")
-
-		# test what happens when multiple devices can fence a node, but the first device fails.
-		for test_type in test_types:
-			test = self.new_test("%s_fence_device_failure_rollover" % test_type["prefix"],
-					"Verify that when one fence device fails for a node, the others are tried.", test_type["use_cpg"])
-			test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
-			test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
-			test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
-			test.add_cmd("stonith_admin", "-F node3 -t 2")
-
-			if test_type["use_cpg"] == 1:
-				test.add_stonith_log_pattern("remote op timeout set to 6")
-
-		# simple topology test for one device
-		for test_type in test_types:
-			if test_type["use_cpg"] == 0:
-				continue
-
-			test = self.new_test("%s_topology_simple" % test_type["prefix"],
-					"Verify all fencing devices at a level are used.", test_type["use_cpg"])
-			test.add_cmd("stonith_admin", "-R true  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
-
-			test.add_cmd("stonith_admin", "-r node3 -i 1 -v true")
-			test.add_cmd("stonith_admin", "-F node3 -t 2")
-
-			test.add_stonith_log_pattern("remote op timeout set to 2")
-			test.add_stonith_log_pattern("for host 'node3' with device 'true' returned: 0")
-
-
-		# add topology, delete topology, verify fencing still works 
-		for test_type in test_types:
-			if test_type["use_cpg"] == 0:
-				continue
-
-			test = self.new_test("%s_topology_add_remove" % test_type["prefix"],
-					"Verify fencing occurrs after all topology levels are removed", test_type["use_cpg"])
-			test.add_cmd("stonith_admin", "-R true  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
-
-			test.add_cmd("stonith_admin", "-r node3 -i 1 -v true")
-			test.add_cmd("stonith_admin", "-d node3 -i 1")
-			test.add_cmd("stonith_admin", "-F node3 -t 2")
-
-			test.add_stonith_log_pattern("remote op timeout set to 2")
-			test.add_stonith_log_pattern("for host 'node3' with device 'true' returned: 0")
-
-		# test what happens when the first fencing level has multiple devices.
-		for test_type in test_types:
-			if test_type["use_cpg"] == 0:
-				continue
-
-			test = self.new_test("%s_topology_device_fails" % test_type["prefix"],
-					"Verify if one device in a level fails, the other is tried.", test_type["use_cpg"])
-			test.add_cmd("stonith_admin", "-R false  -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
-			test.add_cmd("stonith_admin", "-R true  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
-
-			test.add_cmd("stonith_admin", "-r node3 -i 1 -v false")
-			test.add_cmd("stonith_admin", "-r node3 -i 2 -v true")
-			test.add_cmd("stonith_admin", "-F node3 -t 20")
-
-			test.add_stonith_log_pattern("remote op timeout set to 40")
-			test.add_stonith_log_pattern("for host 'node3' with device 'false' returned: -201")
-			test.add_stonith_log_pattern("for host 'node3' with device 'true' returned: 0")
-
-		# test what happens when the first fencing level fails.
-		for test_type in test_types:
-			if test_type["use_cpg"] == 0:
-				continue
-
-			test = self.new_test("%s_topology_multi_level_fails" % test_type["prefix"],
-					"Verify if one level fails, the next leve is tried.", test_type["use_cpg"])
-			test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
-			test.add_cmd("stonith_admin", "-R true2  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
-			test.add_cmd("stonith_admin", "-R true3  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
-			test.add_cmd("stonith_admin", "-R true4  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
-			test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
-			test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
-
-			test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1")
-			test.add_cmd("stonith_admin", "-r node3 -i 1 -v true1")
-			test.add_cmd("stonith_admin", "-r node3 -i 2 -v true2")
-			test.add_cmd("stonith_admin", "-r node3 -i 2 -v false2")
-			test.add_cmd("stonith_admin", "-r node3 -i 3 -v true3")
-			test.add_cmd("stonith_admin", "-r node3 -i 3 -v true4")
-
-			test.add_cmd("stonith_admin", "-F node3 -t 2")
-
-			test.add_stonith_log_pattern("remote op timeout set to 12")
-			test.add_stonith_log_pattern("for host 'node3' with device 'false1' returned: -201")
-			test.add_stonith_log_pattern("for host 'node3' with device 'false2' returned: -201")
-			test.add_stonith_log_pattern("for host 'node3' with device 'true3' returned: 0")
-			test.add_stonith_log_pattern("for host 'node3' with device 'true4' returned: 0")
-
-
-		# test what happens when the first fencing level had devices that no one has registered
-		for test_type in test_types:
-			if test_type["use_cpg"] == 0:
-				continue
-
-			test = self.new_test("%s_topology_missing_devices" % test_type["prefix"],
-					"Verify topology can continue with missing devices.", test_type["use_cpg"])
-			test.add_cmd("stonith_admin", "-R true2  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
-			test.add_cmd("stonith_admin", "-R true3  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
-			test.add_cmd("stonith_admin", "-R true4  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
-			test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
-
-			test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1")
-			test.add_cmd("stonith_admin", "-r node3 -i 1 -v true1")
-			test.add_cmd("stonith_admin", "-r node3 -i 2 -v true2")
-			test.add_cmd("stonith_admin", "-r node3 -i 2 -v false2")
-			test.add_cmd("stonith_admin", "-r node3 -i 3 -v true3")
-			test.add_cmd("stonith_admin", "-r node3 -i 3 -v true4")
-
-			test.add_cmd("stonith_admin", "-F node3 -t 2")
-
-		# Test what happens if multiple fencing levels are defined, and then the first one is removed.
-		for test_type in test_types:
-			if test_type["use_cpg"] == 0:
-				continue
-
-			test = self.new_test("%s_topology_level_removal" % test_type["prefix"],
-					"Verify level removal works.", test_type["use_cpg"])
-			test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
-			test.add_cmd("stonith_admin", "-R true2  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
-			test.add_cmd("stonith_admin", "-R true3  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
-			test.add_cmd("stonith_admin", "-R true4  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
-			test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
-			test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
-
-			test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1")
-			test.add_cmd("stonith_admin", "-r node3 -i 1 -v true1")
-
-			test.add_cmd("stonith_admin", "-r node3 -i 2 -v true2")
-			test.add_cmd("stonith_admin", "-r node3 -i 2 -v false2")
-
-			test.add_cmd("stonith_admin", "-r node3 -i 3 -v true3")
-			test.add_cmd("stonith_admin", "-r node3 -i 3 -v true4")
-
-			# Now remove level 2, verify none of the devices in level two are hit.
-			test.add_cmd("stonith_admin", "-d node3 -i 2")
-
-			test.add_cmd("stonith_admin", "-F node3 -t 20")
-
-			test.add_stonith_log_pattern("remote op timeout set to 8")
-			test.add_stonith_log_pattern("for host 'node3' with device 'false1' returned: -201")
-			test.add_stonith_negative_log_pattern("for host 'node3' with device 'false2' returned: ")
-			test.add_stonith_log_pattern("for host 'node3' with device 'true3' returned: 0")
-			test.add_stonith_log_pattern("for host 'node3' with device 'true4' returned: 0")
-
-		# test the stonith builds the correct list of devices that can fence a node.
-		for test_type in test_types:
-			test = self.new_test("%s_list_devices" % test_type["prefix"],
-					"Verify list of devices that can fence a node is correct", test_type["use_cpg"])
-			test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"")
-			test.add_cmd("stonith_admin", "-R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
-			test.add_cmd("stonith_admin", "-R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
-
-			test.add_cmd_check_stdout("stonith_admin", "-l node1 -V", "true2", "true1")
-			test.add_cmd_check_stdout("stonith_admin", "-l node1 -V", "true3", "true1")
-
-		# simple test of device monitor
-		for test_type in test_types:
-			test = self.new_test("%s_monitor" % test_type["prefix"],
-					"Verify device is reachable", test_type["use_cpg"])
-			test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"")
-			test.add_cmd("stonith_admin", "-R false1  -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
-
-			test.add_cmd("stonith_admin", "-Q true1")
-			test.add_cmd("stonith_admin", "-Q false1")
-			test.add_expected_fail_cmd("stonith_admin", "-Q true2", 237)
-
-		# Verify monitor occurs for duration of timeout period on failure
-		for test_type in test_types:
-			test = self.new_test("%s_monitor_timeout" % test_type["prefix"],
-					"Verify monitor uses duration of timeout period given.", test_type["use_cpg"])
-			test.add_cmd("stonith_admin", "-R true1  -a fence_dummy_monitor_fail -o \"pcmk_host_list=node3\"")
-			test.add_expected_fail_cmd("stonith_admin", "-Q true1 -t 5", 195)
-			test.add_stonith_log_pattern("Attempt 2 to execute")
-
-		# Verify monitor occurs for duration of timeout period on failure, but stops at max retries
-		for test_type in test_types:
-			test = self.new_test("%s_monitor_timeout_max_retries" % test_type["prefix"],
-					"Verify monitor retries until max retry value or timeout is hit.", test_type["use_cpg"])
-			test.add_cmd("stonith_admin", "-R true1  -a fence_dummy_monitor_fail -o \"pcmk_host_list=node3\"")
-			test.add_expected_fail_cmd("stonith_admin", "-Q true1 -t 15",195)
-			test.add_stonith_log_pattern("Attempted to execute agent fence_dummy_monitor_fail (list) the maximum number of times")
-
-		# simple register test
-		for test_type in test_types:
-			test = self.new_test("%s_register" % test_type["prefix"],
-					"Verify devices can be registered and un-registered", test_type["use_cpg"])
-			test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"")
-
-			test.add_cmd("stonith_admin", "-Q true1")
-
-			test.add_cmd("stonith_admin", "-D true1")
-
-			test.add_expected_fail_cmd("stonith_admin", "-Q true1", 237)
-
-
-		# simple reboot test
-		for test_type in test_types:
-			test = self.new_test("%s_reboot" % test_type["prefix"],
-					"Verify devices can be rebooted", test_type["use_cpg"])
-			test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"")
-
-			test.add_cmd("stonith_admin", "-B node3 -t 2")
-
-			test.add_cmd("stonith_admin", "-D true1")
-
-			test.add_expected_fail_cmd("stonith_admin", "-Q true1", 237)
-
-		# test fencing history.
-		for test_type in test_types:
-			if test_type["use_cpg"] == 0:
-				continue
-			test = self.new_test("%s_fence_history" % test_type["prefix"],
-					"Verify last fencing operation is returned.", test_type["use_cpg"])
-			test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"")
-
-			test.add_cmd("stonith_admin", "-F node3 -t 2 -V")
-
-			test.add_cmd_check_stdout("stonith_admin", "-H node3", "was able to turn off node node3", "")
-
-		# simple test of dynamic list query
-		for test_type in test_types:
-			test = self.new_test("%s_dynamic_list_query" % test_type["prefix"],
-					"Verify dynamic list of fencing devices can be retrieved.", test_type["use_cpg"])
-			test.add_cmd("stonith_admin", "-R true1  -a fence_dummy_list")
-			test.add_cmd("stonith_admin", "-R true2  -a fence_dummy_list")
-			test.add_cmd("stonith_admin", "-R true3  -a fence_dummy_list")
-
-			test.add_cmd_check_stdout("stonith_admin", "-l fake_port_1", "3 devices found")
-
-
-		# fence using dynamic list query
-		for test_type in test_types:
-			test = self.new_test("%s_fence_dynamic_list_query" % test_type["prefix"],
-					"Verify dynamic list of fencing devices can be retrieved.", test_type["use_cpg"])
-			test.add_cmd("stonith_admin", "-R true1  -a fence_dummy_list")
-			test.add_cmd("stonith_admin", "-R true2  -a fence_dummy_list")
-			test.add_cmd("stonith_admin", "-R true3  -a fence_dummy_list")
-
-			test.add_cmd("stonith_admin", "-F fake_port_1 -t 5 -V");
-
-		# simple test of  query using status action
-		for test_type in test_types:
-			test = self.new_test("%s_status_query" % test_type["prefix"],
-					"Verify dynamic list of fencing devices can be retrieved.", test_type["use_cpg"])
-			test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_check=status\"")
-			test.add_cmd("stonith_admin", "-R true2  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_check=status\"")
-			test.add_cmd("stonith_admin", "-R true3  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_check=status\"")
-
-			test.add_cmd_check_stdout("stonith_admin", "-l fake_port_1", "3 devices found")
-
-		# test what happens when no reboot action is advertised
-		for test_type in test_types:
-			test = self.new_test("%s_no_reboot_support" % test_type["prefix"],
-					"Verify reboot action defaults to off when no reboot action is advertised by agent.", test_type["use_cpg"])
-			test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_no_reboot -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
-			test.add_cmd("stonith_admin", "-B node1 -t 5 -V");
-			test.add_stonith_log_pattern("does not advertise support for 'reboot', performing 'off'")
-			test.add_stonith_log_pattern("with device 'true1' returned: 0 (OK)");
-
-		# make sure reboot is used when reboot action is advertised
-		for test_type in test_types:
-			test = self.new_test("%s_with_reboot_support" % test_type["prefix"],
-					"Verify reboot action can be used when metadata advertises it.", test_type["use_cpg"])
-			test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
-			test.add_cmd("stonith_admin", "-B node1 -t 5 -V");
-			test.add_stonith_negative_log_pattern("does not advertise support for 'reboot', performing 'off'")
-			test.add_stonith_log_pattern("with device 'true1' returned: 0 (OK)");
-
-	def build_nodeid_tests(self):
-		our_uname = output_from_command("uname -n")
-		if our_uname:
-			our_uname = our_uname[0]
-
-		### verify nodeid is supplied when nodeid is in the metadata parameters
-		test = self.new_test("cpg_supply_nodeid",
-				"Verify nodeid is given when fence agent has nodeid as parameter", 1)
-
-		test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname))
-		test.add_cmd("stonith_admin", "-F %s -t 3" % (our_uname))
-		test.add_stonith_log_pattern("For stonith action (off) for victim %s, adding nodeid" % (our_uname))
-
-		### verify nodeid is _NOT_ supplied when nodeid is not in the metadata parameters
-		test = self.new_test("cpg_do_not_supply_nodeid",
-				"Verify nodeid is _NOT_ given when fence agent does not have nodeid as parameter", 1)
-
-		test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname))
-		test.add_cmd("stonith_admin", "-F %s -t 3" % (our_uname))
-		test.add_stonith_negative_log_pattern("For stonith action (off) for victim %s, adding nodeid" % (our_uname))
-
-		### verify nodeid use doesn't explode standalone mode
-		test = self.new_test("standalone_do_not_supply_nodeid",
-				"Verify nodeid in metadata parameter list doesn't kill standalone mode", 0)
-
-		test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname))
-		test.add_cmd("stonith_admin", "-F %s -t 3" % (our_uname))
-		test.add_stonith_negative_log_pattern("For stonith action (off) for victim %s, adding nodeid" % (our_uname))
-
-
-	def build_unfence_tests(self):
-		our_uname = output_from_command("uname -n")
-		if our_uname:
-			our_uname = our_uname[0]
-
-		### verify unfencing using automatic unfencing
-		test = self.new_test("cpg_unfence_required_1",
-				"Verify require unfencing on all devices when automatic=true in agent's metadata", 1)
-		test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname))
-		test.add_cmd("stonith_admin", "-R true2 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname))
-		test.add_cmd("stonith_admin", "-U %s -t 3" % (our_uname))
-		# both devices should be executed
-		test.add_stonith_log_pattern("with device 'true1' returned: 0 (OK)");
-		test.add_stonith_log_pattern("with device 'true2' returned: 0 (OK)");
-
-
-		### verify unfencing using automatic unfencing fails if any of the required agents fail
-		test = self.new_test("cpg_unfence_required_2",
-				"Verify require unfencing on all devices when automatic=true in agent's metadata", 1)
-		test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname))
-		test.add_cmd("stonith_admin", "-R true2 -a fence_dummy_automatic_unfence -o \"mode=fail\" -o \"pcmk_host_list=%s\"" % (our_uname))
-		test.add_expected_fail_cmd("stonith_admin", "-U %s -t 6" % (our_uname), 143)
-
-		### verify unfencing using automatic devices with topology
-		test = self.new_test("cpg_unfence_required_3",
-				"Verify require unfencing on all devices even when required devices are at different topology levels", 1)
-		test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
-		test.add_cmd("stonith_admin", "-R true2 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
-		test.add_cmd("stonith_admin", "-r %s -i 1 -v true1" % (our_uname))
-		test.add_cmd("stonith_admin", "-r %s -i 2 -v true2" % (our_uname))
-		test.add_cmd("stonith_admin", "-U %s -t 3" % (our_uname))
-		test.add_stonith_log_pattern("with device 'true1' returned: 0 (OK)");
-		test.add_stonith_log_pattern("with device 'true2' returned: 0 (OK)");
-
-
-		### verify unfencing using automatic devices with topology
-		test = self.new_test("cpg_unfence_required_4",
-				"Verify all required devices are executed even with topology levels fail.", 1)
-		test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
-		test.add_cmd("stonith_admin", "-R true2 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
-		test.add_cmd("stonith_admin", "-R true3 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
-		test.add_cmd("stonith_admin", "-R true4 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
-		test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
-		test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
-		test.add_cmd("stonith_admin", "-R false3 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
-		test.add_cmd("stonith_admin", "-R false4 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
-		test.add_cmd("stonith_admin", "-r %s -i 1 -v true1" % (our_uname))
-		test.add_cmd("stonith_admin", "-r %s -i 1 -v false1" % (our_uname))
-		test.add_cmd("stonith_admin", "-r %s -i 2 -v false2" % (our_uname))
-		test.add_cmd("stonith_admin", "-r %s -i 2 -v true2" % (our_uname))
-		test.add_cmd("stonith_admin", "-r %s -i 2 -v false3" % (our_uname))
-		test.add_cmd("stonith_admin", "-r %s -i 2 -v true3" % (our_uname))
-		test.add_cmd("stonith_admin", "-r %s -i 3 -v false4" % (our_uname))
-		test.add_cmd("stonith_admin", "-r %s -i 4 -v true4" % (our_uname))
-		test.add_cmd("stonith_admin", "-U %s -t 3" % (our_uname))
-		test.add_stonith_log_pattern("with device 'true1' returned: 0 (OK)");
-		test.add_stonith_log_pattern("with device 'true2' returned: 0 (OK)");
-		test.add_stonith_log_pattern("with device 'true3' returned: 0 (OK)");
-		test.add_stonith_log_pattern("with device 'true4' returned: 0 (OK)");
-
-		### verify unfencing using on_target device
-		test = self.new_test("cpg_unfence_on_target_1",
-				"Verify unfencing with on_target = true", 1)
-		test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname))
-		test.add_cmd("stonith_admin", "-U %s -t 3" % (our_uname))
-		test.add_stonith_log_pattern("(on) to be executed on the target node")
-
-
-		### verify failure of unfencing using on_target device
-		test = self.new_test("cpg_unfence_on_target_2",
-				"Verify failure unfencing with on_target = true", 1)
-		test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node_fake_1234\"" % (our_uname))
-		test.add_expected_fail_cmd("stonith_admin", "-U node_fake_1234 -t 3", 237)
-		test.add_stonith_log_pattern("(on) to be executed on the target node")
-
-
-		### verify unfencing using on_target device with topology
-		test = self.new_test("cpg_unfence_on_target_3",
-				"Verify unfencing with on_target = true using topology", 1)
-
-		test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
-		test.add_cmd("stonith_admin", "-R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
-
-		test.add_cmd("stonith_admin", "-r %s -i 1 -v true1" % (our_uname))
-		test.add_cmd("stonith_admin", "-r %s -i 2 -v true2" % (our_uname))
-
-		test.add_cmd("stonith_admin", "-U %s -t 3" % (our_uname))
-		test.add_stonith_log_pattern("(on) to be executed on the target node")
-
-		### verify unfencing using on_target device with topology fails when victim node doesn't exist
-		test = self.new_test("cpg_unfence_on_target_4",
-				"Verify unfencing failure with on_target = true using topology", 1)
-
-		test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node_fake\"" % (our_uname))
-		test.add_cmd("stonith_admin", "-R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node_fake\"" % (our_uname))
-
-		test.add_cmd("stonith_admin", "-r node_fake -i 1 -v true1")
-		test.add_cmd("stonith_admin", "-r node_fake -i 2 -v true2")
-
-		test.add_expected_fail_cmd("stonith_admin", "-U node_fake -t 3", 237)
-		test.add_stonith_log_pattern("(on) to be executed on the target node")
-
-
-	def setup_environment(self, use_corosync):
-		if self.autogen_corosync_cfg and use_corosync:
-			corosync_conf = ("""
+    def __init__(self, verbose = 0):
+        self.tests = []
+        self.verbose = verbose
+        self.autogen_corosync_cfg = 0
+        if not os.path.exists("/etc/corosync/corosync.conf"):
+            self.autogen_corosync_cfg = 1
+
+    def new_test(self, name, description, with_cpg = 0):
+        test = Test(name, description, self.verbose, with_cpg)
+        self.tests.append(test)
+        return test
+
+    def print_list(self):
+        print "\n==== %d TESTS FOUND ====" % (len(self.tests))
+        print "%35s - %s" % ("TEST NAME", "TEST DESCRIPTION")
+        print "%35s - %s" % ("--------------------", "--------------------")
+        for test in self.tests:
+            print "%35s - %s" % (test.name, test.description)
+        print "==== END OF LIST ====\n"
+
+
+    def start_corosync(self):
+        if self.verbose:
+            print "Starting corosync"
+
+        test = subprocess.Popen("corosync", stdout=subprocess.PIPE)
+        test.wait()
+        time.sleep(10)
+
+    def stop_corosync(self):
+        cmd = shlex.split("killall -9 -q corosync")
+        test = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+        test.wait()
+
+    def run_single(self, name):
+        for test in self.tests:
+            if test.name == name:
+                test.run()
+                break;
+
+    def run_tests_matching(self, pattern):
+        for test in self.tests:
+            if test.name.count(pattern) != 0:
+                test.run()
+
+    def run_cpg_only(self):
+        for test in self.tests:
+            if test.enable_corosync:
+                test.run()
+
+    def run_no_cpg(self):
+        for test in self.tests:
+            if not test.enable_corosync:
+                test.run()
+
+    def run_tests(self):
+        for test in self.tests:
+            test.run()
+
+    def exit(self):
+        for test in self.tests:
+            if test.executed == 0:
+                continue
+
+            if test.get_exitcode() != 0:
+                sys.exit(-1)
+
+        sys.exit(0)
+
+    def print_results(self):
+        failures = 0;
+        success = 0;
+        print "\n\n======= FINAL RESULTS =========="
+        print "\n--- FAILURE RESULTS:"
+        for test in self.tests:
+            if test.executed == 0:
+                continue
+
+            if test.get_exitcode() != 0:
+                failures = failures + 1
+                test.print_result("    ")
+            else:
+                success = success + 1
+
+        if failures == 0:
+            print "    None"
+
+        print "\n--- TOTALS\n    Pass:%d\n    Fail:%d\n" % (success, failures)
+    def build_api_sanity_tests(self):
+        verbose_arg = ""
+        if self.verbose:
+            verbose_arg = "-V"
+
+        test = self.new_test("standalone_low_level_api_test", "Sanity test client api in standalone mode.")
+        test.add_cmd("@CRM_DAEMON_DIR@/stonith-test", "-t %s" % (verbose_arg))
+
+        test = self.new_test("cpg_low_level_api_test", "Sanity test client api using mainloop and cpg.", 1)
+        test.add_cmd("@CRM_DAEMON_DIR@/stonith-test", "-m %s" % (verbose_arg))
+
+    def build_custom_timeout_tests(self):
+        # custom timeout without topology
+        test = self.new_test("cpg_custom_timeout_1",
+                "Verify per device timeouts work as expected without using topology.", 1)
+        test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
+        test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\" -o \"pcmk_off_timeout=1\"")
+        test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\" -o \"pcmk_off_timeout=4\"")
+        test.add_cmd("stonith_admin", "-F node3 -t 2")
+        # timeout is 2+1+4 = 7
+        test.add_stonith_log_pattern("remote op timeout set to 7")
+
+        # custom timeout _WITH_ topology
+        test = self.new_test("cpg_custom_timeout_2",
+                "Verify per device timeouts work as expected _WITH_ topology.", 1)
+        test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
+        test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\" -o \"pcmk_off_timeout=1\"")
+        test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\" -o \"pcmk_off_timeout=4000\"")
+        test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1")
+        test.add_cmd("stonith_admin", "-r node3 -i 2 -v true1")
+        test.add_cmd("stonith_admin", "-r node3 -i 3 -v false2")
+        test.add_cmd("stonith_admin", "-F node3 -t 2")
+        # timeout is 2+1+4000 = 4003
+        test.add_stonith_log_pattern("remote op timeout set to 4003")
+
+    def build_fence_merge_tests(self):
+
+        ### Simple test that overlapping fencing operations get merged
+        test = self.new_test("cpg_custom_merge_single",
+                "Verify overlapping identical fencing operations are merged, no fencing levels used.", 1)
+        test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
+        test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\" ")
+        test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
+        test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10")
+        test.add_cmd("stonith_admin", "-F node3 -t 10")
+        ### one merger will happen
+        test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client")
+        ### the pattern below appears twice, signifying that both the original and the duplicate operation completed
+        test.add_stonith_log_pattern("Operation off of node3 by")
+        test.add_stonith_log_pattern("Operation off of node3 by")
+
+        ### Test that multiple mergers occur
+        test = self.new_test("cpg_custom_merge_multiple",
+                "Verify multiple overlapping identical fencing operations are merged", 1)
+        test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
+        test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"delay=2\" -o \"pcmk_host_list=node3\" ")
+        test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
+        test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10")
+        test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10")
+        test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10")
+        test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10")
+        test.add_cmd("stonith_admin", "-F node3 -t 10")
+        ### 4 mergers should occur
+        test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client")
+        test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client")
+        test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client")
+        test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client")
+        ### the pattern below appears five times, signifying that the original and all duplicate operations completed
+        test.add_stonith_log_pattern("Operation off of node3 by")
+        test.add_stonith_log_pattern("Operation off of node3 by")
+        test.add_stonith_log_pattern("Operation off of node3 by")
+        test.add_stonith_log_pattern("Operation off of node3 by")
+        test.add_stonith_log_pattern("Operation off of node3 by")
+
+        ### Test that multiple mergers occur with topologies used
+        test = self.new_test("cpg_custom_merge_with_topology",
+                "Verify multiple overlapping identical fencing operations are merged with fencing levels.", 1)
+        test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
+        test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\" ")
+        test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
+        test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1")
+        test.add_cmd("stonith_admin", "-r node3 -i 1 -v false2")
+        test.add_cmd("stonith_admin", "-r node3 -i 2 -v true1")
+        test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10")
+        test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10")
+        test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10")
+        test.add_cmd_no_wait("stonith_admin", "-F node3 -t 10")
+        test.add_cmd("stonith_admin", "-F node3 -t 10")
+        ### 4 mergers should occur
+        test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client")
+        test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client")
+        test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client")
+        test.add_stonith_log_pattern("Merging stonith action off for node node3 originating from client")
+        ### the pattern below appears five times, signifying that the original and all duplicate operations completed
+        test.add_stonith_log_pattern("Operation off of node3 by")
+        test.add_stonith_log_pattern("Operation off of node3 by")
+        test.add_stonith_log_pattern("Operation off of node3 by")
+        test.add_stonith_log_pattern("Operation off of node3 by")
+        test.add_stonith_log_pattern("Operation off of node3 by")
+
+
+        test = self.new_test("cpg_custom_no_merge",
+                "Verify differing fencing operations are not merged", 1)
+        test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3 node2\"")
+        test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3 node2\" ")
+        test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3 node2\"")
+        test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1")
+        test.add_cmd("stonith_admin", "-r node3 -i 1 -v false2")
+        test.add_cmd("stonith_admin", "-r node3 -i 2 -v true1")
+        test.add_cmd_no_wait("stonith_admin", "-F node2 -t 10")
+        test.add_cmd("stonith_admin", "-F node3 -t 10")
+        test.add_stonith_negative_log_pattern("Merging stonith action off for node node3 originating from client")
+
+    def build_standalone_tests(self):
+        test_types = [
+            {
+                "prefix" : "standalone" ,
+                "use_cpg" : 0,
+            },
+            {
+                "prefix" : "cpg" ,
+                "use_cpg" : 1,
+            },
+        ]
+
+        # test what happens when all devices timeout
+        for test_type in test_types:
+            test = self.new_test("%s_fence_multi_device_failure" % test_type["prefix"],
+                    "Verify that all devices timeout, a fencing failure is returned.", test_type["use_cpg"])
+            test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
+            test.add_cmd("stonith_admin", "-R false2  -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
+            test.add_cmd("stonith_admin", "-R false3 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
+            if test_type["use_cpg"] == 1:
+                test.add_expected_fail_cmd("stonith_admin", "-F node3 -t 2", 194)
+                test.add_stonith_log_pattern("remote op timeout set to 6")
+            else:
+                test.add_expected_fail_cmd("stonith_admin", "-F node3 -t 2", 55)
+
+            test.add_stonith_log_pattern("for host 'node3' with device 'false1' returned: ")
+            test.add_stonith_log_pattern("for host 'node3' with device 'false2' returned: ")
+            test.add_stonith_log_pattern("for host 'node3' with device 'false3' returned: ")
+
+        # test what happens when multiple devices can fence a node, but the first device fails.
+        for test_type in test_types:
+            test = self.new_test("%s_fence_device_failure_rollover" % test_type["prefix"],
+                    "Verify that when one fence device fails for a node, the others are tried.", test_type["use_cpg"])
+            test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
+            test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
+            test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
+            test.add_cmd("stonith_admin", "-F node3 -t 2")
+
+            if test_type["use_cpg"] == 1:
+                test.add_stonith_log_pattern("remote op timeout set to 6")
+
+        # simple topology test for one device
+        for test_type in test_types:
+            if test_type["use_cpg"] == 0:
+                continue
+
+            test = self.new_test("%s_topology_simple" % test_type["prefix"],
+                    "Verify all fencing devices at a level are used.", test_type["use_cpg"])
+            test.add_cmd("stonith_admin", "-R true  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
+
+            test.add_cmd("stonith_admin", "-r node3 -i 1 -v true")
+            test.add_cmd("stonith_admin", "-F node3 -t 2")
+
+            test.add_stonith_log_pattern("remote op timeout set to 2")
+            test.add_stonith_log_pattern("for host 'node3' with device 'true' returned: 0")
+
+
+        # add topology, delete topology, verify fencing still works 
+        for test_type in test_types:
+            if test_type["use_cpg"] == 0:
+                continue
+
+            test = self.new_test("%s_topology_add_remove" % test_type["prefix"],
+                    "Verify fencing occurrs after all topology levels are removed", test_type["use_cpg"])
+            test.add_cmd("stonith_admin", "-R true  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
+
+            test.add_cmd("stonith_admin", "-r node3 -i 1 -v true")
+            test.add_cmd("stonith_admin", "-d node3 -i 1")
+            test.add_cmd("stonith_admin", "-F node3 -t 2")
+
+            test.add_stonith_log_pattern("remote op timeout set to 2")
+            test.add_stonith_log_pattern("for host 'node3' with device 'true' returned: 0")
+
+        # test what happens when the first fencing level has multiple devices.
+        for test_type in test_types:
+            if test_type["use_cpg"] == 0:
+                continue
+
+            test = self.new_test("%s_topology_device_fails" % test_type["prefix"],
+                    "Verify if one device in a level fails, the other is tried.", test_type["use_cpg"])
+            test.add_cmd("stonith_admin", "-R false  -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
+            test.add_cmd("stonith_admin", "-R true  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
+
+            test.add_cmd("stonith_admin", "-r node3 -i 1 -v false")
+            test.add_cmd("stonith_admin", "-r node3 -i 2 -v true")
+            test.add_cmd("stonith_admin", "-F node3 -t 20")
+
+            test.add_stonith_log_pattern("remote op timeout set to 40")
+            test.add_stonith_log_pattern("for host 'node3' with device 'false' returned: -201")
+            test.add_stonith_log_pattern("for host 'node3' with device 'true' returned: 0")
+
+        # test what happens when the first fencing level fails.
+        for test_type in test_types:
+            if test_type["use_cpg"] == 0:
+                continue
+
+            test = self.new_test("%s_topology_multi_level_fails" % test_type["prefix"],
+                    "Verify if one level fails, the next leve is tried.", test_type["use_cpg"])
+            test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
+            test.add_cmd("stonith_admin", "-R true2  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
+            test.add_cmd("stonith_admin", "-R true3  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
+            test.add_cmd("stonith_admin", "-R true4  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
+            test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
+            test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
+
+            test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1")
+            test.add_cmd("stonith_admin", "-r node3 -i 1 -v true1")
+            test.add_cmd("stonith_admin", "-r node3 -i 2 -v true2")
+            test.add_cmd("stonith_admin", "-r node3 -i 2 -v false2")
+            test.add_cmd("stonith_admin", "-r node3 -i 3 -v true3")
+            test.add_cmd("stonith_admin", "-r node3 -i 3 -v true4")
+
+            test.add_cmd("stonith_admin", "-F node3 -t 3")
+
+            test.add_stonith_log_pattern("remote op timeout set to 18")
+            test.add_stonith_log_pattern("for host 'node3' with device 'false1' returned: -201")
+            test.add_stonith_log_pattern("for host 'node3' with device 'false2' returned: -201")
+            test.add_stonith_log_pattern("for host 'node3' with device 'true3' returned: 0")
+            test.add_stonith_log_pattern("for host 'node3' with device 'true4' returned: 0")
+
+
+        # test what happens when the first fencing level contains devices that no one has registered
+        for test_type in test_types:
+            if test_type["use_cpg"] == 0:
+                continue
+
+            test = self.new_test("%s_topology_missing_devices" % test_type["prefix"],
+                    "Verify topology can continue with missing devices.", test_type["use_cpg"])
+            test.add_cmd("stonith_admin", "-R true2  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
+            test.add_cmd("stonith_admin", "-R true3  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
+            test.add_cmd("stonith_admin", "-R true4  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
+            test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
+
+            test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1")
+            test.add_cmd("stonith_admin", "-r node3 -i 1 -v true1")
+            test.add_cmd("stonith_admin", "-r node3 -i 2 -v true2")
+            test.add_cmd("stonith_admin", "-r node3 -i 2 -v false2")
+            test.add_cmd("stonith_admin", "-r node3 -i 3 -v true3")
+            test.add_cmd("stonith_admin", "-r node3 -i 3 -v true4")
+
+            test.add_cmd("stonith_admin", "-F node3 -t 2")
+
+        # Test what happens if multiple fencing levels are defined, and then the first one is removed.
+        for test_type in test_types:
+            if test_type["use_cpg"] == 0:
+                continue
+
+            test = self.new_test("%s_topology_level_removal" % test_type["prefix"],
+                    "Verify level removal works.", test_type["use_cpg"])
+            test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
+            test.add_cmd("stonith_admin", "-R true2  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
+            test.add_cmd("stonith_admin", "-R true3  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
+            test.add_cmd("stonith_admin", "-R true4  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
+            test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
+            test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node1 node2 node3\"")
+
+            test.add_cmd("stonith_admin", "-r node3 -i 1 -v false1")
+            test.add_cmd("stonith_admin", "-r node3 -i 1 -v true1")
+
+            test.add_cmd("stonith_admin", "-r node3 -i 2 -v true2")
+            test.add_cmd("stonith_admin", "-r node3 -i 2 -v false2")
+
+            test.add_cmd("stonith_admin", "-r node3 -i 3 -v true3")
+            test.add_cmd("stonith_admin", "-r node3 -i 3 -v true4")
+
+            # Now remove level 2, verify none of the devices in level two are hit.
+            test.add_cmd("stonith_admin", "-d node3 -i 2")
+
+            test.add_cmd("stonith_admin", "-F node3 -t 20")
+
+            test.add_stonith_log_pattern("remote op timeout set to 8")
+            test.add_stonith_log_pattern("for host 'node3' with device 'false1' returned: -201")
+            test.add_stonith_negative_log_pattern("for host 'node3' with device 'false2' returned: ")
+            test.add_stonith_log_pattern("for host 'node3' with device 'true3' returned: 0")
+            test.add_stonith_log_pattern("for host 'node3' with device 'true4' returned: 0")
+
+        # test that the stonith daemon builds the correct list of devices that can fence a node.
+        for test_type in test_types:
+            test = self.new_test("%s_list_devices" % test_type["prefix"],
+                    "Verify list of devices that can fence a node is correct", test_type["use_cpg"])
+            test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"")
+            test.add_cmd("stonith_admin", "-R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
+            test.add_cmd("stonith_admin", "-R true3 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
+
+            test.add_cmd_check_stdout("stonith_admin", "-l node1 -V", "true2", "true1")
+            test.add_cmd_check_stdout("stonith_admin", "-l node1 -V", "true3", "true1")
+
+        # simple test of device monitor
+        for test_type in test_types:
+            test = self.new_test("%s_monitor" % test_type["prefix"],
+                    "Verify device is reachable", test_type["use_cpg"])
+            test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"")
+            test.add_cmd("stonith_admin", "-R false1  -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=node3\"")
+
+            test.add_cmd("stonith_admin", "-Q true1")
+            test.add_cmd("stonith_admin", "-Q false1")
+            test.add_expected_fail_cmd("stonith_admin", "-Q true2", 237)
+
+        # Verify monitor occurs for duration of timeout period on failure
+        for test_type in test_types:
+            test = self.new_test("%s_monitor_timeout" % test_type["prefix"],
+                    "Verify monitor uses duration of timeout period given.", test_type["use_cpg"])
+            test.add_cmd("stonith_admin", "-R true1  -a fence_dummy_monitor_fail -o \"pcmk_host_list=node3\"")
+            test.add_expected_fail_cmd("stonith_admin", "-Q true1 -t 5", 195)
+            test.add_stonith_log_pattern("Attempt 2 to execute")
+
+        # Verify monitor occurs for duration of timeout period on failure, but stops at max retries
+        for test_type in test_types:
+            test = self.new_test("%s_monitor_timeout_max_retries" % test_type["prefix"],
+                    "Verify monitor retries until max retry value or timeout is hit.", test_type["use_cpg"])
+            test.add_cmd("stonith_admin", "-R true1  -a fence_dummy_monitor_fail -o \"pcmk_host_list=node3\"")
+            test.add_expected_fail_cmd("stonith_admin", "-Q true1 -t 15",195)
+            test.add_stonith_log_pattern("Attempted to execute agent fence_dummy_monitor_fail (list) the maximum number of times")
+
+        # simple register test
+        for test_type in test_types:
+            test = self.new_test("%s_register" % test_type["prefix"],
+                    "Verify devices can be registered and un-registered", test_type["use_cpg"])
+            test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"")
+
+            test.add_cmd("stonith_admin", "-Q true1")
+
+            test.add_cmd("stonith_admin", "-D true1")
+
+            test.add_expected_fail_cmd("stonith_admin", "-Q true1", 237)
+
+
+        # simple reboot test
+        for test_type in test_types:
+            test = self.new_test("%s_reboot" % test_type["prefix"],
+                    "Verify devices can be rebooted", test_type["use_cpg"])
+            test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"")
+
+            test.add_cmd("stonith_admin", "-B node3 -t 2")
+
+            test.add_cmd("stonith_admin", "-D true1")
+
+            test.add_expected_fail_cmd("stonith_admin", "-Q true1", 237)
+
+        # test fencing history.
+        for test_type in test_types:
+            if test_type["use_cpg"] == 0:
+                continue
+            test = self.new_test("%s_fence_history" % test_type["prefix"],
+                    "Verify last fencing operation is returned.", test_type["use_cpg"])
+            test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node3\"")
+
+            test.add_cmd("stonith_admin", "-F node3 -t 2 -V")
+
+            test.add_cmd_check_stdout("stonith_admin", "-H node3", "was able to turn off node node3", "")
+
+        # simple test of dynamic list query
+        for test_type in test_types:
+            test = self.new_test("%s_dynamic_list_query" % test_type["prefix"],
+                    "Verify dynamic list of fencing devices can be retrieved.", test_type["use_cpg"])
+            test.add_cmd("stonith_admin", "-R true1  -a fence_dummy_list")
+            test.add_cmd("stonith_admin", "-R true2  -a fence_dummy_list")
+            test.add_cmd("stonith_admin", "-R true3  -a fence_dummy_list")
+
+            test.add_cmd_check_stdout("stonith_admin", "-l fake_port_1", "3 devices found")
+
+
+        # fence using dynamic list query
+        for test_type in test_types:
+            test = self.new_test("%s_fence_dynamic_list_query" % test_type["prefix"],
+                    "Verify dynamic list of fencing devices can be retrieved.", test_type["use_cpg"])
+            test.add_cmd("stonith_admin", "-R true1  -a fence_dummy_list")
+            test.add_cmd("stonith_admin", "-R true2  -a fence_dummy_list")
+            test.add_cmd("stonith_admin", "-R true3  -a fence_dummy_list")
+
+            test.add_cmd("stonith_admin", "-F fake_port_1 -t 5 -V");
+
+        # simple test of query using the status action
+        for test_type in test_types:
+            test = self.new_test("%s_status_query" % test_type["prefix"],
+                    "Verify dynamic list of fencing devices can be retrieved.", test_type["use_cpg"])
+            test.add_cmd("stonith_admin", "-R true1  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_check=status\"")
+            test.add_cmd("stonith_admin", "-R true2  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_check=status\"")
+            test.add_cmd("stonith_admin", "-R true3  -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_check=status\"")
+
+            test.add_cmd_check_stdout("stonith_admin", "-l fake_port_1", "3 devices found")
+
+        # test what happens when no reboot action is advertised
+        for test_type in test_types:
+            test = self.new_test("%s_no_reboot_support" % test_type["prefix"],
+                    "Verify reboot action defaults to off when no reboot action is advertised by agent.", test_type["use_cpg"])
+            test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_no_reboot -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
+            test.add_cmd("stonith_admin", "-B node1 -t 5 -V");
+            test.add_stonith_log_pattern("does not advertise support for 'reboot', performing 'off'")
+            test.add_stonith_log_pattern("with device 'true1' returned: 0 (OK)");
+
+        # make sure reboot is used when reboot action is advertised
+        for test_type in test_types:
+            test = self.new_test("%s_with_reboot_support" % test_type["prefix"],
+                    "Verify reboot action can be used when metadata advertises it.", test_type["use_cpg"])
+            test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=node1 node2 node3\"")
+            test.add_cmd("stonith_admin", "-B node1 -t 5 -V");
+            test.add_stonith_negative_log_pattern("does not advertise support for 'reboot', performing 'off'")
+            test.add_stonith_log_pattern("with device 'true1' returned: 0 (OK)");
+
+    def build_nodeid_tests(self):
+        our_uname = output_from_command("uname -n")
+        if our_uname:
+            our_uname = our_uname[0]
+
+        ### verify nodeid is supplied when nodeid is in the metadata parameters
+        test = self.new_test("cpg_supply_nodeid",
+                "Verify nodeid is given when fence agent has nodeid as parameter", 1)
+
+        test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname))
+        test.add_cmd("stonith_admin", "-F %s -t 3" % (our_uname))
+        test.add_stonith_log_pattern("For stonith action (off) for victim %s, adding nodeid" % (our_uname))
+
+        ### verify nodeid is _NOT_ supplied when nodeid is not in the metadata parameters
+        test = self.new_test("cpg_do_not_supply_nodeid",
+                "Verify nodeid is _NOT_ given when fence agent does not have nodeid as parameter", 1)
+
+        test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname))
+        test.add_cmd("stonith_admin", "-F %s -t 3" % (our_uname))
+        test.add_stonith_negative_log_pattern("For stonith action (off) for victim %s, adding nodeid" % (our_uname))
+
+        ### verify nodeid use doesn't explode standalone mode
+        test = self.new_test("standalone_do_not_supply_nodeid",
+                "Verify nodeid in metadata parameter list doesn't kill standalone mode", 0)
+
+        test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname))
+        test.add_cmd("stonith_admin", "-F %s -t 3" % (our_uname))
+        test.add_stonith_negative_log_pattern("For stonith action (off) for victim %s, adding nodeid" % (our_uname))
+
+
+    def build_unfence_tests(self):
+        our_uname = output_from_command("uname -n")
+        if our_uname:
+            our_uname = our_uname[0]
+
+        ### verify unfencing using automatic unfencing
+        test = self.new_test("cpg_unfence_required_1",
+                "Verify require unfencing on all devices when automatic=true in agent's metadata", 1)
+        test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname))
+        test.add_cmd("stonith_admin", "-R true2 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname))
+        test.add_cmd("stonith_admin", "-U %s -t 3" % (our_uname))
+        # both devices should be executed
+        test.add_stonith_log_pattern("with device 'true1' returned: 0 (OK)");
+        test.add_stonith_log_pattern("with device 'true2' returned: 0 (OK)");
+
+
+        ### verify unfencing using automatic unfencing fails if any of the required agents fail
+        test = self.new_test("cpg_unfence_required_2",
+                "Verify require unfencing on all devices when automatic=true in agent's metadata", 1)
+        test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname))
+        test.add_cmd("stonith_admin", "-R true2 -a fence_dummy_automatic_unfence -o \"mode=fail\" -o \"pcmk_host_list=%s\"" % (our_uname))
+        test.add_expected_fail_cmd("stonith_admin", "-U %s -t 6" % (our_uname), 143)
+
+        ### verify unfencing using automatic devices with topology
+        test = self.new_test("cpg_unfence_required_3",
+                "Verify require unfencing on all devices even when required devices are at different topology levels", 1)
+        test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
+        test.add_cmd("stonith_admin", "-R true2 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
+        test.add_cmd("stonith_admin", "-r %s -i 1 -v true1" % (our_uname))
+        test.add_cmd("stonith_admin", "-r %s -i 2 -v true2" % (our_uname))
+        test.add_cmd("stonith_admin", "-U %s -t 3" % (our_uname))
+        test.add_stonith_log_pattern("with device 'true1' returned: 0 (OK)");
+        test.add_stonith_log_pattern("with device 'true2' returned: 0 (OK)");
+
+
+        ### verify unfencing using automatic devices with topology
+        test = self.new_test("cpg_unfence_required_4",
+                "Verify all required devices are executed even with topology levels fail.", 1)
+        test.add_cmd("stonith_admin", "-R true1 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
+        test.add_cmd("stonith_admin", "-R true2 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
+        test.add_cmd("stonith_admin", "-R true3 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
+        test.add_cmd("stonith_admin", "-R true4 -a fence_dummy_automatic_unfence -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
+        test.add_cmd("stonith_admin", "-R false1 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
+        test.add_cmd("stonith_admin", "-R false2 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
+        test.add_cmd("stonith_admin", "-R false3 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
+        test.add_cmd("stonith_admin", "-R false4 -a fence_dummy -o \"mode=fail\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
+        test.add_cmd("stonith_admin", "-r %s -i 1 -v true1" % (our_uname))
+        test.add_cmd("stonith_admin", "-r %s -i 1 -v false1" % (our_uname))
+        test.add_cmd("stonith_admin", "-r %s -i 2 -v false2" % (our_uname))
+        test.add_cmd("stonith_admin", "-r %s -i 2 -v true2" % (our_uname))
+        test.add_cmd("stonith_admin", "-r %s -i 2 -v false3" % (our_uname))
+        test.add_cmd("stonith_admin", "-r %s -i 2 -v true3" % (our_uname))
+        test.add_cmd("stonith_admin", "-r %s -i 3 -v false4" % (our_uname))
+        test.add_cmd("stonith_admin", "-r %s -i 4 -v true4" % (our_uname))
+        test.add_cmd("stonith_admin", "-U %s -t 3" % (our_uname))
+        test.add_stonith_log_pattern("with device 'true1' returned: 0 (OK)");
+        test.add_stonith_log_pattern("with device 'true2' returned: 0 (OK)");
+        test.add_stonith_log_pattern("with device 'true3' returned: 0 (OK)");
+        test.add_stonith_log_pattern("with device 'true4' returned: 0 (OK)");
+
+        ### verify unfencing using on_target device
+        test = self.new_test("cpg_unfence_on_target_1",
+                "Verify unfencing with on_target = true", 1)
+        test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s\"" % (our_uname))
+        test.add_cmd("stonith_admin", "-U %s -t 3" % (our_uname))
+        test.add_stonith_log_pattern("(on) to be executed on the target node")
+
+
+        ### verify failure of unfencing using on_target device
+        test = self.new_test("cpg_unfence_on_target_2",
+                "Verify failure unfencing with on_target = true", 1)
+        test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node_fake_1234\"" % (our_uname))
+        test.add_expected_fail_cmd("stonith_admin", "-U node_fake_1234 -t 3", 237)
+        test.add_stonith_log_pattern("(on) to be executed on the target node")
+
+
+        ### verify unfencing using on_target device with topology
+        test = self.new_test("cpg_unfence_on_target_3",
+                "Verify unfencing with on_target = true using topology", 1)
+
+        test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
+        test.add_cmd("stonith_admin", "-R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node3\"" % (our_uname))
+
+        test.add_cmd("stonith_admin", "-r %s -i 1 -v true1" % (our_uname))
+        test.add_cmd("stonith_admin", "-r %s -i 2 -v true2" % (our_uname))
+
+        test.add_cmd("stonith_admin", "-U %s -t 3" % (our_uname))
+        test.add_stonith_log_pattern("(on) to be executed on the target node")
+
+        ### verify unfencing using on_target device with topology fails when victim node doesn't exist
+        test = self.new_test("cpg_unfence_on_target_4",
+                "Verify unfencing failure with on_target = true using topology", 1)
+
+        test.add_cmd("stonith_admin", "-R true1 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node_fake\"" % (our_uname))
+        test.add_cmd("stonith_admin", "-R true2 -a fence_dummy -o \"mode=pass\" -o \"pcmk_host_list=%s node_fake\"" % (our_uname))
+
+        test.add_cmd("stonith_admin", "-r node_fake -i 1 -v true1")
+        test.add_cmd("stonith_admin", "-r node_fake -i 2 -v true2")
+
+        test.add_expected_fail_cmd("stonith_admin", "-U node_fake -t 3", 237)
+        test.add_stonith_log_pattern("(on) to be executed on the target node")
+
+    def build_remap_tests(self):
+        test = self.new_test("cpg_remap_simple",
+                             "Verify sequential topology reboot is remapped to all-off-then-all-on", 1)
+        test.add_cmd("stonith_admin",
+                     """-R true1 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """
+                     """-o "pcmk_off_timeout=1" -o "pcmk_reboot_timeout=10" """)
+        test.add_cmd("stonith_admin",
+                     """-R true2 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """
+                     """-o "pcmk_off_timeout=2" -o "pcmk_reboot_timeout=20" """)
+        test.add_cmd("stonith_admin", "-r node_fake -i 1 -v true1 -v true2")
+        test.add_cmd("stonith_admin", "-B node_fake -t 5")
+        test.add_stonith_log_pattern("Remapping multiple-device reboot of node_fake")
+        # timeout should be sum of off timeouts (1+2=3), not reboot timeouts (10+20=30)
+        test.add_stonith_log_pattern("remote op timeout set to 3 for fencing of node node_fake")
+        test.add_stonith_log_pattern("perform op off node_fake with true1")
+        test.add_stonith_log_pattern("perform op off node_fake with true2")
+        test.add_stonith_log_pattern("Remapped off of node_fake complete, remapping to on")
+        # fence_dummy sets "on" as an on_target action
+        test.add_stonith_log_pattern("Ignoring true1 'on' failure (no capable peers) for node_fake")
+        test.add_stonith_log_pattern("Ignoring true2 'on' failure (no capable peers) for node_fake")
+        test.add_stonith_log_pattern("Undoing remap of reboot of node_fake")
+
+        test = self.new_test("cpg_remap_automatic",
+                             "Verify remapped topology reboot skips automatic 'on'", 1)
+        test.add_cmd("stonith_admin",
+                     """-R true1 -a fence_dummy_automatic_unfence """
+                     """-o "mode=pass" -o "pcmk_host_list=node_fake" """)
+        test.add_cmd("stonith_admin",
+                     """-R true2 -a fence_dummy_automatic_unfence """
+                     """-o "mode=pass" -o "pcmk_host_list=node_fake" """)
+        test.add_cmd("stonith_admin", "-r node_fake -i 1 -v true1 -v true2")
+        test.add_cmd("stonith_admin", "-B node_fake -t 5")
+        test.add_stonith_log_pattern("Remapping multiple-device reboot of node_fake")
+        test.add_stonith_log_pattern("perform op off node_fake with true1")
+        test.add_stonith_log_pattern("perform op off node_fake with true2")
+        test.add_stonith_log_pattern("Remapped off of node_fake complete, remapping to on")
+        test.add_stonith_log_pattern("Undoing remap of reboot of node_fake")
+        test.add_stonith_negative_log_pattern("perform op on node_fake with")
+        test.add_stonith_negative_log_pattern("'on' failure")
+
+        test = self.new_test("cpg_remap_complex_1",
+                "Verify remapped topology reboot in second level works if non-remapped first level fails", 1)
+        test.add_cmd("stonith_admin", """-R false1 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node_fake" """)
+        test.add_cmd("stonith_admin", """-R true1 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """)
+        test.add_cmd("stonith_admin", """-R true2 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """)
+        test.add_cmd("stonith_admin", "-r node_fake -i 1 -v false1")
+        test.add_cmd("stonith_admin", "-r node_fake -i 2 -v true1 -v true2")
+        test.add_cmd("stonith_admin", "-B node_fake -t 5")
+        test.add_stonith_log_pattern("perform op reboot node_fake with false1")
+        test.add_stonith_log_pattern("Remapping multiple-device reboot of node_fake")
+        test.add_stonith_log_pattern("perform op off node_fake with true1")
+        test.add_stonith_log_pattern("perform op off node_fake with true2")
+        test.add_stonith_log_pattern("Remapped off of node_fake complete, remapping to on")
+        test.add_stonith_log_pattern("Ignoring true1 'on' failure (no capable peers) for node_fake")
+        test.add_stonith_log_pattern("Ignoring true2 'on' failure (no capable peers) for node_fake")
+        test.add_stonith_log_pattern("Undoing remap of reboot of node_fake")
+
+        test = self.new_test("cpg_remap_complex_2",
+                "Verify remapped topology reboot failure in second level proceeds to third level", 1)
+        test.add_cmd("stonith_admin", """-R false1 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node_fake" """)
+        test.add_cmd("stonith_admin", """-R false2 -a fence_dummy -o "mode=fail" -o "pcmk_host_list=node_fake" """)
+        test.add_cmd("stonith_admin", """-R true1 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """)
+        test.add_cmd("stonith_admin", """-R true2 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """)
+        test.add_cmd("stonith_admin", """-R true3 -a fence_dummy -o "mode=pass" -o "pcmk_host_list=node_fake" """)
+        test.add_cmd("stonith_admin", "-r node_fake -i 1 -v false1")
+        test.add_cmd("stonith_admin", "-r node_fake -i 2 -v true1 -v false2 -v true3")
+        test.add_cmd("stonith_admin", "-r node_fake -i 3 -v true2")
+        test.add_cmd("stonith_admin", "-B node_fake -t 5")
+        test.add_stonith_log_pattern("perform op reboot node_fake with false1")
+        test.add_stonith_log_pattern("Remapping multiple-device reboot of node_fake")
+        test.add_stonith_log_pattern("perform op off node_fake with true1")
+        test.add_stonith_log_pattern("perform op off node_fake with false2")
+        test.add_stonith_log_pattern("Attempted to execute agent fence_dummy (off) the maximum number of times")
+        test.add_stonith_log_pattern("Undoing remap of reboot of node_fake")
+        test.add_stonith_log_pattern("perform op reboot node_fake with true2")
+        test.add_stonith_negative_log_pattern("node_fake with true3")
+
+    def setup_environment(self, use_corosync):
+        if self.autogen_corosync_cfg and use_corosync:
+            corosync_conf = ("""
 totem {
         version: 2
         crypto_cipher: none
@@ -908,15 +984,15 @@ logging {
 }
 """)
 
-			os.system("cat <<-END >>/etc/corosync/corosync.conf\n%s\nEND" % (corosync_conf))
+            os.system("cat <<-END >>/etc/corosync/corosync.conf\n%s\nEND" % (corosync_conf))
 
 
-		if use_corosync:
-			### make sure we are in control ###
-			self.stop_corosync()
-			self.start_corosync()
+        if use_corosync:
+            ### make sure we are in control ###
+            self.stop_corosync()
+            self.start_corosync()
 
-		monitor_fail_agent = ("""#!/usr/bin/python
+        monitor_fail_agent = ("""#!/usr/bin/python
 import sys
 def main():
     for line in sys.stdin.readlines():
@@ -927,7 +1003,7 @@ if __name__ == "__main__":
     main()
 """)
 
-		dynamic_list_agent = ("""#!/usr/bin/python
+        dynamic_list_agent = ("""#!/usr/bin/python
 import sys
 def main():
     for line in sys.stdin.readlines():
@@ -942,140 +1018,141 @@ if __name__ == "__main__":
 """)
 
 
-		os.system("cat <<-END >>/usr/sbin/fence_dummy_list\n%s\nEND" % (dynamic_list_agent))
-		os.system("chmod 711 /usr/sbin/fence_dummy_list")
+        os.system("cat <<-END >>/usr/sbin/fence_dummy_list\n%s\nEND" % (dynamic_list_agent))
+        os.system("chmod 711 /usr/sbin/fence_dummy_list")
 
-		os.system("cat <<-END >>/usr/sbin/fence_dummy_monitor_fail\n%s\nEND" % (monitor_fail_agent))
-		os.system("chmod 711 /usr/sbin/fence_dummy_monitor_fail")
+        os.system("cat <<-END >>/usr/sbin/fence_dummy_monitor_fail\n%s\nEND" % (monitor_fail_agent))
+        os.system("chmod 711 /usr/sbin/fence_dummy_monitor_fail")
 
-		os.system("cp /usr/share/pacemaker/tests/cts/fence_dummy /usr/sbin/fence_dummy")
+        os.system("cp /usr/share/pacemaker/tests/cts/fence_dummy /usr/sbin/fence_dummy")
 
-		# modifies dummy agent to do require unfencing 
-		os.system("cat /usr/share/pacemaker/tests/cts/fence_dummy  | sed 's/on_target=/automatic=/g' > /usr/sbin/fence_dummy_automatic_unfence");
-		os.system("chmod 711 /usr/sbin/fence_dummy_automatic_unfence")
+        # modify dummy agent to require unfencing
+        os.system("cat /usr/share/pacemaker/tests/cts/fence_dummy  | sed 's/on_target=/automatic=/g' > /usr/sbin/fence_dummy_automatic_unfence");
+        os.system("chmod 711 /usr/sbin/fence_dummy_automatic_unfence")
 
-		# modifies dummy agent to not advertise reboot
-		os.system("cat /usr/share/pacemaker/tests/cts/fence_dummy  | sed 's/^.*<action.*name.*reboot.*>.*//g' > /usr/sbin/fence_dummy_no_reboot");
-		os.system("chmod 711 /usr/sbin/fence_dummy_no_reboot")
+        # modifies dummy agent to not advertise reboot
+        os.system("cat /usr/share/pacemaker/tests/cts/fence_dummy  | sed 's/^.*<action.*name.*reboot.*>.*//g' > /usr/sbin/fence_dummy_no_reboot");
+        os.system("chmod 711 /usr/sbin/fence_dummy_no_reboot")
 
-	def cleanup_environment(self, use_corosync):
-		if use_corosync:
-			self.stop_corosync()
+    def cleanup_environment(self, use_corosync):
+        if use_corosync:
+            self.stop_corosync()
 
-			if self.verbose and os.path.exists('/var/log/corosync.log'):
-				print "Corosync output"
-				f = open('/var/log/corosync.log', 'r')
-				for line in f.readlines():
-					print line.strip()
-				os.remove('/var/log/corosync.log')
+            if self.verbose and os.path.exists('/var/log/corosync.log'):
+                print "Corosync output"
+                f = open('/var/log/corosync.log', 'r')
+                for line in f.readlines():
+                    print line.strip()
+                os.remove('/var/log/corosync.log')
 
-		if self.autogen_corosync_cfg:
-			os.system("rm -f /etc/corosync/corosync.conf")
+        if self.autogen_corosync_cfg:
+            os.system("rm -f /etc/corosync/corosync.conf")
 
-		os.system("rm -f /usr/sbin/fence_dummy_monitor_fail")
-		os.system("rm -f /usr/sbin/fence_dummy_list")
-		os.system("rm -f /usr/sbin/fence_dummy")
-		os.system("rm -f /usr/sbin/fence_dummy_automatic_unfence")
-		os.system("rm -f /usr/sbin/fence_dummy_no_reboot")
+        os.system("rm -f /usr/sbin/fence_dummy_monitor_fail")
+        os.system("rm -f /usr/sbin/fence_dummy_list")
+        os.system("rm -f /usr/sbin/fence_dummy")
+        os.system("rm -f /usr/sbin/fence_dummy_automatic_unfence")
+        os.system("rm -f /usr/sbin/fence_dummy_no_reboot")
 
 class TestOptions:
-	def __init__(self):
-		self.options = {}
-		self.options['list-tests'] = 0
-		self.options['run-all'] = 1
-		self.options['run-only'] = ""
-		self.options['run-only-pattern'] = ""
-		self.options['verbose'] = 0
-		self.options['invalid-arg'] = ""
-		self.options['cpg-only'] = 0
-		self.options['no-cpg'] = 0
-		self.options['show-usage'] = 0
-
-	def build_options(self, argv):
-		args = argv[1:]
-		skip = 0
-		for i in range(0, len(args)):
-			if skip:
-				skip = 0
-				continue
-			elif args[i] == "-h" or args[i] == "--help":
-				self.options['show-usage'] = 1
-			elif args[i] == "-l" or args[i] == "--list-tests":
-				self.options['list-tests'] = 1
-			elif args[i] == "-V" or args[i] == "--verbose":
-				self.options['verbose'] = 1
-			elif args[i] == "-n" or args[i] == "--no-cpg":
-				self.options['no-cpg'] = 1
-			elif args[i] == "-c" or args[i] == "--cpg-only":
-				self.options['cpg-only'] = 1
-			elif args[i] == "-r" or args[i] == "--run-only":
-				self.options['run-only'] = args[i+1]
-				skip = 1
-			elif args[i] == "-p" or args[i] == "--run-only-pattern":
-				self.options['run-only-pattern'] = args[i+1]
-				skip = 1
-
-	def show_usage(self):
-		print "usage: " + sys.argv[0] + " [options]"
-		print "If no options are provided, all tests will run"
-		print "Options:"
-		print "\t [--help | -h]                        Show usage"
-		print "\t [--list-tests | -l]                  Print out all registered tests."
-		print "\t [--cpg-only | -c]                    Only run tests that require corosync."
-		print "\t [--no-cpg | -n]                      Only run tests that do not require corosync"
-		print "\t [--run-only | -r 'testname']         Run a specific test"
-		print "\t [--verbose | -V]                     Verbose output"
-		print "\t [--run-only-pattern | -p 'string']   Run only tests containing the string value"
-		print "\n\tExample: Run only the test 'start_top'"
-		print "\t\t python ./regression.py --run-only start_stop"
-		print "\n\tExample: Run only the tests with the string 'systemd' present in them"
-		print "\t\t python ./regression.py --run-only-pattern systemd"
+    def __init__(self):
+        self.options = {}
+        self.options['list-tests'] = 0
+        self.options['run-all'] = 1
+        self.options['run-only'] = ""
+        self.options['run-only-pattern'] = ""
+        self.options['verbose'] = 0
+        self.options['invalid-arg'] = ""
+        self.options['cpg-only'] = 0
+        self.options['no-cpg'] = 0
+        self.options['show-usage'] = 0
+
+    def build_options(self, argv):
+        args = argv[1:]
+        skip = 0
+        for i in range(0, len(args)):
+            if skip:
+                skip = 0
+                continue
+            elif args[i] == "-h" or args[i] == "--help":
+                self.options['show-usage'] = 1
+            elif args[i] == "-l" or args[i] == "--list-tests":
+                self.options['list-tests'] = 1
+            elif args[i] == "-V" or args[i] == "--verbose":
+                self.options['verbose'] = 1
+            elif args[i] == "-n" or args[i] == "--no-cpg":
+                self.options['no-cpg'] = 1
+            elif args[i] == "-c" or args[i] == "--cpg-only":
+                self.options['cpg-only'] = 1
+            elif args[i] == "-r" or args[i] == "--run-only":
+                self.options['run-only'] = args[i+1]
+                skip = 1
+            elif args[i] == "-p" or args[i] == "--run-only-pattern":
+                self.options['run-only-pattern'] = args[i+1]
+                skip = 1
+
+    def show_usage(self):
+        print "usage: " + sys.argv[0] + " [options]"
+        print "If no options are provided, all tests will run"
+        print "Options:"
+        print "\t [--help | -h]                        Show usage"
+        print "\t [--list-tests | -l]                  Print out all registered tests."
+        print "\t [--cpg-only | -c]                    Only run tests that require corosync."
+        print "\t [--no-cpg | -n]                      Only run tests that do not require corosync"
+        print "\t [--run-only | -r 'testname']         Run a specific test"
+        print "\t [--verbose | -V]                     Verbose output"
+        print "\t [--run-only-pattern | -p 'string']   Run only tests containing the string value"
+        print "\n\tExample: Run only the test 'start_top'"
+        print "\t\t python ./regression.py --run-only start_stop"
+        print "\n\tExample: Run only the tests with the string 'systemd' present in them"
+        print "\t\t python ./regression.py --run-only-pattern systemd"
 
 def main(argv):
-	o = TestOptions()
-	o.build_options(argv)
-
-	use_corosync = 1
-
-	tests = Tests(o.options['verbose'])
-	tests.build_standalone_tests()
-	tests.build_custom_timeout_tests()
-	tests.build_api_sanity_tests()
-	tests.build_fence_merge_tests()
-	tests.build_unfence_tests()
-	tests.build_nodeid_tests()
-
-	if o.options['list-tests']:
-		tests.print_list()
-		sys.exit(0)
-	elif o.options['show-usage']:
-		o.show_usage()
-		sys.exit(0)
-
-	print "Starting ..."
-
-	if o.options['no-cpg']:
-		use_corosync = 0
-
-	tests.setup_environment(use_corosync)
-
-	if o.options['run-only-pattern'] != "":
-		tests.run_tests_matching(o.options['run-only-pattern'])
-		tests.print_results()
-	elif o.options['run-only'] != "":
-		tests.run_single(o.options['run-only'])
-		tests.print_results()
-	elif o.options['no-cpg']:
-		tests.run_no_cpg()
-		tests.print_results()
-	elif o.options['cpg-only']:
-		tests.run_cpg_only()
-		tests.print_results()
-	else:
-		tests.run_tests()
-		tests.print_results()
-
-	tests.cleanup_environment(use_corosync)
-	tests.exit()
+    o = TestOptions()
+    o.build_options(argv)
+
+    use_corosync = 1
+
+    tests = Tests(o.options['verbose'])
+    tests.build_standalone_tests()
+    tests.build_custom_timeout_tests()
+    tests.build_api_sanity_tests()
+    tests.build_fence_merge_tests()
+    tests.build_unfence_tests()
+    tests.build_nodeid_tests()
+    tests.build_remap_tests()
+
+    if o.options['list-tests']:
+        tests.print_list()
+        sys.exit(0)
+    elif o.options['show-usage']:
+        o.show_usage()
+        sys.exit(0)
+
+    print "Starting ..."
+
+    if o.options['no-cpg']:
+        use_corosync = 0
+
+    tests.setup_environment(use_corosync)
+
+    if o.options['run-only-pattern'] != "":
+        tests.run_tests_matching(o.options['run-only-pattern'])
+        tests.print_results()
+    elif o.options['run-only'] != "":
+        tests.run_single(o.options['run-only'])
+        tests.print_results()
+    elif o.options['no-cpg']:
+        tests.run_no_cpg()
+        tests.print_results()
+    elif o.options['cpg-only']:
+        tests.run_cpg_only()
+        tests.print_results()
+    else:
+        tests.run_tests()
+        tests.print_results()
+
+    tests.cleanup_environment(use_corosync)
+    tests.exit()
 if __name__=="__main__":
-	main(sys.argv)
+    main(sys.argv)
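
With build_remap_tests() now wired into main(), the new reboot-remapping scenarios can be run on their own. For example, assuming a root shell on a host where corosync and the bundled fence_dummy agent are available (the same prerequisites as the other cpg tests):

    python ./regression.py --run-only-pattern remap
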
diff --git a/fencing/remote.c b/fencing/remote.c
index a568035..2c00b5f 100644
--- a/fencing/remote.c
+++ b/fencing/remote.c
@@ -47,17 +47,37 @@
 
 #define TIMEOUT_MULTIPLY_FACTOR 1.2
 
+/* When one stonithd queries its peers for devices able to handle a fencing
+ * request, each peer will reply with a list of such devices available to it.
+ * Each reply will be parsed into a st_query_result_t, with each device's
+ * information kept in a device_properties_t.
+ */
+
+typedef struct device_properties_s {
+    /* Whether access to this device has been verified */
+    gboolean verified;
+
+    /* The remaining members are indexed by the operation's "phase" */
+
+    /* Whether this device has been executed in each phase */
+    gboolean executed[3];
+    /* Whether this device is disallowed from executing in each phase */
+    gboolean disallowed[3];
+    /* Action-specific timeout for each phase */
+    int custom_action_timeout[3];
+    /* Action-specific maximum random delay for each phase */
+    int delay_max[3];
+} device_properties_t;
+
 typedef struct st_query_result_s {
+    /* Name of peer that sent this result */
     char *host;
-    int devices;
-    /* only try peers for non-topology based operations once */
+    /* Only try peers for non-topology based operations once */
     gboolean tried;
-    GListPtr device_list;
-    GHashTable *custom_action_timeouts;
-    GHashTable *delay_maxes;
-    /* Subset of devices that peer has verified connectivity on */
-    GHashTable *verified_devices;
-
+    /* Number of entries in the devices table */
+    int ndevices;
+    /* Devices available to this host that are capable of fencing the target */
+    GHashTable *devices;
 } st_query_result_t;
 
 GHashTable *remote_op_list = NULL;
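
The comment and structures above replace three parallel hash tables (custom_action_timeouts, delay_maxes, verified_devices) with a single per-peer table keyed by device ID, holding one device_properties_t with per-phase state for each device. A minimal standalone sketch of that lookup pattern, assuming only GLib; the helper name, device name and numeric phase values below are illustrative, not part of the patch:

    /* Illustrative sketch only -- mimics the find_peer_device() check added
     * later in this patch: a device is usable in a given phase only if it has
     * not already been executed and is not disallowed for that phase.
     * Build against GLib (pkg-config glib-2.0).
     */
    #include <glib.h>
    #include <stdio.h>

    typedef struct {
        gboolean verified;
        gboolean executed[3];
        gboolean disallowed[3];
        int custom_action_timeout[3];
        int delay_max[3];
    } device_properties_t;

    static device_properties_t *
    usable_device(GHashTable *devices, const char *id, int phase)
    {
        device_properties_t *props = g_hash_table_lookup(devices, id);

        return (props && !props->executed[phase] && !props->disallowed[phase])
               ? props : NULL;
    }

    int
    main(void)
    {
        GHashTable *devices = g_hash_table_new_full(g_str_hash, g_str_equal,
                                                    g_free, g_free);
        device_properties_t *props = g_new0(device_properties_t, 1);

        props->verified = TRUE;
        props->disallowed[2] = TRUE;   /* assume this peer may not run "on" (phase 2) */
        g_hash_table_insert(devices, g_strdup("true1"), props);

        printf("usable for off: %s\n", usable_device(devices, "true1", 1) ? "yes" : "no");
        printf("usable for on:  %s\n", usable_device(devices, "true1", 2) ? "yes" : "no");

        g_hash_table_destroy(devices);
        return 0;
    }

Indexing every per-device field by phase is what lets one query reply carry separate timeouts and permissions for the off and on halves of a remapped reboot, as parsed later in this patch by parse_action_specific().
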
@@ -67,8 +87,8 @@ extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op
                                   int call_options);
 
 static void report_timeout_period(remote_fencing_op_t * op, int op_timeout);
-static int get_op_total_timeout(remote_fencing_op_t * op, st_query_result_t * chosen_peer,
-                                int default_timeout);
+static int get_op_total_timeout(const remote_fencing_op_t *op,
+                                const st_query_result_t *chosen_peer);
 
 static gint
 sort_strings(gconstpointer a, gconstpointer b)
@@ -83,15 +103,126 @@ free_remote_query(gpointer data)
         st_query_result_t *query = data;
 
         crm_trace("Free'ing query result from %s", query->host);
+        g_hash_table_destroy(query->devices);
         free(query->host);
-        g_list_free_full(query->device_list, free);
-        g_hash_table_destroy(query->custom_action_timeouts);
-        g_hash_table_destroy(query->delay_maxes);
-        g_hash_table_destroy(query->verified_devices);
         free(query);
     }
 }
 
+struct peer_count_data {
+    const remote_fencing_op_t *op;
+    gboolean verified_only;
+    int count;
+};
+
+/*!
+ * \internal
+ * \brief Increment a counter if a device has not been executed yet
+ *
+ * \param[in] key        Device ID (ignored)
+ * \param[in] value      Device properties
+ * \param[in] user_data  Peer count data
+ */
+static void
+count_peer_device(gpointer key, gpointer value, gpointer user_data)
+{
+    device_properties_t *props = (device_properties_t*)value;
+    struct peer_count_data *data = user_data;
+
+    if (!props->executed[data->op->phase]
+        && (!data->verified_only || props->verified)) {
+        ++(data->count);
+    }
+}
+
+/*!
+ * \internal
+ * \brief Check the number of available devices in a peer's query results
+ *
+ * \param[in] op             Operation that results are for
+ * \param[in] peer           Peer to count
+ * \param[in] verified_only  Whether to count only verified devices
+ *
+ * \return Number of devices available to peer that were not already executed
+ */
+static int
+count_peer_devices(const remote_fencing_op_t *op, const st_query_result_t *peer,
+                   gboolean verified_only)
+{
+    struct peer_count_data data;
+
+    data.op = op;
+    data.verified_only = verified_only;
+    data.count = 0;
+    if (peer) {
+        g_hash_table_foreach(peer->devices, count_peer_device, &data);
+    }
+    return data.count;
+}
+
+/*!
+ * \internal
+ * \brief Search for a device in a query result
+ *
+ * \param[in] op      Operation that result is for
+ * \param[in] peer    Query result for a peer
+ * \param[in] device  Device ID to search for
+ *
+ * \return Device properties if found, NULL otherwise
+ */
+static device_properties_t *
+find_peer_device(const remote_fencing_op_t *op, const st_query_result_t *peer,
+                 const char *device)
+{
+    device_properties_t *props = g_hash_table_lookup(peer->devices, device);
+
+    return (props && !props->executed[op->phase]
+           && !props->disallowed[op->phase])? props : NULL;
+}
+
+/*!
+ * \internal
+ * \brief Find a device in a peer's device list and mark it as executed
+ *
+ * \param[in]     op                     Operation that peer result is for
+ * \param[in,out] peer                   Peer with results to search
+ * \param[in]     device                 ID of device to mark as done
+ * \param[in]     verified_devices_only  Only consider verified devices
+ *
+ * \return TRUE if device was found and marked, FALSE otherwise
+ */
+static gboolean
+grab_peer_device(const remote_fencing_op_t *op, st_query_result_t *peer,
+                 const char *device, gboolean verified_devices_only)
+{
+    device_properties_t *props = find_peer_device(op, peer, device);
+
+    if ((props == NULL) || (verified_devices_only && !props->verified)) {
+        return FALSE;
+    }
+
+    crm_trace("Removing %s from %s (%d remaining)",
+              device, peer->host, count_peer_devices(op, peer, FALSE));
+    props->executed[op->phase] = TRUE;
+    return TRUE;
+}
+
+/*!
+ * \internal
+ * \brief Free the list of required devices for a particular phase
+ *
+ * \param[in,out] op     Operation to modify
+ * \param[in]     phase  Phase to modify
+ */
+static void
+free_required_list(remote_fencing_op_t *op, enum st_remap_phase phase)
+{
+    if (op->required_list[phase]) {
+        g_list_free_full(op->required_list[phase], free);
+        op->required_list[phase] = NULL;
+    }
+}
+
 static void
 clear_remote_op_timers(remote_fencing_op_t * op)
 {
@@ -137,13 +268,100 @@ free_remote_op(gpointer data)
         g_list_free_full(op->devices_list, free);
         op->devices_list = NULL;
     }
-    if (op->required_list) {
-        g_list_free_full(op->required_list, free);
-        op->required_list = NULL;
-    }
+    free_required_list(op, st_phase_requested);
+    free_required_list(op, st_phase_off);
+    free_required_list(op, st_phase_on);
     free(op);
 }
 
+/*!
+ * \internal
+ * \brief Return an operation's originally requested action (before any remap)
+ *
+ * \param[in] op  Operation to check
+ *
+ * \return Operation's original action
+ */
+static const char *
+op_requested_action(const remote_fencing_op_t *op)
+{
+    return ((op->phase > st_phase_requested)? "reboot" : op->action);
+}
+
+/*!
+ * \internal
+ * \brief Remap a "reboot" operation to the "off" phase
+ *
+ * \param[in,out] op      Operation to remap
+ */
+static void
+op_phase_off(remote_fencing_op_t *op)
+{
+    crm_info("Remapping multiple-device reboot of %s (%s) to off",
+             op->target, op->id);
+    op->phase = st_phase_off;
+
+    /* Happily, "off" and "on" are shorter than "reboot", so we can reuse the
+     * memory allocation at each phase.
+     */
+    strcpy(op->action, "off");
+}
+
+/*!
+ * \internal
+ * \brief Advance a remapped reboot operation to the "on" phase
+ *
+ * \param[in,out] op  Operation to remap
+ */
+static void
+op_phase_on(remote_fencing_op_t *op)
+{
+    GListPtr iter = NULL;
+
+    crm_info("Remapped off of %s complete, remapping to on for %s.%.8s",
+             op->target, op->client_name, op->id);
+    op->phase = st_phase_on;
+    strcpy(op->action, "on");
+
+    /* Any devices that are required for "on" will be automatically executed by
+     * the cluster when the node next joins, so we skip them here.
+     */
+    for (iter = op->required_list[op->phase]; iter != NULL; iter = iter->next) {
+        GListPtr match = g_list_find_custom(op->devices_list, iter->data,
+                                            sort_strings);
+
+        if (match) {
+            op->devices_list = g_list_remove(op->devices_list, match->data);
+        }
+    }
+
+    /* We know this level will succeed, because phase 1 completed successfully
+     * and we ignore any errors from phase 2. So we can free the required list,
+     * which will keep them from being executed after the device list is done.
+     */
+    free_required_list(op, op->phase);
+
+    /* Rewind device list pointer */
+    op->devices = op->devices_list;
+}
+
+/*!
+ * \internal
+ * \brief Reset a remapped reboot operation
+ *
+ * \param[in,out] op  Operation to reset
+ */
+static void
+undo_op_remap(remote_fencing_op_t *op)
+{
+    if (op->phase > 0) {
+        crm_info("Undoing remap of reboot of %s for %s.%.8s",
+                 op->target, op->client_name, op->id);
+        op->phase = st_phase_requested;
+        strcpy(op->action, "reboot");
+    }
+}
+
 static xmlNode *
 create_op_done_notify(remote_fencing_op_t * op, int rc)
 {
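
op_phase_off(), op_phase_on() and undo_op_remap() above only ever rewrite op->action in place, relying on "off" and "on" being shorter than the originally allocated "reboot". A standalone toy sketch of that lifecycle; the struct, function names and explicit enum values are illustrative assumptions (the patch itself only shows that st_phase_requested compares lowest):

    /* Illustrative sketch only: requested -> off -> on, then undo. */
    #include <stdio.h>
    #include <string.h>

    enum st_remap_phase { st_phase_requested = 0, st_phase_off = 1, st_phase_on = 2 };

    struct toy_op {
        enum st_remap_phase phase;
        char action[8];             /* big enough for "reboot", "off" and "on" */
    };

    static void toy_phase_off(struct toy_op *op) { op->phase = st_phase_off; strcpy(op->action, "off"); }
    static void toy_phase_on(struct toy_op *op)  { op->phase = st_phase_on;  strcpy(op->action, "on");  }

    static void
    toy_undo_remap(struct toy_op *op)
    {
        if (op->phase > st_phase_requested) {
            op->phase = st_phase_requested;
            strcpy(op->action, "reboot");
        }
    }

    int
    main(void)
    {
        struct toy_op op = { st_phase_requested, "reboot" };

        toy_phase_off(&op);  printf("phase %d, action %s\n", op.phase, op.action);
        toy_phase_on(&op);   printf("phase %d, action %s\n", op.phase, op.action);
        toy_undo_remap(&op); printf("phase %d, action %s\n", op.phase, op.action);
        return 0;
    }
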
@@ -271,6 +489,7 @@ remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup)
 
     op->completed = time(NULL);
     clear_remote_op_timers(op);
+    undo_op_remap(op);
 
     if (op->notify_sent == TRUE) {
         crm_err("Already sent notifications for '%s of %s by %s' (for=%s@%s.%.8s, state=%d): %s",
@@ -279,10 +498,12 @@ remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup)
         goto remote_op_done_cleanup;
     }
 
-    if (!op->delegate && data) {
+    if (!op->delegate && data && rc != -ENODEV && rc != -EHOSTUNREACH) {
         xmlNode *ndata = get_xpath_object("//@" F_STONITH_DELEGATE, data, LOG_TRACE);
         if(ndata) {
             op->delegate = crm_element_value_copy(ndata, F_STONITH_DELEGATE);
+        } else { 
+            op->delegate = crm_element_value_copy(data, F_ORIG);
         }
     }
 
@@ -377,6 +598,16 @@ remote_op_timeout(gpointer userdata)
 
     crm_debug("Action %s (%s) for %s (%s) timed out",
               op->action, op->id, op->target, op->client_name);
+
+    if (op->phase == st_phase_on) {
+        /* A remapped reboot operation timed out in the "on" phase, but the
+         * "off" phase completed successfully, so quit trying any further
+         * devices, and return success.
+         */
+        remote_op_done(op, NULL, pcmk_ok, FALSE);
+        return FALSE;
+    }
+
     op->state = st_failed;
 
     remote_op_done(op, NULL, -ETIME, FALSE);
@@ -426,22 +657,43 @@ topology_is_empty(stonith_topology_t *tp)
     return TRUE;
 }
 
+/*!
+ * \internal
+ * \brief Add a device to the required list for a particular phase
+ *
+ * \param[in,out] op      Operation to modify
+ * \param[in]     phase   Phase to modify
+ * \param[in]     device  Device ID to add
+ */
 static void
-add_required_device(remote_fencing_op_t * op, const char *device)
+add_required_device(remote_fencing_op_t *op, enum st_remap_phase phase,
+                    const char *device)
 {
-    GListPtr match  = g_list_find_custom(op->required_list, device, sort_strings);
-    if (match) {
-        /* device already marked required */
-        return;
+    GListPtr match  = g_list_find_custom(op->required_list[phase], device,
+                                         sort_strings);
+
+    if (!match) {
+        op->required_list[phase] = g_list_prepend(op->required_list[phase],
+                                                  strdup(device));
     }
-    op->required_list = g_list_prepend(op->required_list, strdup(device));
+}
 
-    /* make sure the required devices is in the current list of devices to be executed */
-    if (op->devices_list) {
-        GListPtr match  = g_list_find_custom(op->devices_list, device, sort_strings);
-        if (match == NULL) {
-           op->devices_list = g_list_append(op->devices_list, strdup(device));
-        }
+/*!
+ * \internal
+ * \brief Remove a device from the required list for the current phase
+ *
+ * \param[in,out] op      Operation to modify
+ * \param[in]     device  Device ID to remove
+ */
+static void
+remove_required_device(remote_fencing_op_t *op, const char *device)
+{
+    GListPtr match = g_list_find_custom(op->required_list[op->phase], device,
+                                        sort_strings);
+
+    if (match) {
+        op->required_list[op->phase] = g_list_remove(op->required_list[op->phase],
+                                                     match->data);
     }
 }
 
@@ -458,18 +710,6 @@ set_op_device_list(remote_fencing_op_t * op, GListPtr devices)
     for (lpc = devices; lpc != NULL; lpc = lpc->next) {
         op->devices_list = g_list_append(op->devices_list, strdup(lpc->data));
     }
-
-    /* tack on whatever required devices have not been executed
-     * to the end of the current devices list. This ensures that
-     * the required devices will get executed regardless of what topology
-     * level they exist at. */
-    for (lpc = op->required_list; lpc != NULL; lpc = lpc->next) {
-        GListPtr match  = g_list_find_custom(op->devices_list, lpc->data, sort_strings);
-        if (match == NULL) {
-           op->devices_list = g_list_append(op->devices_list, strdup(lpc->data));
-        }
-    }
-
     op->devices = op->devices_list;
 }
 
@@ -491,6 +731,7 @@ find_topology_for_host(const char *host)
                 crm_info("Bad regex '%s' for fencing level", tp->node);
             } else {
                 status = regexec(&r_patt, host, 0, NULL, 0);
+                regfree(&r_patt);
             }
 
             if (status == 0) {
@@ -529,6 +770,9 @@ stonith_topology_next(remote_fencing_op_t * op)
 
     set_bit(op->call_options, st_opt_topology);
 
+    /* This is a new level, so undo any remapping left over from previous */
+    undo_op_remap(op);
+
     do {
         op->level++;
 
@@ -539,6 +783,15 @@ stonith_topology_next(remote_fencing_op_t * op)
                   op->level, op->target, g_list_length(tp->levels[op->level]),
                   op->client_name, op->originator, op->id);
         set_op_device_list(op, tp->levels[op->level]);
+
+        if (g_list_next(op->devices_list) && safe_str_eq(op->action, "reboot")) {
+            /* A reboot has been requested for a topology level with multiple
+             * devices. Instead of rebooting the devices sequentially, we will
+             * turn them all off, then turn them all on again. (Think about
+             * switched power outlets for redundant power supplies.)
+             */
+            op_phase_off(op);
+        }
         return pcmk_ok;
     }
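
Concretely, for a topology level with two switched power outlets (the situation the cpg_remap tests at the top of this patch exercise), the requested reboot is first remapped to off and both devices cut power; op_phase_on() then flips the operation to on and both devices restore power. Failures and timeouts in the on half are deliberately ignored, and the target node is never chosen to run its own on actions, because the off half has already succeeded and the node is down.
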
 
@@ -563,6 +816,7 @@ merge_duplicates(remote_fencing_op_t * op)
     g_hash_table_iter_init(&iter, remote_op_list);
     while (g_hash_table_iter_next(&iter, NULL, (void **)&other)) {
         crm_node_t *peer = NULL;
+        const char *other_action = op_requested_action(other);
 
         if (other->state > st_exec) {
             /* Must be in-progress */
@@ -570,8 +824,9 @@ merge_duplicates(remote_fencing_op_t * op)
         } else if (safe_str_neq(op->target, other->target)) {
             /* Must be for the same node */
             continue;
-        } else if (safe_str_neq(op->action, other->action)) {
-            crm_trace("Must be for the same action: %s vs. ", op->action, other->action);
+        } else if (safe_str_neq(op->action, other_action)) {
+            crm_trace("Must be for the same action: %s vs. %s",
+                      op->action, other_action);
             continue;
         } else if (safe_str_eq(op->client_name, other->client_name)) {
             crm_trace("Must be for different clients: %s", op->client_name);
@@ -602,7 +857,7 @@ merge_duplicates(remote_fencing_op_t * op)
         if (other->total_timeout == 0) {
             crm_trace("Making a best-guess as to the timeout used");
             other->total_timeout = op->total_timeout =
-                TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, NULL, op->base_timeout);
+                TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, NULL);
         }
         crm_notice
             ("Merging stonith action %s for node %s originating from client %s.%.8s with identical request from %s@%s.%.8s (%ds)",
@@ -792,16 +1047,16 @@ initiate_remote_stonith_op(crm_client_t * client, xmlNode * request, gboolean ma
                        op->id, op->state);
     }
 
-    query = stonith_create_op(op->client_callid, op->id, STONITH_OP_QUERY, NULL, 0);
+    query = stonith_create_op(op->client_callid, op->id, STONITH_OP_QUERY,
+                              NULL, op->call_options);
 
     crm_xml_add(query, F_STONITH_REMOTE_OP_ID, op->id);
     crm_xml_add(query, F_STONITH_TARGET, op->target);
-    crm_xml_add(query, F_STONITH_ACTION, op->action);
+    crm_xml_add(query, F_STONITH_ACTION, op_requested_action(op));
     crm_xml_add(query, F_STONITH_ORIGIN, op->originator);
     crm_xml_add(query, F_STONITH_CLIENTID, op->client_id);
     crm_xml_add(query, F_STONITH_CLIENTNAME, op->client_name);
     crm_xml_add_int(query, F_STONITH_TIMEOUT, op->base_timeout);
-    crm_xml_add_int(query, F_STONITH_CALLOPTS, op->call_options);
 
     send_cluster_message(NULL, crm_msg_stonith_ng, query, FALSE);
     free_xml(query);
@@ -835,7 +1090,7 @@ find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer
         st_query_result_t *peer = iter->data;
 
         crm_trace("Testing result from %s for %s with %d devices: %d %x",
-                  peer->host, op->target, peer->devices, peer->tried, options);
+                  peer->host, op->target, peer->ndevices, peer->tried, options);
         if ((options & FIND_PEER_SKIP_TARGET) && safe_str_eq(peer->host, op->target)) {
             continue;
         }
@@ -844,25 +1099,13 @@ find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer
         }
 
         if (is_set(op->call_options, st_opt_topology)) {
-            /* Do they have the next device of the current fencing level? */
-            GListPtr match = NULL;
-
-            if (verified_devices_only && !g_hash_table_lookup(peer->verified_devices, device)) {
-                continue;
-            }
 
-            match = g_list_find_custom(peer->device_list, device, sort_strings);
-            if (match) {
-                crm_trace("Removing %s from %s (%d remaining)", (char *)match->data, peer->host,
-                          g_list_length(peer->device_list));
-                peer->device_list = g_list_remove(peer->device_list, match->data);
+            if (grab_peer_device(op, peer, device, verified_devices_only)) {
                 return peer;
             }
 
-        } else if (peer->devices > 0 && peer->tried == FALSE) {
-            if (verified_devices_only && !g_hash_table_size(peer->verified_devices)) {
-                continue;
-            }
+        } else if ((peer->tried == FALSE)
+                   && count_peer_devices(op, peer, verified_devices_only)) {
 
             /* No topology: Use the current best peer */
             crm_trace("Simple fencing");
@@ -883,11 +1126,14 @@ stonith_choose_peer(remote_fencing_op_t * op)
     do {
         if (op->devices) {
             device = op->devices->data;
-            crm_trace("Checking for someone to fence %s with %s", op->target, device);
+            crm_trace("Checking for someone to fence (%s) %s with %s",
+                      op->action, op->target, device);
         } else {
-            crm_trace("Checking for someone to fence %s", op->target);
+            crm_trace("Checking for someone to fence (%s) %s",
+                      op->action, op->target);
         }
 
+        /* Best choice is a peer other than the target with verified access */
         peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET|FIND_PEER_VERIFIED_ONLY);
         if (peer) {
             crm_trace("Found verified peer %s for %s", peer->host, device?device:"<any>");
@@ -899,62 +1145,101 @@ stonith_choose_peer(remote_fencing_op_t * op)
             return NULL;
         }
 
+        /* If no other peer has verified access, next best is unverified access */
         peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET);
         if (peer) {
             crm_trace("Found best unverified peer %s", peer->host);
             return peer;
         }
 
-        peer = find_best_peer(device, op, FIND_PEER_TARGET_ONLY);
-        if(peer) {
-            crm_trace("%s will fence itself", peer->host);
-            return peer;
+        /* If no other peer can do it, last option is self-fencing
+         * (which is never allowed for the "on" phase of a remapped reboot)
+         */
+        if (op->phase != st_phase_on) {
+            peer = find_best_peer(device, op, FIND_PEER_TARGET_ONLY);
+            if (peer) {
+                crm_trace("%s will fence itself", peer->host);
+                return peer;
+            }
         }
 
-        /* Try the next fencing level if there is one */
-    } while (is_set(op->call_options, st_opt_topology)
+        /* Try the next fencing level if there is one (unless we're in the "on"
+         * phase of a remapped "reboot", because we ignore errors in that case)
+         */
+    } while ((op->phase != st_phase_on)
+             && is_set(op->call_options, st_opt_topology)
              && stonith_topology_next(op) == pcmk_ok);
 
-    crm_notice("Couldn't find anyone to fence %s with %s", op->target, device?device:"<any>");
+    crm_notice("Couldn't find anyone to fence (%s) %s with %s",
+               op->action, op->target, (device? device : "any device"));
     return NULL;
 }
 
 static int
-get_device_timeout(st_query_result_t * peer, const char *device, int default_timeout)
+get_device_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer,
+                   const char *device)
 {
-    gpointer res;
-    int delay_max = 0;
+    device_properties_t *props;
 
     if (!peer || !device) {
-        return default_timeout;
+        return op->base_timeout;
     }
 
-    res = g_hash_table_lookup(peer->delay_maxes, device);
-    if (res && GPOINTER_TO_INT(res) > 0) {
-        delay_max = GPOINTER_TO_INT(res);
+    props = g_hash_table_lookup(peer->devices, device);
+    if (!props) {
+        return op->base_timeout;
     }
 
-    res = g_hash_table_lookup(peer->custom_action_timeouts, device);
+    return (props->custom_action_timeout[op->phase]?
+           props->custom_action_timeout[op->phase] : op->base_timeout)
+           + props->delay_max[op->phase];
+}
 
-    return res ? GPOINTER_TO_INT(res) + delay_max : default_timeout + delay_max;
+struct timeout_data {
+    const remote_fencing_op_t *op;
+    const st_query_result_t *peer;
+    int total_timeout;
+};
+
+/*!
+ * \internal
+ * \brief Add timeout to a total if device has not been executed yet
+ *
+ * \param[in] key        GHashTable key (device ID)
+ * \param[in] value      GHashTable value (device properties)
+ * \param[in] user_data  Timeout data
+ */
+static void
+add_device_timeout(gpointer key, gpointer value, gpointer user_data)
+{
+    const char *device_id = key;
+    device_properties_t *props = value;
+    struct timeout_data *timeout = user_data;
+
+    if (!props->executed[timeout->op->phase]
+        && !props->disallowed[timeout->op->phase]) {
+        timeout->total_timeout += get_device_timeout(timeout->op,
+                                                     timeout->peer, device_id);
+    }
 }
 
 static int
-get_peer_timeout(st_query_result_t * peer, int default_timeout)
+get_peer_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer)
 {
-    int total_timeout = 0;
+    struct timeout_data timeout;
 
-    GListPtr cur = NULL;
+    timeout.op = op;
+    timeout.peer = peer;
+    timeout.total_timeout = 0;
 
-    for (cur = peer->device_list; cur; cur = cur->next) {
-        total_timeout += get_device_timeout(peer, cur->data, default_timeout);
-    }
+    g_hash_table_foreach(peer->devices, add_device_timeout, &timeout);
 
-    return total_timeout ? total_timeout : default_timeout;
+    return (timeout.total_timeout? timeout.total_timeout : op->base_timeout);
 }
 
 static int
-get_op_total_timeout(remote_fencing_op_t * op, st_query_result_t * chosen_peer, int default_timeout)
+get_op_total_timeout(const remote_fencing_op_t *op,
+                     const st_query_result_t *chosen_peer)
 {
     int total_timeout = 0;
     stonith_topology_t *tp = find_topology_for_host(op->target);
@@ -977,11 +1262,11 @@ get_op_total_timeout(remote_fencing_op_t * op, st_query_result_t * chosen_peer,
             }
             for (device_list = tp->levels[i]; device_list; device_list = device_list->next) {
                 for (iter = op->query_results; iter != NULL; iter = iter->next) {
-                    st_query_result_t *peer = iter->data;
+                    const st_query_result_t *peer = iter->data;
 
-                    if (g_list_find_custom(peer->device_list, device_list->data, sort_strings)) {
-                        total_timeout +=
-                            get_device_timeout(peer, device_list->data, default_timeout);
+                    if (find_peer_device(op, peer, device_list->data)) {
+                        total_timeout += get_device_timeout(op, peer,
+                                                            device_list->data);
                         break;
                     }
                 }               /* End Loop3: match device with peer that owns device, find device's timeout period */
@@ -989,12 +1274,12 @@ get_op_total_timeout(remote_fencing_op_t * op, st_query_result_t * chosen_peer,
         }                       /*End Loop1: iterate through fencing levels */
 
     } else if (chosen_peer) {
-        total_timeout = get_peer_timeout(chosen_peer, default_timeout);
+        total_timeout = get_peer_timeout(op, chosen_peer);
     } else {
-        total_timeout = default_timeout;
+        total_timeout = op->base_timeout;
     }
 
-    return total_timeout ? total_timeout : default_timeout;
+    return total_timeout ? total_timeout : op->base_timeout;
 }
 
 static void
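
To make the timeout arithmetic above concrete (the numbers are illustrative): with a base timeout of 60s, a device that advertised an off-specific timeout of 20s and a maximum random delay of 5s contributes 20 + 5 = 25s during the off phase, while a device with no action-specific timeout contributes 60 + 5 = 65s. A peer's timeout is the sum over its not-yet-executed devices, falling back to the base timeout when that sum is zero, and the value actually armed as a timer is further multiplied by TIMEOUT_MULTIPLY_FACTOR (1.2).
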
@@ -1049,6 +1334,55 @@ report_timeout_period(remote_fencing_op_t * op, int op_timeout)
     }
 }
 
+/*!
+ * \internal
+ * \brief Advance an operation to the next device in its topology
+ *
+ * \param[in,out] op      Operation to advance
+ * \param[in]     device  ID of device just completed
+ * \param[in]     msg     XML reply that contained device result (if available)
+ * \param[in]     rc      Return code of device's execution
+ */
+static void
+advance_op_topology(remote_fencing_op_t *op, const char *device, xmlNode *msg,
+                    int rc)
+{
+    /* Advance to the next device at this topology level, if any */
+    if (op->devices) {
+        op->devices = op->devices->next;
+    }
+
+    /* If this device was required, it's not anymore */
+    remove_required_device(op, device);
+
+    /* If there are no more devices at this topology level,
+     * run through any required devices not already executed
+     */
+    if (op->devices == NULL) {
+        op->devices = op->required_list[op->phase];
+    }
+
+    if ((op->devices == NULL) && (op->phase == st_phase_off)) {
+        /* We're done with this level and with required devices, but we had
+         * remapped "reboot" to "off", so start over with "on". If any devices
+         * need to be turned back on, op->devices will be non-NULL after this.
+         */
+        op_phase_on(op);
+    }
+
+    if (op->devices) {
+        /* Necessary devices remain, so execute the next one */
+        crm_trace("Next for %s on behalf of %s@%s (rc was %d)",
+                  op->target, op->originator, op->client_name, rc);
+        call_remote_stonith(op, NULL);
+    } else {
+        /* We're done with all devices and phases, so finalize operation */
+        crm_trace("Marking complex fencing op for %s as complete", op->target);
+        op->state = st_done;
+        remote_op_done(op, msg, rc, FALSE);
+    }
+}
+
 void
 call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer)
 {
@@ -1061,7 +1395,7 @@ call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer)
     }
 
     if (!op->op_timer_total) {
-        int total_timeout = get_op_total_timeout(op, peer, op->base_timeout);
+        int total_timeout = get_op_total_timeout(op, peer);
 
         op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * total_timeout;
         op->op_timer_total = g_timeout_add(1000 * op->total_timeout, remote_op_timeout, op);
@@ -1071,13 +1405,13 @@ call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer)
     }
 
     if (is_set(op->call_options, st_opt_topology) && op->devices) {
-        /* Ignore any preference, they might not have the device we need */
-        /* When using topology, the stonith_choose_peer function pops off
-         * the peer from the op's query results.  Make sure to calculate
-         * the op_timeout before calling this function when topology is in use */
+        /* Ignore any peer preference, they might not have the device we need */
+        /* When using topology, stonith_choose_peer() removes the device from
+         * further consideration, so be sure to calculate timeout beforehand */
         peer = stonith_choose_peer(op);
+
         device = op->devices->data;
-        timeout = get_device_timeout(peer, device, op->base_timeout);
+        timeout = get_device_timeout(op, peer, device);
     }
 
     if (peer) {
@@ -1094,15 +1428,15 @@ call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer)
         crm_xml_add_int(remote_op, F_STONITH_CALLOPTS, op->call_options);
 
         if (device) {
-            timeout_one =
-                TIMEOUT_MULTIPLY_FACTOR * get_device_timeout(peer, device, op->base_timeout);
+            timeout_one = TIMEOUT_MULTIPLY_FACTOR *
+                          get_device_timeout(op, peer, device);
             crm_info("Requesting that %s perform op %s %s with %s for %s (%ds)", peer->host,
                      op->action, op->target, device, op->client_name, timeout_one);
             crm_xml_add(remote_op, F_STONITH_DEVICE, device);
             crm_xml_add(remote_op, F_STONITH_MODE, "slave");
 
         } else {
-            timeout_one = TIMEOUT_MULTIPLY_FACTOR * get_peer_timeout(peer, op->base_timeout);
+            timeout_one = TIMEOUT_MULTIPLY_FACTOR * get_peer_timeout(op, peer);
             crm_info("Requesting that %s perform op %s %s for %s (%ds, %ds)",
                      peer->host, op->action, op->target, op->client_name, timeout_one, stonith_watchdog_timeout_ms);
             crm_xml_add(remote_op, F_STONITH_MODE, "smart");
@@ -1115,16 +1449,18 @@ call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer)
         }
 
         if(stonith_watchdog_timeout_ms > 0 && device && safe_str_eq(device, "watchdog")) {
-            crm_notice("Waiting %ds for %s to self-terminate for %s.%.8s (%p)",
-                       stonith_watchdog_timeout_ms/1000, op->target, op->client_name, op->id, device);
+            crm_notice("Waiting %ds for %s to self-fence (%s) for %s.%.8s (%p)",
+                       stonith_watchdog_timeout_ms/1000, op->target,
+                       op->action, op->client_name, op->id, device);
             op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op);
 
-            /* TODO: We should probably look into peer->device_list to verify watchdog is going to be in use */
+            /* TODO check devices to verify watchdog will be in use */
         } else if(stonith_watchdog_timeout_ms > 0
                   && safe_str_eq(peer->host, op->target)
                   && safe_str_neq(op->action, "on")) {
-            crm_notice("Waiting %ds for %s to self-terminate for %s.%.8s (%p)",
-                       stonith_watchdog_timeout_ms/1000, op->target, op->client_name, op->id, device);
+            crm_notice("Waiting %ds for %s to self-fence (%s) for %s.%.8s (%p)",
+                       stonith_watchdog_timeout_ms/1000, op->target,
+                       op->action, op->client_name, op->id, device);
             op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op);
 
         } else {
@@ -1137,13 +1473,23 @@ call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer)
         free_xml(remote_op);
         return;
 
+    } else if (op->phase == st_phase_on) {
+        /* A remapped "on" cannot be executed, but the node was already
+         * turned off successfully, so ignore the error and continue.
+         */
+        crm_warn("Ignoring %s 'on' failure (no capable peers) for %s after successful 'off'",
+                 device, op->target);
+        advance_op_topology(op, device, NULL, pcmk_ok);
+        return;
+
     } else if (op->owner == FALSE) {
-        crm_err("The termination of %s for %s is not ours to control", op->target, op->client_name);
+        crm_err("Fencing (%s) of %s for %s is not ours to control",
+                op->action, op->target, op->client_name);
 
     } else if (op->query_timer == 0) {
         /* We've exhausted all available peers */
-        crm_info("No remaining peers capable of terminating %s for %s (%d)", op->target,
-                 op->client_name, op->state);
+        crm_info("No remaining peers capable of fencing (%s) %s for %s (%d)",
+                 op->action, op->target, op->client_name, op->state);
         CRM_LOG_ASSERT(op->state < st_done);
         remote_op_timeout(op);
 
@@ -1153,33 +1499,37 @@ call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer)
         /* if the operation never left the query state,
          * but we have all the expected replies, then no devices
          * are available to execute the fencing operation. */
+
         if(stonith_watchdog_timeout_ms && (device == NULL || safe_str_eq(device, "watchdog"))) {
-            crm_notice("Waiting %ds for %s to self-terminate for %s.%.8s (%p)",
-                     stonith_watchdog_timeout_ms/1000, op->target, op->client_name, op->id, device);
+            crm_notice("Waiting %ds for %s to self-fence (%s) for %s.%.8s (%p)",
+                     stonith_watchdog_timeout_ms/1000, op->target,
+                     op->action, op->client_name, op->id, device);
 
             op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op);
             return;
         }
 
         if (op->state == st_query) {
-           crm_info("None of the %d peers have devices capable of terminating %s for %s (%d)",
-                   op->replies, op->target, op->client_name, op->state);
+           crm_info("None of the %d peers have devices capable of fencing (%s) %s for %s (%d)",
+                   op->replies, op->action, op->target, op->client_name,
+                   op->state);
 
             rc = -ENODEV;
         } else {
-           crm_info("None of the %d peers are capable of terminating %s for %s (%d)",
-                   op->replies, op->target, op->client_name, op->state);
+           crm_info("None of the %d peers are capable of fencing (%s) %s for %s (%d)",
+                   op->replies, op->action, op->target, op->client_name,
+                   op->state);
         }
 
         op->state = st_failed;
         remote_op_done(op, NULL, rc, FALSE);
 
     } else if (device) {
-        crm_info("Waiting for additional peers capable of terminating %s with %s for %s.%.8s",
-                 op->target, device, op->client_name, op->id);
+        crm_info("Waiting for additional peers capable of fencing (%s) %s with %s for %s.%.8s",
+                 op->action, op->target, device, op->client_name, op->id);
     } else {
-        crm_info("Waiting for additional peers capable of terminating %s for %s%.8s",
-                 op->target, op->client_name, op->id);
+        crm_info("Waiting for additional peers capable of fencing (%s) %s for %s%.8s",
+                 op->action, op->target, op->client_name, op->id);
     }
 }
 
@@ -1200,7 +1550,7 @@ sort_peers(gconstpointer a, gconstpointer b)
     const st_query_result_t *peer_a = a;
     const st_query_result_t *peer_b = b;
 
-    return (peer_b->devices - peer_a->devices);
+    return (peer_b->ndevices - peer_a->ndevices);
 }
 
 /*!
@@ -1212,7 +1562,7 @@ all_topology_devices_found(remote_fencing_op_t * op)
 {
     GListPtr device = NULL;
     GListPtr iter = NULL;
-    GListPtr match = NULL;
+    device_properties_t *match = NULL;
     stonith_topology_t *tp = NULL;
     gboolean skip_target = FALSE;
     int i;
@@ -1236,7 +1586,7 @@ all_topology_devices_found(remote_fencing_op_t * op)
                 if (skip_target && safe_str_eq(peer->host, op->target)) {
                     continue;
                 }
-                match = g_list_find_custom(peer->device_list, device->data, sort_strings);
+                match = find_peer_device(op, peer, device->data);
             }
             if (!match) {
                 return FALSE;
@@ -1247,10 +1597,169 @@ all_topology_devices_found(remote_fencing_op_t * op)
     return TRUE;
 }
 
+/*!
+ * \internal
+ * \brief Parse action-specific device properties from XML
+ *
+ * \param[in]     xml     XML element containing the properties
+ * \param[in]     peer    Name of peer that sent XML (for logs)
+ * \param[in]     device  Device ID (for logs)
+ * \param[in]     action  Action the properties relate to (for logs)
+ * \param[in,out] op      Operation to update (required-device lists)
+ * \param[in]     phase   Phase the properties relate to
+ * \param[in,out] props   Device properties to update
+ */
+static void
+parse_action_specific(xmlNode *xml, const char *peer, const char *device,
+                      const char *action, remote_fencing_op_t *op,
+                      enum st_remap_phase phase, device_properties_t *props)
+{
+    int required;
+
+    props->custom_action_timeout[phase] = 0;
+    crm_element_value_int(xml, F_STONITH_ACTION_TIMEOUT,
+                          &props->custom_action_timeout[phase]);
+    if (props->custom_action_timeout[phase]) {
+        crm_trace("Peer %s with device %s returned %s action timeout %d",
+                  peer, device, action, props->custom_action_timeout[phase]);
+    }
+
+    props->delay_max[phase] = 0;
+    crm_element_value_int(xml, F_STONITH_DELAY_MAX, &props->delay_max[phase]);
+    if (props->delay_max[phase]) {
+        crm_trace("Peer %s with device %s returned maximum of random delay %d for %s",
+                  peer, device, props->delay_max[phase], action);
+    }
+
+    required = 0;
+    crm_element_value_int(xml, F_STONITH_DEVICE_REQUIRED, &required);
+    if (required) {
+        /* If the action is marked as required, add the device to the
+         * operation's list of required devices for this phase. We use this
+         * for unfencing when executing a topology. In phase 0 (requested
+         * action) or phase 1 (remapped "off"), required devices get executed
+         * regardless of their topology level; in phase 2 (remapped "on"),
+         * required devices are not attempted, because the cluster will
+         * execute them automatically later.
+         */
+        crm_trace("Peer %s requires device %s to execute for action %s",
+                  peer, device, action);
+        add_required_device(op, phase, device);
+    }
+
+    /* If a reboot is remapped to off+on, it's possible that a node is allowed
+     * to perform one action but not another.
+     */
+    if (crm_is_true(crm_element_value(xml, F_STONITH_ACTION_DISALLOWED))) {
+        props->disallowed[phase] = TRUE;
+        crm_trace("Peer %s is disallowed from executing %s for device %s",
+                  peer, action, device);
+    }
+}
+
+/*!
+ * \internal
+ * \brief Parse one device's properties from peer's XML query reply
+ *
+ * \param[in]     xml       XML node containing device properties
+ * \param[in,out] op        Operation that query and reply relate to
+ * \param[in,out] result    Peer's results
+ * \param[in]     device    ID of device being parsed
+ */
+static void
+add_device_properties(xmlNode *xml, remote_fencing_op_t *op,
+                      st_query_result_t *result, const char *device)
+{
+    xmlNode *child;
+    int verified = 0;
+    device_properties_t *props = calloc(1, sizeof(device_properties_t));
+
+    /* Add a new entry to this result's devices list */
+    CRM_ASSERT(props != NULL);
+    g_hash_table_insert(result->devices, strdup(device), props);
+
+    /* Peers with verified (monitored) access will be preferred */
+    crm_element_value_int(xml, F_STONITH_DEVICE_VERIFIED, &verified);
+    if (verified) {
+        crm_trace("Peer %s has confirmed a verified device %s",
+                  result->host, device);
+        props->verified = TRUE;
+    }
+
+    /* Parse action-specific device properties */
+    parse_action_specific(xml, result->host, device, op_requested_action(op),
+                          op, st_phase_requested, props);
+    for (child = __xml_first_child(xml); child != NULL; child = __xml_next(child)) {
+        /* Replies for "reboot" operations will include the action-specific
+         * values for "off" and "on" in child elements, just in case the reboot
+         * winds up getting remapped.
+         */
+        if (safe_str_eq(ID(child), "off")) {
+            parse_action_specific(child, result->host, device, "off",
+                                  op, st_phase_off, props);
+        } else if (safe_str_eq(ID(child), "on")) {
+            parse_action_specific(child, result->host, device, "on",
+                                  op, st_phase_on, props);
+        }
+    }
+}
+
+/*!
+ * \internal
+ * \brief Parse a peer's XML query reply and add it to operation's results
+ *
+ * \param[in,out] op        Operation that query and reply relate to
+ * \param[in]     host      Name of peer that sent this reply
+ * \param[in]     ndevices  Number of devices expected in reply
+ * \param[in]     xml       XML node containing device list
+ *
+ * \return Newly allocated result structure with parsed reply
+ */
+static st_query_result_t *
+add_result(remote_fencing_op_t *op, const char *host, int ndevices, xmlNode *xml)
+{
+    st_query_result_t *result = calloc(1, sizeof(st_query_result_t));
+    xmlNode *child;
+
+    CRM_CHECK(result != NULL, return NULL);
+    result->host = strdup(host);
+    result->devices = g_hash_table_new_full(crm_str_hash, g_str_equal, free, free);
+
+    /* Each child element describes one capable device available to the peer */
+    for (child = __xml_first_child(xml); child != NULL; child = __xml_next(child)) {
+        const char *device = ID(child);
+
+        if (device) {
+            add_device_properties(child, op, result, device);
+        }
+    }
+
+    result->ndevices = g_hash_table_size(result->devices);
+    CRM_CHECK(ndevices == result->ndevices,
+              crm_err("Query claimed to have %d devices but %d found",
+                      ndevices, result->ndevices));
+
+    op->query_results = g_list_insert_sorted(op->query_results, result, sort_peers);
+    return result;
+}
+
+/*!
+ * \internal
+ * \brief Handle a peer's reply to our fencing query
+ *
+ * Parse a query result from XML and store it in the remote operation
+ * table, and when enough replies have been received, issue a fencing request.
+ *
+ * \param[in] msg  XML reply received
+ *
+ * \return pcmk_ok on success, -errno on error
+ *
+ * \note See initiate_remote_stonith_op() for how the XML query was initially
+ *       formed, and stonith_query() for how the peer formed its XML reply.
+ */
 int
 process_remote_stonith_query(xmlNode * msg)
 {
-    int devices = 0;
+    int ndevices = 0;
     gboolean host_is_target = FALSE;
     gboolean have_all_replies = FALSE;
     const char *id = NULL;
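
As an illustration of the required-device handling above: when a peer's reply marks a device as required for the originally requested action (phase 0) or for the off half of a remapped reboot (phase 1), add_required_device() records it so that advance_op_topology() will still run it even if its topology level has already been passed over; this is how unfencing devices are guaranteed to execute. A device marked required only for the on half (phase 2) is recorded but then discarded by op_phase_on(), since the cluster unfences the node itself when it rejoins.
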
@@ -1259,7 +1768,6 @@ process_remote_stonith_query(xmlNode * msg)
     st_query_result_t *result = NULL;
     uint32_t replies_expected;
     xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);
-    xmlNode *child = NULL;
 
     CRM_CHECK(dev != NULL, return -EPROTO);
 
@@ -1268,7 +1776,7 @@ process_remote_stonith_query(xmlNode * msg)
 
     dev = get_xpath_object("//@" F_STONITH_AVAILABLE_DEVICES, msg, LOG_ERR);
     CRM_CHECK(dev != NULL, return -EPROTO);
-    crm_element_value_int(dev, F_STONITH_AVAILABLE_DEVICES, &devices);
+    crm_element_value_int(dev, F_STONITH_AVAILABLE_DEVICES, &ndevices);
 
     op = g_hash_table_lookup(remote_op_list, id);
     if (op == NULL) {
@@ -1283,75 +1791,13 @@ process_remote_stonith_query(xmlNode * msg)
     host = crm_element_value(msg, F_ORIG);
     host_is_target = safe_str_eq(host, op->target);
 
-    if (devices <= 0) {
-        /* If we're doing 'known' then we might need to fire anyway */
-        crm_trace("Query result %d of %d from %s for %s/%s (%d devices) %s",
-                  op->replies, replies_expected, host,
-                  op->target, op->action, devices, id);
-        if (have_all_replies) {
-            crm_info("All query replies have arrived, continuing (%d expected/%d received for id %s)",
-                     replies_expected, op->replies, id);
-            call_remote_stonith(op, NULL);
-        }
-        return pcmk_ok;
-    }
-
     crm_info("Query result %d of %d from %s for %s/%s (%d devices) %s",
              op->replies, replies_expected, host,
-             op->target, op->action, devices, id);
-    result = calloc(1, sizeof(st_query_result_t));
-    result->host = strdup(host);
-    result->devices = devices;
-    result->custom_action_timeouts = g_hash_table_new_full(crm_str_hash, g_str_equal, free, NULL);
-    result->delay_maxes = g_hash_table_new_full(crm_str_hash, g_str_equal, free, NULL);
-    result->verified_devices = g_hash_table_new_full(crm_str_hash, g_str_equal, free, NULL);
-
-    for (child = __xml_first_child(dev); child != NULL; child = __xml_next(child)) {
-        const char *device = ID(child);
-        int action_timeout = 0;
-        int delay_max = 0;
-        int verified = 0;
-        int required = 0;
-
-        if (device) {
-            result->device_list = g_list_prepend(result->device_list, strdup(device));
-            crm_element_value_int(child, F_STONITH_ACTION_TIMEOUT, &action_timeout);
-            crm_element_value_int(child, F_STONITH_DELAY_MAX, &delay_max);
-            crm_element_value_int(child, F_STONITH_DEVICE_VERIFIED, &verified);
-            crm_element_value_int(child, F_STONITH_DEVICE_REQUIRED, &required);
-            if (action_timeout) {
-                crm_trace("Peer %s with device %s returned action timeout %d",
-                          result->host, device, action_timeout);
-                g_hash_table_insert(result->custom_action_timeouts,
-                                    strdup(device), GINT_TO_POINTER(action_timeout));
-            }
-            if (delay_max > 0) {
-                crm_trace("Peer %s with device %s returned maximum of random delay %d",
-                          result->host, device, delay_max);
-                g_hash_table_insert(result->delay_maxes,
-                                    strdup(device), GINT_TO_POINTER(delay_max));
-            }
-            if (verified) {
-                crm_trace("Peer %s has confirmed a verified device %s", result->host, device);
-                g_hash_table_insert(result->verified_devices,
-                                    strdup(device), GINT_TO_POINTER(verified));
-            }
-            if (required) {
-                crm_trace("Peer %s requires device %s to execute for action %s",
-                          result->host, device, op->action);
-                /* This matters when executing a topology. Required devices will get 
-                 * executed regardless of their topology level. We use this for unfencing. */
-                add_required_device(op, device);
-            }
-        }
+             op->target, op->action, ndevices, id);
+    if (ndevices > 0) {
+        result = add_result(op, host, ndevices, dev);
     }
 
-    CRM_CHECK(devices == g_list_length(result->device_list),
-              crm_err("Mis-match: Query claimed to have %d devices but %d found", devices,
-                      g_list_length(result->device_list)));
-
-    op->query_results = g_list_insert_sorted(op->query_results, result, sort_peers);
-
     if (is_set(op->call_options, st_opt_topology)) {
         /* If we start the fencing before all the topology results are in,
          * it is possible fencing levels will be skipped because of the missing
@@ -1368,11 +1814,13 @@ process_remote_stonith_query(xmlNode * msg)
         }
 
     } else if (op->state == st_query) {
+        int nverified = count_peer_devices(op, result, TRUE);
+
         /* We have a result for a non-topology fencing op that looks promising,
          * go ahead and start fencing before query timeout */
-        if (host_is_target == FALSE && g_hash_table_size(result->verified_devices)) {
+        if (result && (host_is_target == FALSE) && nverified) {
             /* we have a verified device living on a peer that is not the target */
-            crm_trace("Found %d verified devices", g_hash_table_size(result->verified_devices));
+            crm_trace("Found %d verified devices", nverified);
             call_remote_stonith(op, result);
 
         } else if (have_all_replies) {
@@ -1384,14 +1832,25 @@ process_remote_stonith_query(xmlNode * msg)
             crm_trace("Waiting for more peer results before launching fencing operation");
         }
 
-    } else if (op->state == st_done) {
+    } else if (result && (op->state == st_done)) {
         crm_info("Discarding query result from %s (%d devices): Operation is in state %d",
-                 result->host, result->devices, op->state);
+                 result->host, result->ndevices, op->state);
     }
 
     return pcmk_ok;
 }
 
+/*!
+ * \internal
+ * \brief Handle a peer's reply to a fencing request
+ *
+ * Parse a fencing reply from XML, and either finalize the operation
+ * or attempt another device as appropriate.
+ *
+ * \param[in] msg  XML reply received
+ *
+ * \return pcmk_ok on success, -errno on error
+ */
 int
 process_remote_stonith_exec(xmlNode * msg)
 {
@@ -1472,26 +1931,20 @@ process_remote_stonith_exec(xmlNode * msg)
             return rc;
         }
 
-        /* An operation completed succesfully but has not yet been marked as done.
-         * Continue the topology if more devices exist at the current level, otherwise
-         * mark as done. */
+        if ((op->phase == 2) && (rc != pcmk_ok)) {
+            /* A remapped "on" failed, but the node was already turned off
+             * successfully, so ignore the error and continue.
+             */
+            crm_warn("Ignoring %s 'on' failure (exit code %d) for %s after successful 'off'",
+                     device, rc, op->target);
+            rc = pcmk_ok;
+        }
+
         if (rc == pcmk_ok) {
-            GListPtr required_match = g_list_find_custom(op->required_list, device, sort_strings);
-            if (op->devices) {
-                /* Success, are there any more? */
-                op->devices = op->devices->next;
-            }
-            if (required_match) {
-                op->required_list = g_list_remove(op->required_list, required_match->data);
-            }
-            /* if no more devices at this fencing level, we are done,
-             * else we need to contine with executing the next device in the list */
-            if (op->devices == NULL) {
-                crm_trace("Marking complex fencing op for %s as complete", op->target);
-                op->state = st_done;
-                remote_op_done(op, msg, rc, FALSE);
-                return rc;
-            }
+            /* An operation completed successfully. Try another device if
+             * necessary, otherwise mark the operation as done. */
+            advance_op_topology(op, device, msg, rc);
+            return rc;
         } else {
             /* This device failed, time to try another topology level. If no other
              * levels are available, mark this operation as failed and report results. */
@@ -1516,7 +1969,7 @@ process_remote_stonith_exec(xmlNode * msg)
         /* fall-through and attempt other fencing action using another peer */
     }
 
-    /* Retry on failure or execute the rest of the topology */
+    /* Retry on failure */
     crm_trace("Next for %s on behalf of %s@%s (rc was %d)", op->target, op->originator,
               op->client_name, rc);
     call_remote_stonith(op, NULL);
@@ -1595,6 +2048,9 @@ stonith_check_fence_tolerance(int tolerance, const char *target, const char *act
             continue;
         } else if (rop->state != st_done) {
             continue;
+        /* We don't have to worry about remapped reboots here
+         * because if state is done, any remapping has been undone
+         */
         } else if (strcmp(rop->action, action) != 0) {
             continue;
         } else if ((rop->completed + tolerance) < now) {
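
For readers following the remapped-reboot handling above: the op->phase == 2 branch simply downgrades a failed "on" to success once the "off" half of a remapped reboot has already completed. A minimal standalone sketch of that decision, using hypothetical names (fence_phase, PHASE_ON) in place of the internal fields:

    #include <stdio.h>

    /* Hypothetical stand-ins for the internal remapped-reboot phases */
    enum fence_phase { PHASE_REQUESTED = 0, PHASE_OFF = 1, PHASE_ON = 2 };

    /* A failed "on" during the second phase of a remapped reboot is
     * tolerated, because the target was already powered off successfully;
     * this mirrors the rc = pcmk_ok downgrade in the hunk above. */
    static int effective_rc(enum fence_phase phase, int rc)
    {
        if ((phase == PHASE_ON) && (rc != 0)) {
            fprintf(stderr, "Ignoring 'on' failure (rc=%d) after successful 'off'\n", rc);
            return 0;
        }
        return rc;
    }

    int main(void)
    {
        printf("%d\n", effective_rc(PHASE_ON, 1));  /* 0: tolerated */
        printf("%d\n", effective_rc(PHASE_OFF, 1)); /* 1: real failure */
        return 0;
    }
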
diff --git a/include/crm/fencing/internal.h b/include/crm/fencing/internal.h
index a6f58b1..a59151b 100644
--- a/include/crm/fencing/internal.h
+++ b/include/crm/fencing/internal.h
@@ -63,6 +63,8 @@ xmlNode *create_device_registration_xml(const char *id, const char *namespace, c
 #  define F_STONITH_TOLERANCE     "st_tolerance"
 /*! Action specific timeout period returned in query of fencing devices. */
 #  define F_STONITH_ACTION_TIMEOUT       "st_action_timeout"
+/*! Host in query result is not allowed to run this action */
+#  define F_STONITH_ACTION_DISALLOWED     "st_action_disallowed"
 /*! Maximum of random fencing delay for a device */
 #  define F_STONITH_DELAY_MAX            "st_delay_max"
 /*! Has this device been verified using a monitor type
diff --git a/include/crm/lrmd.h b/include/crm/lrmd.h
index e3a0d63..730cad3 100644
--- a/include/crm/lrmd.h
+++ b/include/crm/lrmd.h
@@ -200,8 +200,6 @@ typedef struct lrmd_event_data_s {
     enum ocf_exitcode rc;
     /*! The lrmd status returned for exec_complete events */
     int op_status;
-    /*! exit failure reason string from resource agent operation */
-    const char *exit_reason;
     /*! stdout from resource agent operation */
     const char *output;
     /*! Timestamp of when op ran */
@@ -226,6 +224,9 @@ typedef struct lrmd_event_data_s {
      * to the proper client. */
     const char *remote_nodename;
 
+    /*! exit failure reason string from resource agent operation */
+    const char *exit_reason;
+
 } lrmd_event_data_t;
 
 lrmd_event_data_t *lrmd_copy_event(lrmd_event_data_t * event);
diff --git a/include/crm/pengine/status.h b/include/crm/pengine/status.h
index 4bfa3fe..4214959 100644
--- a/include/crm/pengine/status.h
+++ b/include/crm/pengine/status.h
@@ -137,10 +137,6 @@ struct node_shared_s {
     gboolean shutdown;
     gboolean expected_up;
     gboolean is_dc;
-    gboolean rsc_discovery_enabled;
-
-    gboolean remote_requires_reset;
-    gboolean remote_was_fenced;
 
     int num_resources;
     GListPtr running_rsc;       /* resource_t* */
@@ -157,14 +153,17 @@ struct node_shared_s {
     GHashTable *digest_cache;
 
     gboolean maintenance;
+    gboolean rsc_discovery_enabled;
+    gboolean remote_requires_reset;
+    gboolean remote_was_fenced;
 };
 
 struct node_s {
     int weight;
     gboolean fixed;
-    int rsc_discover_mode;
     int count;
     struct node_shared_s *details;
+    int rsc_discover_mode;
 };
 
 #  include <crm/pengine/complex.h>
@@ -262,7 +261,6 @@ struct resource_s {
     int migration_threshold;
 
     gboolean is_remote_node;
-    gboolean exclusive_discover;
 
     unsigned long long flags;
 
@@ -296,6 +294,7 @@ struct resource_s {
     char *pending_task;
 
     const char *isolation_wrapper;
+    gboolean exclusive_discover;
 };
 
 struct pe_action_s {
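
The struct changes above (lrmd_event_data_t, node_shared_s, node_s, resource_s) move or add members at the end of each struct rather than in the middle. The usual reason, assuming these headers are consumed by separately built binaries, is that appending preserves the offsets of existing members and therefore the library ABI, while inserting would shift them. A small illustration with made-up struct names:

    #include <stddef.h>
    #include <stdio.h>

    /* Layout an old consumer was compiled against */
    struct event_v1 {
        int rc;
        const char *output;
    };

    /* Appending keeps rc and output at the same offsets as before */
    struct event_v2 {
        int rc;
        const char *output;
        const char *exit_reason; /* new member goes at the end */
    };

    int main(void)
    {
        printf("output offset: v1=%zu v2=%zu\n",
               offsetof(struct event_v1, output),
               offsetof(struct event_v2, output));
        return 0;
    }
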
diff --git a/lib/cib/cib_ops.c b/lib/cib/cib_ops.c
index 5f73559..8966ae2 100644
--- a/lib/cib/cib_ops.c
+++ b/lib/cib/cib_ops.c
@@ -373,7 +373,10 @@ cib_process_modify(const char *op, int options, const char *section, xmlNode * r
 
         for (lpc = 0; lpc < max; lpc++) {
             xmlNode *match = getXpathResult(xpathObj, lpc);
-            crm_debug("Destroying %s", (char *)xmlGetNodePath(match));
+            xmlChar *match_path = xmlGetNodePath(match);
+
+            crm_debug("Destroying %s", match_path);
+            free(match_path);
             free_xml(match);
         }
 
diff --git a/lib/cib/cib_utils.c b/lib/cib/cib_utils.c
index 28b8e81..d321517 100644
--- a/lib/cib/cib_utils.c
+++ b/lib/cib/cib_utils.c
@@ -533,7 +533,7 @@ cib_perform_op(const char *op, int call_options, cib_op_t * fn, gboolean is_quer
             int current_schema = get_schema_version(schema);
 
             if (minimum_schema == 0) {
-                minimum_schema = get_schema_version("pacemaker-1.1");
+                minimum_schema = get_schema_version("pacemaker-1.2");
             }
 
             /* Does the CIB support the "update-*" attributes... */
diff --git a/lib/cluster/membership.c b/lib/cluster/membership.c
index 28f41cb..b7958eb 100644
--- a/lib/cluster/membership.c
+++ b/lib/cluster/membership.c
@@ -734,6 +734,14 @@ crm_update_peer_proc(const char *source, crm_node_t * node, uint32_t flag, const
         if (crm_status_callback) {
             crm_status_callback(crm_status_processes, node, &last);
         }
+
+        /* The client callback shouldn't touch the peer caches,
+         * but as a safety net, bail if the peer cache was destroyed.
+         */
+        if (crm_peer_cache == NULL) {
+            return NULL;
+        }
+
         if (crm_autoreap) {
             node = crm_update_peer_state(__FUNCTION__, node,
                                          is_set(node->processes, crm_get_cluster_proc())?
diff --git a/lib/common/Makefile.am b/lib/common/Makefile.am
index f5c0766..a593f40 100644
--- a/lib/common/Makefile.am
+++ b/lib/common/Makefile.am
@@ -37,7 +37,7 @@ if BUILD_CIBSECRETS
 libcrmcommon_la_SOURCES	+= cib_secrets.c
 endif
 
-libcrmcommon_la_LDFLAGS	= -version-info 8:0:5
+libcrmcommon_la_LDFLAGS	= -version-info 7:0:4
 libcrmcommon_la_LIBADD  = @LIBADD_DL@ $(GNUTLSLIBS)
 libcrmcommon_la_SOURCES += $(top_builddir)/lib/gnu/md5.c
 
diff --git a/lib/common/xml.c b/lib/common/xml.c
index e272049..8eed245 100644
--- a/lib/common/xml.c
+++ b/lib/common/xml.c
@@ -3430,12 +3430,18 @@ dump_xml_attr(xmlAttrPtr attr, int options, char **buffer, int *offset, int *max
 {
     char *p_value = NULL;
     const char *p_name = NULL;
+    xml_private_t *p = NULL;
 
     CRM_ASSERT(buffer != NULL);
     if (attr == NULL || attr->children == NULL) {
         return;
     }
 
+    p = attr->_private;
+    if (p && is_set(p->flags, xpf_deleted)) {
+        return;
+    }
+
     p_name = (const char *)attr->name;
     p_value = crm_xml_escape((const char *)attr->children->content);
     buffer_print(*buffer, *max, *offset, " %s=\"%s\"", p_name, p_value);
@@ -3812,6 +3818,10 @@ dump_xml_comment(xmlNode * data, int options, char **buffer, int *offset, int *m
 void
 crm_xml_dump(xmlNode * data, int options, char **buffer, int *offset, int *max, int depth)
 {
+    if(data == NULL) {
+        *offset = 0;
+        *max = 0;
+    }
 #if 0
     if (is_not_set(options, xml_log_option_filtered)) {
         /* Turning this code on also changes the PE tests for some reason
@@ -4564,6 +4574,8 @@ subtract_xml_object(xmlNode * parent, xmlNode * left, xmlNode * right,
     /* changes to name/value pairs */
     for (xIter = crm_first_attr(left); xIter != NULL; xIter = xIter->next) {
         const char *prop_name = (const char *)xIter->name;
+        xmlAttrPtr right_attr = NULL;
+        xml_private_t *p = NULL;
 
         if (strcmp(prop_name, XML_ATTR_ID) == 0) {
             continue;
@@ -4582,8 +4594,13 @@ subtract_xml_object(xmlNode * parent, xmlNode * left, xmlNode * right,
             continue;
         }
 
+        right_attr = xmlHasProp(right, (const xmlChar *)prop_name);
+        if (right_attr) {
+            p = right_attr->_private;
+        }
+
         right_val = crm_element_value(right, prop_name);
-        if (right_val == NULL) {
+        if (right_val == NULL || (p && is_set(p->flags, xpf_deleted))) {
             /* new */
             *changed = TRUE;
             if (full) {
diff --git a/lib/fencing/st_client.c b/lib/fencing/st_client.c
index 80f0064..67114c2 100644
--- a/lib/fencing/st_client.c
+++ b/lib/fencing/st_client.c
@@ -1100,57 +1100,62 @@ stonith_api_device_metadata(stonith_t * stonith, int call_options, const char *a
     if (safe_str_eq(provider, "redhat")) {
         stonith_action_t *action = stonith_action_create(agent, "metadata", NULL, 0, 5, NULL, NULL);
         int exec_rc = stonith_action_execute(action, &rc, &buffer);
+        xmlNode *xml = NULL;
+        xmlNode *actions = NULL;
+        xmlXPathObject *xpathObj = NULL;
 
         if (exec_rc < 0 || rc != 0 || buffer == NULL) {
+            crm_warn("Could not obtain metadata for %s", agent);
             crm_debug("Query failed: %d %d: %s", exec_rc, rc, crm_str(buffer));
             free(buffer);       /* Just in case */
             return -EINVAL;
+        }
 
-        } else {
-
-            xmlNode *xml = string2xml(buffer);
-            xmlNode *actions = NULL;
-            xmlXPathObject *xpathObj = NULL;
+        xml = string2xml(buffer);
+        if(xml == NULL) {
+            crm_warn("Metadata for %s is invalid", agent);
+            free(buffer);
+            return -EINVAL;
+        }
 
-            xpathObj = xpath_search(xml, "//actions");
-            if (numXpathResults(xpathObj) > 0) {
-                actions = getXpathResult(xpathObj, 0);
-            }
+        xpathObj = xpath_search(xml, "//actions");
+        if (numXpathResults(xpathObj) > 0) {
+            actions = getXpathResult(xpathObj, 0);
+        }
 
-            freeXpathObject(xpathObj);
+        freeXpathObject(xpathObj);
 
-            /* Now fudge the metadata so that the start/stop actions appear */
-            xpathObj = xpath_search(xml, "//action[@name='stop']");
-            if (numXpathResults(xpathObj) <= 0) {
-                xmlNode *tmp = NULL;
+        /* Now fudge the metadata so that the start/stop actions appear */
+        xpathObj = xpath_search(xml, "//action[@name='stop']");
+        if (numXpathResults(xpathObj) <= 0) {
+            xmlNode *tmp = NULL;
 
-                tmp = create_xml_node(actions, "action");
-                crm_xml_add(tmp, "name", "stop");
-                crm_xml_add(tmp, "timeout", "20s");
+            tmp = create_xml_node(actions, "action");
+            crm_xml_add(tmp, "name", "stop");
+            crm_xml_add(tmp, "timeout", "20s");
 
-                tmp = create_xml_node(actions, "action");
-                crm_xml_add(tmp, "name", "start");
-                crm_xml_add(tmp, "timeout", "20s");
-            }
+            tmp = create_xml_node(actions, "action");
+            crm_xml_add(tmp, "name", "start");
+            crm_xml_add(tmp, "timeout", "20s");
+        }
 
-            freeXpathObject(xpathObj);
+        freeXpathObject(xpathObj);
 
-            /* Now fudge the metadata so that the port isn't required in the configuration */
-            xpathObj = xpath_search(xml, "//parameter[@name='port']");
-            if (numXpathResults(xpathObj) > 0) {
-                /* We'll fill this in */
-                xmlNode *tmp = getXpathResult(xpathObj, 0);
+        /* Now fudge the metadata so that the port isn't required in the configuration */
+        xpathObj = xpath_search(xml, "//parameter[@name='port']");
+        if (numXpathResults(xpathObj) > 0) {
+            /* We'll fill this in */
+            xmlNode *tmp = getXpathResult(xpathObj, 0);
 
-                crm_xml_add(tmp, "required", "0");
-            }
+            crm_xml_add(tmp, "required", "0");
+        }
 
-            freeXpathObject(xpathObj);
-            free(buffer);
-            buffer = dump_xml_formatted(xml);
-            free_xml(xml);
-            if (!buffer) {
-                return -EINVAL;
-            }
+        freeXpathObject(xpathObj);
+        free(buffer);
+        buffer = dump_xml_formatted(xml);
+        free_xml(xml);
+        if (!buffer) {
+            return -EINVAL;
         }
 
     } else {
@@ -1280,7 +1285,10 @@ stonith_api_query(stonith_t * stonith, int call_options, const char *target,
 
             CRM_LOG_ASSERT(match != NULL);
             if(match != NULL) {
-                crm_info("%s[%d] = %s", "//@agent", lpc, xmlGetNodePath(match));
+                xmlChar *match_path = xmlGetNodePath(match);
+
+                crm_info("%s[%d] = %s", "//@agent", lpc, match_path);
+                free(match_path);
                 *devices = stonith_key_value_add(*devices, NULL, crm_element_value(match, XML_ATTR_ID));
             }
         }
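
The xmlGetNodePath() changes in cib_ops.c and st_client.c above address the same leak: libxml2 allocates the returned path string, so the caller must free it rather than passing the call straight into a log macro. A self-contained sketch of the pattern (the patch frees with free(); xmlFree() is the canonical equivalent under the default allocator):

    #include <libxml/parser.h>
    #include <libxml/tree.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        const char *xml = "<cib><status/></cib>";
        xmlDocPtr doc = xmlReadMemory(xml, strlen(xml), NULL, NULL, 0);
        xmlNodePtr node = xmlDocGetRootElement(doc)->children;

        xmlChar *path = xmlGetNodePath(node); /* allocated by libxml2 */
        printf("Destroying %s\n", path);
        xmlFree(path);                        /* caller owns the string */

        xmlFreeDoc(doc);
        xmlCleanupParser();
        return 0;
    }
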
diff --git a/lib/lrmd/Makefile.am b/lib/lrmd/Makefile.am
index e98d1e5..f961ae1 100644
--- a/lib/lrmd/Makefile.am
+++ b/lib/lrmd/Makefile.am
@@ -25,7 +25,7 @@ AM_CPPFLAGS         = -I$(top_builddir)/include  -I$(top_srcdir)/include     \
 lib_LTLIBRARIES = liblrmd.la
 
 liblrmd_la_SOURCES = lrmd_client.c proxy_common.c
-liblrmd_la_LDFLAGS = -version-info 3:0:0
+liblrmd_la_LDFLAGS = -version-info 3:0:2
 liblrmd_la_LIBADD = $(top_builddir)/lib/common/libcrmcommon.la	\
 			$(top_builddir)/lib/services/libcrmservice.la \
 			$(top_builddir)/lib/fencing/libstonithd.la
diff --git a/lib/pengine/Makefile.am b/lib/pengine/Makefile.am
index 29b7206..78da075 100644
--- a/lib/pengine/Makefile.am
+++ b/lib/pengine/Makefile.am
@@ -30,7 +30,7 @@ libpe_rules_la_LDFLAGS	= -version-info 2:4:0
 libpe_rules_la_SOURCES	= rules.c common.c
 libpe_rules_la_LIBADD	= $(top_builddir)/lib/common/libcrmcommon.la
 
-libpe_status_la_LDFLAGS	= -version-info 8:0:0
+libpe_status_la_LDFLAGS	= -version-info 8:0:4
 libpe_status_la_SOURCES	=  status.c unpack.c utils.c complex.c native.c group.c clone.c rules.c common.c
 libpe_status_la_LIBADD	=  @CURSESLIBS@ $(top_builddir)/lib/common/libcrmcommon.la
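
The -version-info bumps in these Makefiles use libtool's current:revision:age notation rather than a file name. With the usual GNU/Linux mapping, the installed object is lib<name>.so.(current-age).(age).(revision) and the library advertises interfaces current-age through current; a quick way to sanity-check the values appearing in this patch:

    #include <stdio.h>

    /* Print how a libtool -version-info triple maps to the installed
     * shared object name and the supported interface range (GNU/Linux). */
    static void explain(const char *lib, int current, int revision, int age)
    {
        printf("%s -version-info %d:%d:%d -> .so.%d.%d.%d, interfaces %d..%d\n",
               lib, current, revision, age,
               current - age, age, revision, current - age, current);
    }

    int main(void)
    {
        explain("libcrmcommon", 7, 0, 4);
        explain("liblrmd",      3, 0, 2);
        explain("libpe_status", 8, 0, 4);
        return 0;
    }
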
 
diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c
index 73c44a8..106c674 100644
--- a/lib/pengine/unpack.c
+++ b/lib/pengine/unpack.c
@@ -2834,8 +2834,9 @@ static bool check_operation_expiry(resource_t *rsc, node_t *node, int rc, xmlNod
 
             node_t *remote_node = pe_find_node(data_set->nodes, rsc->id);
             if (remote_node && remote_node->details->remote_was_fenced == 0) {
-            
-                crm_info("Waiting to clear monitor failure for remote node %s until fencing has occured", rsc->id); 
+                if (strstr(ID(xml_op), "last_failure")) {
+                    crm_info("Waiting to clear monitor failure for remote node %s until fencing has occurred", rsc->id);
+                }
                 /* disabling failure timeout for this operation because we believe
                  * fencing of the remote node should occur first. */ 
                 failure_timeout = 0;
@@ -2866,6 +2867,9 @@ static bool check_operation_expiry(resource_t *rsc, node_t *node, int rc, xmlNod
                 } else {
                     expired = FALSE;
                 }
+            } else if (rsc->remote_reconnect_interval && strstr(ID(xml_op), "last_failure")) {
+                /* always clear last failure when reconnect interval is set */
+                clear_failcount = 1;
             }
         }
 
diff --git a/lib/services/pcmk-dbus.h b/lib/services/pcmk-dbus.h
index afb8a2a..b9a713b 100644
--- a/lib/services/pcmk-dbus.h
+++ b/lib/services/pcmk-dbus.h
@@ -1,3 +1,7 @@
+#ifndef DBUS_TIMEOUT_USE_DEFAULT
+#  define DBUS_TIMEOUT_USE_DEFAULT -1
+#endif
+
 DBusConnection *pcmk_dbus_connect(void);
 void pcmk_dbus_connection_setup_with_select(DBusConnection *c);
 void pcmk_dbus_disconnect(DBusConnection *connection);
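
The guard added to pcmk-dbus.h is a plain compatibility define: DBUS_TIMEOUT_USE_DEFAULT is only provided by newer libdbus headers (as -1), so supplying a fallback keeps the code building against older ones. The same pattern in isolation:

    /* Older dbus headers lack this constant; newer dbus-shared.h defines
     * it as (-1), meaning "use the connection's default timeout". */
    #ifndef DBUS_TIMEOUT_USE_DEFAULT
    #  define DBUS_TIMEOUT_USE_DEFAULT -1
    #endif

    #include <stdio.h>

    int main(void)
    {
        printf("timeout sentinel = %d\n", DBUS_TIMEOUT_USE_DEFAULT);
        return 0;
    }
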
diff --git a/lrmd/lrmd.c b/lrmd/lrmd.c
index bd4d33e..0cf98cc 100644
--- a/lrmd/lrmd.c
+++ b/lrmd/lrmd.c
@@ -219,6 +219,7 @@ free_lrmd_cmd(lrmd_cmd_t * cmd)
     }
     free(cmd->origin);
     free(cmd->action);
+    free(cmd->real_action);
     free(cmd->userdata_str);
     free(cmd->rsc_id);
     free(cmd->output);
diff --git a/pacemaker.spec.in b/pacemaker.spec.in
index 0e3200f..2dfb4a6 100644
--- a/pacemaker.spec.in
+++ b/pacemaker.spec.in
@@ -54,7 +54,7 @@
 
 Name:          pacemaker
 Summary:       Scalable High-Availability cluster resource manager
-Version:       1.1.11
+Version:       1.1.13
 Release:       %{pcmk_release}%{?dist}
 License:       GPLv2+ and LGPLv2+
 Url:           http://www.clusterlabs.org
diff --git a/pengine/Makefile.am b/pengine/Makefile.am
index d14d911..31532cf 100644
--- a/pengine/Makefile.am
+++ b/pengine/Makefile.am
@@ -61,7 +61,7 @@ endif
 noinst_HEADERS	= allocate.h utils.h pengine.h
 #utils.h pengine.h
 
-libpengine_la_LDFLAGS	= -version-info 8:0:0
+libpengine_la_LDFLAGS	= -version-info 8:0:4
 # -L$(top_builddir)/lib/pils -lpils -export-dynamic -module -avoid-version
 libpengine_la_SOURCES	= pengine.c allocate.c utils.c constraints.c
 libpengine_la_SOURCES  += native.c group.c clone.c master.c graph.c utilization.c
diff --git a/pengine/allocate.c b/pengine/allocate.c
index 4b6fca1..68cafd4 100644
--- a/pengine/allocate.c
+++ b/pengine/allocate.c
@@ -1681,10 +1681,38 @@ apply_remote_node_ordering(pe_working_set_t *data_set)
         resource_t *remote_rsc = NULL;
         resource_t *container = NULL;
 
+        if (action->rsc == NULL) {
+            continue;
+        }
+
+        /* Special case. */
+        if (action->rsc &&
+            action->rsc->is_remote_node &&
+            safe_str_eq(action->task, CRM_OP_CLEAR_FAILCOUNT)) {
+
+            /* If we are clearing the fail count of a remote node connection
+             * resource, ensure this happens before any start of that
+             * connection scheduled in this transition.
+             */
+            custom_action_order(action->rsc,
+                NULL,
+                action,
+                action->rsc,
+                generate_op_key(action->rsc->id, RSC_START, 0),
+                NULL,
+                pe_order_optional,
+                data_set);
+
+                continue;
+        }
+
+        /* Detect whether the action occurs on a remote node; if so, create
+         * ordering constraints that guarantee the action occurs while the
+         * remote node is active (after its start, before its stop).
+         */
         if (action->node == NULL ||
             is_remote_node(action->node) == FALSE ||
             action->node->details->remote_rsc == NULL ||
-            action->rsc == NULL ||
             is_set(action->flags, pe_action_pseudo)) {
             continue;
         }
diff --git a/pengine/regression.sh b/pengine/regression.sh
index d57da17..d184798 100755
--- a/pengine/regression.sh
+++ b/pengine/regression.sh
@@ -566,6 +566,8 @@ do_test colocated-utilization-primitive-2 "Colocated Utilization - Choose the mo
 do_test colocated-utilization-group "Colocated Utilization - Group"
 do_test colocated-utilization-clone "Colocated Utilization - Clone"
 
+do_test utilization-check-allowed-nodes "Only check the capacities of the nodes that can run the resource"
+
 echo ""
 do_test reprobe-target_rc "Ensure correct target_rc for reprobe of inactive resources"
 do_test node-maintenance-1 "cl#5128 - Node maintenance"
diff --git a/pengine/test10/utilization-check-allowed-nodes.dot b/pengine/test10/utilization-check-allowed-nodes.dot
new file mode 100644
index 0000000..d09efbc
--- /dev/null
+++ b/pengine/test10/utilization-check-allowed-nodes.dot
@@ -0,0 +1,19 @@
+digraph "g" {
+"load_stopped_node1 node1" [ style=bold color="green" fontcolor="orange"]
+"load_stopped_node2 node2" [ style=bold color="green" fontcolor="orange"]
+"probe_complete node1" -> "probe_complete" [ style = bold]
+"probe_complete node1" [ style=bold color="green" fontcolor="black"]
+"probe_complete node2" -> "probe_complete" [ style = bold]
+"probe_complete node2" [ style=bold color="green" fontcolor="black"]
+"probe_complete" -> "rsc1_start_0 node2" [ style = bold]
+"probe_complete" [ style=bold color="green" fontcolor="orange"]
+"rsc1_monitor_0 node1" -> "probe_complete node1" [ style = bold]
+"rsc1_monitor_0 node1" [ style=bold color="green" fontcolor="black"]
+"rsc1_monitor_0 node2" -> "probe_complete node2" [ style = bold]
+"rsc1_monitor_0 node2" [ style=bold color="green" fontcolor="black"]
+"rsc1_start_0 node2" [ style=bold color="green" fontcolor="black"]
+"rsc2_monitor_0 node1" -> "probe_complete node1" [ style = bold]
+"rsc2_monitor_0 node1" [ style=bold color="green" fontcolor="black"]
+"rsc2_monitor_0 node2" -> "probe_complete node2" [ style = bold]
+"rsc2_monitor_0 node2" [ style=bold color="green" fontcolor="black"]
+}
diff --git a/pengine/test10/utilization-check-allowed-nodes.exp b/pengine/test10/utilization-check-allowed-nodes.exp
new file mode 100644
index 0000000..134ccb3
--- /dev/null
+++ b/pengine/test10/utilization-check-allowed-nodes.exp
@@ -0,0 +1,112 @@
+<transition_graph cluster-delay="60s" stonith-timeout="60s" failed-stop-offset="INFINITY" failed-start-offset="INFINITY"  transition_id="0">
+  <synapse id="0">
+    <action_set>
+      <rsc_op id="11" operation="start" operation_key="rsc1_start_0" on_node="node2" on_node_uuid="node2">
+        <primitive id="rsc1" class="ocf" provider="pacemaker" type="Dummy"/>
+        <attributes CRM_meta_timeout="20000" />
+      </rsc_op>
+    </action_set>
+    <inputs>
+      <trigger>
+        <pseudo_event id="4" operation="probe_complete" operation_key="probe_complete"/>
+      </trigger>
+    </inputs>
+  </synapse>
+  <synapse id="1">
+    <action_set>
+      <rsc_op id="9" operation="monitor" operation_key="rsc1_monitor_0" on_node="node2" on_node_uuid="node2">
+        <primitive id="rsc1" class="ocf" provider="pacemaker" type="Dummy"/>
+        <attributes CRM_meta_op_target_rc="7" CRM_meta_timeout="20000" />
+      </rsc_op>
+    </action_set>
+    <inputs/>
+  </synapse>
+  <synapse id="2">
+    <action_set>
+      <rsc_op id="6" operation="monitor" operation_key="rsc1_monitor_0" on_node="node1" on_node_uuid="node1">
+        <primitive id="rsc1" class="ocf" provider="pacemaker" type="Dummy"/>
+        <attributes CRM_meta_op_target_rc="7" CRM_meta_timeout="20000" />
+      </rsc_op>
+    </action_set>
+    <inputs/>
+  </synapse>
+  <synapse id="3">
+    <action_set>
+      <rsc_op id="10" operation="monitor" operation_key="rsc2_monitor_0" on_node="node2" on_node_uuid="node2">
+        <primitive id="rsc2" class="ocf" provider="pacemaker" type="Dummy"/>
+        <attributes CRM_meta_op_target_rc="7" CRM_meta_timeout="20000" />
+      </rsc_op>
+    </action_set>
+    <inputs/>
+  </synapse>
+  <synapse id="4">
+    <action_set>
+      <rsc_op id="7" operation="monitor" operation_key="rsc2_monitor_0" on_node="node1" on_node_uuid="node1">
+        <primitive id="rsc2" class="ocf" provider="pacemaker" type="Dummy"/>
+        <attributes CRM_meta_op_target_rc="7" CRM_meta_timeout="20000" />
+      </rsc_op>
+    </action_set>
+    <inputs/>
+  </synapse>
+  <synapse id="5" priority="1000000">
+    <action_set>
+      <rsc_op id="8" operation="probe_complete" operation_key="probe_complete-node2" on_node="node2" on_node_uuid="node2">
+        <attributes CRM_meta_op_no_wait="true" />
+      </rsc_op>
+    </action_set>
+    <inputs>
+      <trigger>
+        <rsc_op id="9" operation="monitor" operation_key="rsc1_monitor_0" on_node="node2" on_node_uuid="node2"/>
+      </trigger>
+      <trigger>
+        <rsc_op id="10" operation="monitor" operation_key="rsc2_monitor_0" on_node="node2" on_node_uuid="node2"/>
+      </trigger>
+    </inputs>
+  </synapse>
+  <synapse id="6" priority="1000000">
+    <action_set>
+      <rsc_op id="5" operation="probe_complete" operation_key="probe_complete-node1" on_node="node1" on_node_uuid="node1">
+        <attributes CRM_meta_op_no_wait="true" />
+      </rsc_op>
+    </action_set>
+    <inputs>
+      <trigger>
+        <rsc_op id="6" operation="monitor" operation_key="rsc1_monitor_0" on_node="node1" on_node_uuid="node1"/>
+      </trigger>
+      <trigger>
+        <rsc_op id="7" operation="monitor" operation_key="rsc2_monitor_0" on_node="node1" on_node_uuid="node1"/>
+      </trigger>
+    </inputs>
+  </synapse>
+  <synapse id="7">
+    <action_set>
+      <pseudo_event id="4" operation="probe_complete" operation_key="probe_complete">
+        <attributes />
+      </pseudo_event>
+    </action_set>
+    <inputs>
+      <trigger>
+        <rsc_op id="5" operation="probe_complete" operation_key="probe_complete-node1" on_node="node1" on_node_uuid="node1"/>
+      </trigger>
+      <trigger>
+        <rsc_op id="8" operation="probe_complete" operation_key="probe_complete-node2" on_node="node2" on_node_uuid="node2"/>
+      </trigger>
+    </inputs>
+  </synapse>
+  <synapse id="8">
+    <action_set>
+      <pseudo_event id="3" operation="load_stopped_node1" operation_key="load_stopped_node1">
+        <attributes />
+      </pseudo_event>
+    </action_set>
+    <inputs/>
+  </synapse>
+  <synapse id="9">
+    <action_set>
+      <pseudo_event id="2" operation="load_stopped_node2" operation_key="load_stopped_node2">
+        <attributes />
+      </pseudo_event>
+    </action_set>
+    <inputs/>
+  </synapse>
+</transition_graph>
diff --git a/pengine/test10/utilization-check-allowed-nodes.scores b/pengine/test10/utilization-check-allowed-nodes.scores
new file mode 100644
index 0000000..26887e2
--- /dev/null
+++ b/pengine/test10/utilization-check-allowed-nodes.scores
@@ -0,0 +1,5 @@
+Allocation scores:
+native_color: rsc1 allocation score on node1: -INFINITY
+native_color: rsc1 allocation score on node2: 0
+native_color: rsc2 allocation score on node1: -INFINITY
+native_color: rsc2 allocation score on node2: 0
diff --git a/pengine/test10/utilization-check-allowed-nodes.summary b/pengine/test10/utilization-check-allowed-nodes.summary
new file mode 100644
index 0000000..12bf19a
--- /dev/null
+++ b/pengine/test10/utilization-check-allowed-nodes.summary
@@ -0,0 +1,26 @@
+
+Current cluster status:
+Online: [ node1 node2 ]
+
+ rsc1	(ocf::pacemaker:Dummy):	Stopped
+ rsc2	(ocf::pacemaker:Dummy):	Stopped
+
+Transition Summary:
+ * Start   rsc1	(node2)
+
+Executing cluster transition:
+ * Resource action: rsc1            monitor on node2
+ * Resource action: rsc1            monitor on node1
+ * Resource action: rsc2            monitor on node2
+ * Resource action: rsc2            monitor on node1
+ * Pseudo action:   probe_complete
+ * Pseudo action:   load_stopped_node1
+ * Pseudo action:   load_stopped_node2
+ * Resource action: rsc1            start on node2
+
+Revised cluster status:
+Online: [ node1 node2 ]
+
+ rsc1	(ocf::pacemaker:Dummy):	Started node2
+ rsc2	(ocf::pacemaker:Dummy):	Stopped
+
diff --git a/pengine/test10/utilization-check-allowed-nodes.xml b/pengine/test10/utilization-check-allowed-nodes.xml
new file mode 100644
index 0000000..39cf51f
--- /dev/null
+++ b/pengine/test10/utilization-check-allowed-nodes.xml
@@ -0,0 +1,39 @@
+<cib epoch="1" num_updates="36" admin_epoch="0" validate-with="pacemaker-1.2" cib-last-written="Fri Dec  7 15:42:31 2012" have-quorum="1">
+  <configuration>
+    <crm_config>
+      <cluster_property_set id="cib-bootstrap-options">
+        <nvpair id="cib-bootstrap-options-stonith-enabled" name="stonith-enabled" value="false"/>
+        <nvpair id="cib-bootstrap-options-no-quorum-policy" name="no-quorum-policy" value="ignore"/>
+        <nvpair id="cib-bootstrap-options-placement-strategy" name="placement-strategy" value="utilization"/>
+      </cluster_property_set>
+    </crm_config>
+    <nodes>
+      <node id="node1" uname="node1">
+        <utilization id="node1-utlization">
+          <nvpair id="node1-utlization-cpu" name="cpu" value="4"/>
+        </utilization>
+      </node>
+      <node id="node2" uname="node2">
+        <utilization id="node2-utlization">
+          <nvpair id="node2-utlization-cpu" name="cpu" value="2"/>
+        </utilization>
+      </node>
+    </nodes>
+    <resources>
+      <primitive id="rsc1" class="ocf" provider="pacemaker" type="Dummy"/>
+      <primitive id="rsc2" class="ocf" provider="pacemaker" type="Dummy">
+        <utilization id="rsc2-utlization">
+          <nvpair id="rsc2-utlization-cpu" name="cpu" value="4"/>
+        </utilization>
+      </primitive>
+    </resources>
+    <constraints>
+      <rsc_location id="rsc1-location" rsc="rsc1" node="node1" score="-INFINITY"/>
+      <rsc_colocation id="rsc2-with-rsc1" rsc="rsc2" with-rsc="rsc1" score="INFINITY"/>
+    </constraints>
+  </configuration>
+  <status>
+    <node_state id="node1" uname="node1" in_ccm="true" crmd="online" join="member" expected="member" crm-debug-origin="crm_simulate"/>
+    <node_state id="node2" uname="node2" in_ccm="true" crmd="online" join="member" expected="member" crm-debug-origin="crm_simulate"/>
+  </status>
+</cib>
diff --git a/pengine/utilization.c b/pengine/utilization.c
index 982fcc9..db41b21 100644
--- a/pengine/utilization.c
+++ b/pengine/utilization.c
@@ -344,9 +344,10 @@ process_utilization(resource_t * rsc, node_t ** prefer, pe_working_set_t * data_
     int alloc_details = scores_log_level + 1;
 
     if (safe_str_neq(data_set->placement_strategy, "default")) {
-        GListPtr gIter = NULL;
+        GHashTableIter iter;
         GListPtr colocated_rscs = NULL;
         gboolean any_capable = FALSE;
+        node_t *node = NULL;
 
         colocated_rscs = find_colocated_rscs(colocated_rscs, rsc, rsc);
         if (colocated_rscs) {
@@ -356,8 +357,11 @@ process_utilization(resource_t * rsc, node_t ** prefer, pe_working_set_t * data_
 
             unallocated_utilization = sum_unallocated_utilization(rsc, colocated_rscs);
 
-            for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) {
-                node_t *node = (node_t *) gIter->data;
+            g_hash_table_iter_init(&iter, rsc->allowed_nodes);
+            while (g_hash_table_iter_next(&iter, NULL, (void **)&node)) {
+                if (can_run_resources(node) == FALSE || node->weight < 0) {
+                    continue;
+                }
 
                 if (have_enough_capacity(node, rscs_id, unallocated_utilization)) {
                     any_capable = TRUE;
@@ -371,8 +375,11 @@ process_utilization(resource_t * rsc, node_t ** prefer, pe_working_set_t * data_
             }
 
             if (any_capable) {
-                for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) {
-                    node_t *node = (node_t *) gIter->data;
+                g_hash_table_iter_init(&iter, rsc->allowed_nodes);
+                while (g_hash_table_iter_next(&iter, NULL, (void **)&node)) {
+                    if (can_run_resources(node) == FALSE || node->weight < 0) {
+                        continue;
+                    }
 
                     if (have_enough_capacity(node, rscs_id, unallocated_utilization) == FALSE) {
                         pe_rsc_debug(rsc, "Resource %s and its colocated resources cannot be allocated to node %s: no enough capacity",
@@ -394,8 +401,11 @@ process_utilization(resource_t * rsc, node_t ** prefer, pe_working_set_t * data_
         }
 
         if (any_capable == FALSE) {
-            for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) {
-                node_t *node = (node_t *) gIter->data;
+            g_hash_table_iter_init(&iter, rsc->allowed_nodes);
+            while (g_hash_table_iter_next(&iter, NULL, (void **)&node)) {
+                if (can_run_resources(node) == FALSE || node->weight < 0) {
+                    continue;
+                }
 
                 if (have_enough_capacity(node, rsc->id, rsc->utilization) == FALSE) {
                     pe_rsc_debug(rsc, "Resource %s cannot be allocated to node %s: no enough capacity",
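
The utilization.c hunks above replace a walk over every cluster node with an iteration over the resource's allowed_nodes hash table, skipping nodes that cannot run resources or carry a negative weight. Outside the pengine data structures, the underlying GLib pattern looks like this (node names and weights are invented for the example):

    #include <glib.h>
    #include <stdio.h>

    int main(void)
    {
        GHashTable *allowed = g_hash_table_new(g_str_hash, g_str_equal);
        GHashTableIter iter;
        gpointer key, value;

        g_hash_table_insert(allowed, "node1", GINT_TO_POINTER(-1)); /* barred */
        g_hash_table_insert(allowed, "node2", GINT_TO_POINTER(10));

        g_hash_table_iter_init(&iter, allowed);
        while (g_hash_table_iter_next(&iter, &key, &value)) {
            int weight = GPOINTER_TO_INT(value);

            if (weight < 0) {
                continue; /* analogous to skipping nodes that cannot host the resource */
            }
            printf("checking capacity on %s (weight %d)\n", (char *) key, weight);
        }

        g_hash_table_destroy(allowed);
        return 0;
    }
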
diff --git a/tools/fake_transition.c b/tools/fake_transition.c
index e8c37f7..fe5de95 100644
--- a/tools/fake_transition.c
+++ b/tools/fake_transition.c
@@ -65,11 +65,14 @@ inject_transient_attr(xmlNode * cib_node, const char *name, const char *value)
     xmlNode *attrs = NULL;
     xmlNode *container = NULL;
     xmlNode *nvp = NULL;
+    xmlChar *node_path;
     const char *node_uuid = ID(cib_node);
     char *nvp_id = crm_concat(name, node_uuid, '-');
 
-    quiet_log("Injecting attribute %s=%s into %s '%s'", name, value, xmlGetNodePath(cib_node),
+    node_path = xmlGetNodePath(cib_node);
+    quiet_log("Injecting attribute %s=%s into %s '%s'", name, value, node_path,
              ID(cib_node));
+    free(node_path);
 
     attrs = first_named_child(cib_node, XML_TAG_TRANSIENT_NODEATTRS);
     if (attrs == NULL) {
diff --git a/valgrind-pcmk.suppressions b/valgrind-pcmk.suppressions
index e7caa55..2e382df 100644
--- a/valgrind-pcmk.suppressions
+++ b/valgrind-pcmk.suppressions
@@ -20,6 +20,15 @@
 }
 
 {
+   Another bash leak
+   Memcheck:Leak
+   fun:malloc
+   fun:xmalloc
+   fun:set_default_locale
+   fun:main
+}
+
+{
    Ignore option parsing
    Memcheck:Leak
    fun:realloc
@@ -294,4 +303,4 @@
    obj:*/libgobject-*
    fun:call_init.part.0
    fun:_dl_init
-}
\ No newline at end of file
+}
diff --git a/version.m4 b/version.m4
index 22faf65..3d5e96b 100644
--- a/version.m4
+++ b/version.m4
@@ -1 +1 @@
-m4_define([VERSION_NUMBER], [1.1.12])
+m4_define([VERSION_NUMBER], [1.1.13])