|
|
b3fe293 |
diff --git a/src/condor_daemon_client/dc_startd.cpp b/src/condor_daemon_client/dc_startd.cpp
|
|
|
b3fe293 |
index 7261c4a..09a2689 100644
|
|
|
b3fe293 |
--- a/src/condor_daemon_client/dc_startd.cpp
|
|
|
b3fe293 |
+++ b/src/condor_daemon_client/dc_startd.cpp
|
|
|
b3fe293 |
@@ -51,7 +51,7 @@ DCStartd::DCStartd( const char* tName, const char* tPool, const char* tAddr,
|
|
|
b3fe293 |
}
|
|
|
b3fe293 |
}
|
|
|
b3fe293 |
|
|
|
b3fe293 |
-DCStartd::DCStartd( ClassAd *ad, const char *tPool )
|
|
|
b3fe293 |
+DCStartd::DCStartd( const ClassAd *ad, const char *tPool )
|
|
|
b3fe293 |
: Daemon(ad,DT_STARTD,tPool),
|
|
|
b3fe293 |
claim_id(NULL)
|
|
|
b3fe293 |
{
|
|
|
b3fe293 |
diff --git a/src/condor_daemon_client/dc_startd.h b/src/condor_daemon_client/dc_startd.h
|
|
|
b3fe293 |
index c5f3e89..ff20892 100644
|
|
|
b3fe293 |
--- a/src/condor_daemon_client/dc_startd.h
|
|
|
b3fe293 |
+++ b/src/condor_daemon_client/dc_startd.h
|
|
|
b3fe293 |
@@ -49,7 +49,7 @@ public:
|
|
|
b3fe293 |
DCStartd( const char* const name, const char* const pool,
|
|
|
b3fe293 |
const char* const addr, const char* const id );
|
|
|
b3fe293 |
|
|
|
b3fe293 |
- DCStartd( ClassAd *ad, const char *pool = NULL );
|
|
|
b3fe293 |
+ DCStartd( const ClassAd *ad, const char *pool = NULL );
|
|
|
b3fe293 |
|
|
|
b3fe293 |
/// Destructor.
|
|
|
b3fe293 |
~DCStartd();
|
|
|
b3fe293 |
diff --git a/src/defrag/defrag.cpp b/src/defrag/defrag.cpp
|
|
|
b3fe293 |
index 26aec0a..8710b5d 100644
|
|
|
b3fe293 |
--- a/src/defrag/defrag.cpp
|
|
|
b3fe293 |
+++ b/src/defrag/defrag.cpp
|
|
|
b3fe293 |
@@ -185,6 +185,8 @@ void Defrag::config()
|
|
|
b3fe293 |
}
|
|
|
b3fe293 |
}
|
|
|
b3fe293 |
|
|
|
b3fe293 |
+ m_can_cancel = param_boolean("DEFRAG_CAN_CANCEL", true);
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
param(m_defrag_name,"DEFRAG_NAME");
|
|
|
b3fe293 |
|
|
|
b3fe293 |
int stats_quantum = m_polling_interval;
|
|
|
b3fe293 |
@@ -487,8 +489,17 @@ void Defrag::poll()
|
|
|
b3fe293 |
int num_whole_machines = countMachines(m_whole_machine_expr.c_str(),"DEFRAG_WHOLE_MACHINE_EXPR",&whole_machines);
|
|
|
b3fe293 |
m_stats.WholeMachines = num_whole_machines;
|
|
|
b3fe293 |
|
|
|
b3fe293 |
+ MachineSet draining_whole_machines;
|
|
|
b3fe293 |
+ std::stringstream draining_whole_machines_ss;
|
|
|
b3fe293 |
+ draining_whole_machines_ss << m_whole_machine_expr << " && Draining && Offline=!=True";
|
|
|
b3fe293 |
+ int num_draining_whole_machines = countMachines(draining_whole_machines_ss.str().c_str(),
|
|
|
b3fe293 |
+ "<DEFRAG_WHOLE_MACHINE_EXPR Draining>", &draining_whole_machines);
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
dprintf(D_ALWAYS,"There are currently %d draining and %d whole machines.\n",
|
|
|
b3fe293 |
num_draining,num_whole_machines);
|
|
|
b3fe293 |
+ if (num_draining_whole_machines)
|
|
|
b3fe293 |
+ dprintf(D_ALWAYS, "Of the %d whole machines, %d are in the draining state.\n",
|
|
|
b3fe293 |
+ num_whole_machines, num_draining_whole_machines);
|
|
|
b3fe293 |
|
|
|
b3fe293 |
queryDrainingCost();
|
|
|
b3fe293 |
|
|
|
b3fe293 |
@@ -548,8 +559,7 @@ void Defrag::poll()
|
|
|
b3fe293 |
|
|
|
b3fe293 |
ClassAdList startdAds;
|
|
|
b3fe293 |
std::string requirements;
|
|
|
b3fe293 |
- sprintf(requirements,"(%s) && Draining =!= true",m_defrag_requirements.c_str());
|
|
|
b3fe293 |
- if( !queryMachines(requirements.c_str(),"DEFRAG_REQUIREMENTS",startdAds) ) {
|
|
|
b3fe293 |
+ if( !queryMachines(m_defrag_requirements.c_str(),"DEFRAG_REQUIREMENTS",startdAds) ) {
|
|
|
b3fe293 |
dprintf(D_ALWAYS,"Doing nothing, because the query to select machines matching DEFRAG_REQUIREMENTS failed.\n");
|
|
|
b3fe293 |
return;
|
|
|
b3fe293 |
}
|
|
|
b3fe293 |
@@ -561,12 +571,26 @@ void Defrag::poll()
|
|
|
b3fe293 |
int num_drained = 0;
|
|
|
b3fe293 |
ClassAd *startd_ad;
|
|
|
b3fe293 |
MachineSet machines_done;
|
|
|
b3fe293 |
+ MachineSet draining_machines_done;
|
|
|
b3fe293 |
while( (startd_ad=startdAds.Next()) ) {
|
|
|
b3fe293 |
std::string machine;
|
|
|
b3fe293 |
std::string name;
|
|
|
b3fe293 |
startd_ad->LookupString(ATTR_NAME,name);
|
|
|
b3fe293 |
slotNameToDaemonName(name,machine);
|
|
|
b3fe293 |
|
|
|
b3fe293 |
+ if( !draining_machines_done.count(machine) && draining_whole_machines.count(machine) ) {
|
|
|
b3fe293 |
+ cancel_drain(*startd_ad);
|
|
|
b3fe293 |
+ draining_machines_done.insert(machine);
|
|
|
b3fe293 |
+ continue;
|
|
|
b3fe293 |
+ }
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
+ // Do not consider slots which are already draining.
|
|
|
b3fe293 |
+ bool startd_currently_draining = false;
|
|
|
b3fe293 |
+ startd_ad->LookupBool("Draining", startd_currently_draining);
|
|
|
b3fe293 |
+ if( startd_currently_draining ) {
|
|
|
b3fe293 |
+ continue;
|
|
|
b3fe293 |
+ }
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
if( machines_done.count(machine) ) {
|
|
|
b3fe293 |
dprintf(D_FULLDEBUG,
|
|
|
b3fe293 |
"Skipping %s: already attempted to drain %s in this cycle.\n",
|
|
|
b3fe293 |
@@ -581,14 +605,13 @@ void Defrag::poll()
|
|
|
b3fe293 |
continue;
|
|
|
b3fe293 |
}
|
|
|
b3fe293 |
|
|
|
b3fe293 |
- if( drain(startd_ad) ) {
|
|
|
b3fe293 |
+ if( (num_drained++ < num_to_drain) && drain(*startd_ad) ) {
|
|
|
b3fe293 |
machines_done.insert(machine);
|
|
|
b3fe293 |
|
|
|
b3fe293 |
- if( ++num_drained >= num_to_drain ) {
|
|
|
b3fe293 |
+ if( num_drained >= num_to_drain ) {
|
|
|
b3fe293 |
dprintf(D_ALWAYS,
|
|
|
b3fe293 |
"Drained maximum number of machines allowed in this cycle (%d).\n",
|
|
|
b3fe293 |
num_to_drain);
|
|
|
b3fe293 |
- break;
|
|
|
b3fe293 |
}
|
|
|
b3fe293 |
}
|
|
|
b3fe293 |
}
|
|
|
b3fe293 |
@@ -601,26 +624,24 @@ void Defrag::poll()
|
|
|
b3fe293 |
}
|
|
|
b3fe293 |
|
|
|
b3fe293 |
bool
|
|
|
b3fe293 |
-Defrag::drain(ClassAd *startd_ad)
|
|
|
b3fe293 |
+Defrag::drain(const ClassAd &startd_ad)
|
|
|
b3fe293 |
{
|
|
|
b3fe293 |
- ASSERT( startd_ad );
|
|
|
b3fe293 |
-
|
|
|
b3fe293 |
std::string name;
|
|
|
b3fe293 |
- startd_ad->LookupString(ATTR_NAME,name);
|
|
|
b3fe293 |
+ startd_ad.LookupString(ATTR_NAME,name);
|
|
|
b3fe293 |
|
|
|
b3fe293 |
dprintf(D_ALWAYS,"Initiating %s draining of %s.\n",
|
|
|
b3fe293 |
m_draining_schedule_str.c_str(),name.c_str());
|
|
|
b3fe293 |
|
|
|
b3fe293 |
- DCStartd startd( startd_ad );
|
|
|
b3fe293 |
+ DCStartd startd( &startd_ad );
|
|
|
b3fe293 |
|
|
|
b3fe293 |
int graceful_completion = 0;
|
|
|
b3fe293 |
- startd_ad->LookupInteger(ATTR_EXPECTED_MACHINE_GRACEFUL_DRAINING_COMPLETION,graceful_completion);
|
|
|
b3fe293 |
+ startd_ad.LookupInteger(ATTR_EXPECTED_MACHINE_GRACEFUL_DRAINING_COMPLETION,graceful_completion);
|
|
|
b3fe293 |
int quick_completion = 0;
|
|
|
b3fe293 |
- startd_ad->LookupInteger(ATTR_EXPECTED_MACHINE_QUICK_DRAINING_COMPLETION,quick_completion);
|
|
|
b3fe293 |
+ startd_ad.LookupInteger(ATTR_EXPECTED_MACHINE_QUICK_DRAINING_COMPLETION,quick_completion);
|
|
|
b3fe293 |
int graceful_badput = 0;
|
|
|
b3fe293 |
- startd_ad->LookupInteger(ATTR_EXPECTED_MACHINE_GRACEFUL_DRAINING_BADPUT,graceful_badput);
|
|
|
b3fe293 |
+ startd_ad.LookupInteger(ATTR_EXPECTED_MACHINE_GRACEFUL_DRAINING_BADPUT,graceful_badput);
|
|
|
b3fe293 |
int quick_badput = 0;
|
|
|
b3fe293 |
- startd_ad->LookupInteger(ATTR_EXPECTED_MACHINE_QUICK_DRAINING_BADPUT,quick_badput);
|
|
|
b3fe293 |
+ startd_ad.LookupInteger(ATTR_EXPECTED_MACHINE_QUICK_DRAINING_BADPUT,quick_badput);
|
|
|
b3fe293 |
|
|
|
b3fe293 |
time_t now = time(NULL);
|
|
|
b3fe293 |
std::string draining_check_expr;
|
|
|
b3fe293 |
@@ -659,6 +680,27 @@ Defrag::drain(ClassAd *startd_ad)
|
|
|
b3fe293 |
return true;
|
|
|
b3fe293 |
}
|
|
|
b3fe293 |
|
|
|
b3fe293 |
+bool
|
|
|
b3fe293 |
+Defrag::cancel_drain(const ClassAd &startd_ad)
|
|
|
b3fe293 |
+{
|
|
|
b3fe293 |
+ // Ask the startd described by startd_ad to cancel draining; returns the result of cancelDrainJobs().
|
|
|
b3fe293 |
+ std::string name;
|
|
|
b3fe293 |
+ startd_ad.LookupString(ATTR_NAME,name);
|
|
|
b3fe293 |
+ // Log what is actually happening: this cancels a drain, it does not initiate one.
|
|
|
b3fe293 |
+ dprintf(D_ALWAYS,"Cancelling draining of %s.\n",
|
|
|
b3fe293 |
+ name.c_str());
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
+ DCStartd startd( &startd_ad );
|
|
|
b3fe293 |
+ // NOTE(review): NULL request id presumably means "cancel all draining requests" -- confirm against DCStartd::cancelDrainJobs.
|
|
|
b3fe293 |
+ bool rval = startd.cancelDrainJobs( NULL );
|
|
|
b3fe293 |
+ if ( rval ) {
|
|
|
b3fe293 |
+ dprintf(D_FULLDEBUG, "Sent request to cancel draining on %s\n", startd.name());
|
|
|
b3fe293 |
+ } else {
|
|
|
b3fe293 |
+ dprintf(D_ALWAYS, "Unable to cancel draining on %s: %s\n", startd.name(), startd.error());
|
|
|
b3fe293 |
+ }
|
|
|
b3fe293 |
+ return rval;
|
|
|
b3fe293 |
+}
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
void
|
|
|
b3fe293 |
Defrag::publish(ClassAd *ad)
|
|
|
b3fe293 |
{
|
|
|
b3fe293 |
diff --git a/src/defrag/defrag.h b/src/defrag/defrag.h
|
|
|
b3fe293 |
index 8c7fd51..909b569 100644
|
|
|
b3fe293 |
--- a/src/defrag/defrag.h
|
|
|
b3fe293 |
+++ b/src/defrag/defrag.h
|
|
|
b3fe293 |
@@ -40,11 +40,11 @@ class Defrag: public Service {
|
|
|
b3fe293 |
void stop();
|
|
|
b3fe293 |
|
|
|
b3fe293 |
void poll(); // do the periodic policy evaluation
|
|
|
b3fe293 |
- bool drain(ClassAd *startd_ad);
|
|
|
b3fe293 |
|
|
|
b3fe293 |
typedef std::set< std::string > MachineSet;
|
|
|
b3fe293 |
|
|
|
b3fe293 |
private:
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
int m_polling_interval; // delay between evaluations of the policy
|
|
|
b3fe293 |
int m_polling_timer;
|
|
|
b3fe293 |
double m_draining_per_hour;
|
|
|
b3fe293 |
@@ -58,6 +58,7 @@ class Defrag: public Service {
|
|
|
b3fe293 |
ClassAd m_rank_ad;
|
|
|
b3fe293 |
int m_draining_schedule;
|
|
|
b3fe293 |
std::string m_draining_schedule_str;
|
|
|
b3fe293 |
+ bool m_can_cancel; // Whether condor_defrag can also cancel draining early.
|
|
|
b3fe293 |
|
|
|
b3fe293 |
time_t m_last_poll;
|
|
|
b3fe293 |
|
|
|
b3fe293 |
@@ -70,6 +71,9 @@ class Defrag: public Service {
|
|
|
b3fe293 |
ClassAd m_public_ad;
|
|
|
b3fe293 |
DefragStats m_stats;
|
|
|
b3fe293 |
|
|
|
b3fe293 |
+ bool drain(const ClassAd &startd_ad);
|
|
|
b3fe293 |
+ bool cancel_drain(const ClassAd &startd_ad);
|
|
|
b3fe293 |
+
|
|
|
b3fe293 |
void validateExpr(char const *constraint,char const *constraint_source);
|
|
|
b3fe293 |
bool queryMachines(char const *constraint,char const *constraint_source,ClassAdList &startdAds);
|
|
|
b3fe293 |
|