Blob Blame History Raw
From 7615d866986131f3548b5863b38872a2e4dc58d9 Mon Sep 17 00:00:00 2001
From: Andrew Hanushevsky <abh@stanford.edu>
Date: Thu, 20 Jan 2022 23:22:09 -0800
Subject: [PATCH] [Server] Prevent SEGV on busy systems due to missing lock
 call for background jobs.

---
 src/XrdXrootd/XrdXrootdJob.cc | 50 +++++++++++++++++++++++++++++------
 1 file changed, 42 insertions(+), 8 deletions(-)

diff --git a/src/XrdXrootd/XrdXrootdJob.cc b/src/XrdXrootd/XrdXrootdJob.cc
index 1b2703d96..467323de5 100644
--- a/src/XrdXrootd/XrdXrootdJob.cc
+++ b/src/XrdXrootd/XrdXrootdJob.cc
@@ -37,6 +37,7 @@
 #include "Xrd/XrdScheduler.hh"
 #include "XrdOuc/XrdOucProg.hh"
 #include "XrdOuc/XrdOucStream.hh"
+#include "XrdSys/XrdSysError.hh"
 #include "XrdSys/XrdSysPlatform.hh"
 #include "XrdSys/XrdSysRAtomic.hh"
 #include "XrdXrootd/XrdXrootdJob.hh"
@@ -99,6 +100,12 @@ static const int          argvnum = sizeof(theArgs)/sizeof(theArgs[0]);
 /*                      G l o b a l   F u n c t i o n s                       */
 /******************************************************************************/
 
+namespace XrdXrootd
+{
+extern XrdSysError   eLog;
+}
+using namespace XrdXrootd;
+
 extern XrdSysTrace      XrdXrootdTrace;
   
 int XrdXrootdJobWaiting(XrdXrootdJob2Do *item, void *arg)
@@ -152,10 +159,15 @@ XrdXrootdJob2Do::~XrdXrootdJob2Do()
 /******************************************************************************/
 /*                                  D o I t                                   */
 /******************************************************************************/
+
+#define jobInfo theJob->JobName<<' '<<(theArgs[1] ? theArgs[1] : "")\
+                <<(theArgs[2] ? " " : "")<<(theArgs[2] ? theArgs[2] : "")
   
 void XrdXrootdJob2Do::DoIt()
 {
+   static const char *TraceID = "jobXeq";
    XrdXrootdJob2Do *jp = 0;
+   const char *endStat = " completed";
    char *lp = 0;
    int i, rc = 0;
 
@@ -163,9 +175,11 @@ void XrdXrootdJob2Do::DoIt()
 // perform the actual function and get the result and send to any async clients
 //
    if (Status != Job_Cancel)
-      {if ((rc = theJob->theProg->Run(&jobStream, theArgs[1], theArgs[2],
+      {TRACE(REQ, "Job "<<jobInfo<<" started");
+       if ((rc = theJob->theProg->Run(&jobStream, theArgs[1], theArgs[2],
                          theArgs[3], theArgs[4])))
           {Status = Job_Cancel;
+           endStat= " failed";
            lp = jobStream.GetLine();
            theJob->myMutex.Lock();
           }
@@ -173,15 +187,22 @@ void XrdXrootdJob2Do::DoIt()
                 rc = theJob->theProg->RunDone(jobStream);
                 theJob->myMutex.Lock();
                 if ((rc && rc != -EPIPE) || (rc == -EPIPE && (!lp || !(*lp))))
-                   Status = Job_Cancel;
+                   {Status = Job_Cancel; endStat = " failed";}
                    else if (Status != Job_Cancel)
                            {Status = Job_Done;
                             for (i = 0; i < numClients; i++)
                                 if (!Client[i].isSync) {sendResult(lp); break;}
                            }
                }
+       } else {
+        endStat = " cancelled";
+        theJob->myMutex.Lock();
        }
 
+// Produce a trace record
+//
+   TRACE(REQ, "Job "<<jobInfo<<endStat);
+
 // If the number of jobs > than the max allowed, then redrive a waiting job
 // if in fact we represent a legitimate job slot (this could a phantom slot
 // due to ourselves being cancelled.
@@ -313,7 +334,7 @@ XrdOucTList *XrdXrootdJob2Do::lstClient()
 /*                             v e r C l i e n t                              */
 /******************************************************************************/
   
-int XrdXrootdJob2Do::verClient(int dodel)
+int XrdXrootdJob2Do::verClient(int dodel) // Caller must have theJob->myMutex
 {
    int i, j, k;
 
@@ -330,9 +351,16 @@ int XrdXrootdJob2Do::verClient(int dodel)
 //
    if (!numClients && dodel)
       {XrdXrootdJob2Do *jp = theJob->JobTable.Remove(JobNum);
-       if (jp->Status == XrdXrootdJob2Do::Job_Waiting) theJob->numJobs--;
-       delete jp;
-       return 0;
+       if (jp)
+          {if (jp->Status == XrdXrootdJob2Do::Job_Waiting) theJob->numJobs--;
+           delete jp;
+           return 0;
+          } else {
+           char ebuff[80];
+           snprintf(ebuff, sizeof(ebuff), "Unable to find %s job %d;",
+                           theJob->JobName, JobNum);
+           eLog.Emsg("Job2Do", ebuff, "job slot disabled!");
+          }
       }
    return numClients;
 }
@@ -341,7 +369,7 @@ int XrdXrootdJob2Do::verClient(int dodel)
 /*                               R e d r i v e                                */
 /******************************************************************************/
   
-void XrdXrootdJob2Do::Redrive()
+void XrdXrootdJob2Do::Redrive() // Caller must have theJob->myMutex held
 {
    XrdXrootdJob2Do *jp;
    int Start = 0;
@@ -364,10 +392,12 @@ void XrdXrootdJob2Do::Redrive()
 /******************************************************************************/
 /*                            s e n d R e s u l t                             */
 /******************************************************************************/
+
+// Caller must have theJob->myMutex locked.
   
 void XrdXrootdJob2Do::sendResult(char *lp, int caned, int jrc)
 {
-   static const char *TraceID   = "sendResult";
+   static const char *TraceID   = "jobSendResult";
    static const kXR_int32 Xcan  = static_cast<kXR_int32>(htonl(kXR_Cancelled));
    XrdXrootdReqID ReqID;
    struct iovec   jobVec[6];
@@ -619,6 +649,8 @@ int XrdXrootdJob::Schedule(const char         *jkey,
 /*                               C l e a n U p                                */
 /******************************************************************************/
   
+// The caller must have myMutex locked
+
 void XrdXrootdJob::CleanUp(XrdXrootdJob2Do *jp)
 {
    int theStatus = jp->Status;
@@ -641,6 +673,8 @@ void XrdXrootdJob::CleanUp(XrdXrootdJob2Do *jp)
 /*                            s e n d R e s u l t                             */
 /******************************************************************************/
   
+// Caller must have myMutex locked.
+
 int XrdXrootdJob::sendResult(XrdXrootdResponse *resp,
                              const char        *rpfx,
                              XrdXrootdJob2Do   *job)
-- 
2.34.1