TCondor.cxx

Go to the documentation of this file.
00001 // @(#)root/proof:$Id: TCondor.cxx 35120 2010-09-02 11:11:23Z ganis $
00002 // Author: Maarten Ballintijn   06/12/03
00003 
00004 /*************************************************************************
00005  * Copyright (C) 1995-2001, Rene Brun and Fons Rademakers.               *
00006  * All rights reserved.                                                  *
00007  *                                                                       *
00008  * For the licensing terms see $ROOTSYS/LICENSE.                         *
00009  * For the list of contributors see $ROOTSYS/README/CREDITS.             *
00010  *************************************************************************/
00011 
00012 //////////////////////////////////////////////////////////////////////////
00013 //                                                                      //
00014 // TCondor                                                              //
00015 //                                                                      //
00016 // Interface to the Condor system. TCondor provides a (partial) API for //
00017 // querying and controlling the Condor system, including experimental   //
00018 // extensions like COD (computing on demand)                            //
00019 //                                                                      //
00020 //////////////////////////////////////////////////////////////////////////
00021 
00022 #include <stdlib.h>
00023 
00024 #include "TCondor.h"
00025 #include "TList.h"
00026 #include "TSystem.h"
00027 #include "TObjString.h"
00028 #include "TRegexp.h"
00029 #include "TProofDebug.h"
00030 #include "Riostream.h"
00031 #include "TEnv.h"
00032 #include "TClass.h"
00033 
00034 ClassImp(TCondorSlave)
00035 ClassImp(TCondor)
00036 
00037 
00038 //______________________________________________________________________________
00039 TCondor::TCondor(const char *pool) : fPool(pool), fState(kFree)
00040 {
00041    // Create Condor interface object. Uses Condor apps since there is no
00042    // API yet.
00043 
00044    fClaims = new TList;
00045 
00046    // Setup Condor
00047 
00048    TString condorHome = gEnv->GetValue("Proof.CondorHome", (char*)0);
00049    if (condorHome != "") {
00050       TString path = gSystem->Getenv("PATH");
00051       path = condorHome + "/bin:" + path;
00052       gSystem->Setenv("PATH",path);
00053    }
00054 
00055    TString condorConf = gEnv->GetValue("Proof.CondorConfig", (char*)0);
00056    if (condorConf != "") {
00057       gSystem->Setenv("CONDOR_CONFIG",condorConf);
00058    }
00059 
00060    char *loc = gSystem->Which(gSystem->Getenv("PATH"), "condor_cod",
00061                                                 kExecutePermission);
00062 
00063    if (loc) {
00064       fValid = kTRUE;
00065       delete [] loc;
00066    } else {
00067       fValid = kFALSE;
00068    }
00069 }
00070 
00071 
00072 //______________________________________________________________________________
00073 TCondor::~TCondor()
00074 {
00075    // Cleanup Condor interface.
00076 
00077    PDB(kCondor,1) Info("~TCondor","fState %d", fState );
00078 
00079    if (fState != kFree) {
00080       Release();
00081    }
00082    delete fClaims;
00083 }
00084 
00085 
00086 //______________________________________________________________________________
00087 void TCondor::Print(Option_t * opt) const
00088 {
00089    // Print master status
00090 
00091    cout << "OBJ: " << IsA()->GetName()
00092       << "\tPool: \"" << fPool << "\""
00093       << "\tState: " << fState << endl;
00094    fClaims->Print(opt);
00095 }
00096 
00097 
00098 //______________________________________________________________________________
00099 TCondorSlave *TCondor::ClaimVM(const char *vm, const char *cmd)
00100 {
00101    // Claim a VirtualMachine for PROOF usage.
00102 
00103 //    TString reinitCmd = "KRB5CCNAME=FILE:/tmp/condor.$$ && /usr/krb5/bin/kinit -F -k -t /etc/cdfcaf.keytab cafuser/cdf/h2caf@FNAL.GOV";
00104 //    gSystem->Exec(reinitCmd.Data());
00105    Int_t port = 0;
00106 
00107    TString claimCmd = Form("condor_cod request -name %s -timeout 10 2>>%s/condor.proof.%d",
00108                            vm, gSystem->TempDirectory(), gSystem->GetUid() );
00109 
00110    PDB(kCondor,2) Info("ClaimVM","command: %s", claimCmd.Data());
00111    FILE  *pipe = gSystem->OpenPipe(claimCmd, "r");
00112 
00113    if (!pipe) {
00114       SysError("ClaimVM","cannot run command: %s", claimCmd.Data());
00115       return 0;
00116    }
00117 
00118    TString claimId;
00119    TString line;
00120    while (line.Gets(pipe)) {
00121       PDB(kCondor,3) Info("ClaimVM","line = %s", line.Data());
00122 
00123       if (line.BeginsWith("ClaimId = \"")) {
00124          line.Remove(0, line.Index("\"")+1);
00125          line.Chop(); // remove trailing "
00126          claimId = line;
00127          PDB(kCondor,1) Info("ClaimVM","claim = '%s'", claimId.Data());
00128          TRegexp r("[0-9]*$");
00129          TString num = line(r);
00130          port = 37000 + atoi(num.Data());
00131          PDB(kCondor,1) Info("ClaimVM","port = %d", port);
00132       }
00133    }
00134 
00135    Int_t r = gSystem->ClosePipe(pipe);
00136    if (r) {
00137       Error("ClaimVM","command: %s returned %d", claimCmd.Data(), r);
00138       return 0;
00139    } else {
00140       PDB(kCondor,1) Info("ClaimVM","command: %s returned %d", claimCmd.Data(), r);
00141    }
00142 
00143    TString jobad("jobad");
00144    FILE *jf = gSystem->TempFileName(jobad);
00145 
00146    if (jf == 0) return 0;
00147 
00148    TString str(cmd);
00149    str.ReplaceAll("$(Port)", Form("%d", port));
00150    fputs(str, jf);
00151 
00152    fclose(jf);
00153 
00154    TString activateCmd = Form("condor_cod activate -id '%s' -jobad %s",
00155                               claimId.Data(), jobad.Data() );
00156 
00157    PDB(kCondor,2) Info("ClaimVM","command: %s", activateCmd.Data());
00158    pipe = gSystem->OpenPipe(activateCmd, "r");
00159 
00160    if (!pipe) {
00161       SysError("ClaimVM","cannot run command: %s", activateCmd.Data());
00162       return 0;
00163    }
00164 
00165    while (line.Gets(pipe)) {
00166       PDB(kCondor,3) Info("ClaimVM","Activate: line = %s", line.Data());
00167    }
00168 
00169    r = gSystem->ClosePipe(pipe);
00170    if (r) {
00171       Error("ClaimVM","command: %s returned %d", activateCmd.Data(), r);
00172    } else {
00173       PDB(kCondor,1) Info("ClaimVM","command: %s returned %d", activateCmd.Data(), r);
00174    }
00175 
00176    gSystem->Unlink(jobad);
00177 
00178    // TODO: get info at the start for all nodes ...
00179    TCondorSlave *claim = new TCondorSlave;
00180    claim->fClaimID = claimId;
00181    TString node(vm);
00182    node = node.Remove(0, node.Index("@")+1);
00183    claim->fHostname = node;
00184    claim->fPort = port;
00185    claim->fPerfIdx = 100; //set performance index to 100 by default
00186    claim->fImage = node; //set image to hostname by default
00187 
00188    return claim;
00189 }
00190 
00191 
00192 //______________________________________________________________________________
00193 TList *TCondor::GetVirtualMachines() const
00194 {
00195    // Get the names of the virtual machines in the pool.
00196    // Return a TList of TObjString or 0 in case of failure
00197 
00198    TString poolopt = fPool ? "" : Form("-pool %s", fPool.Data());
00199    TString cmd = Form("condor_status %s -format \"%%s\\n\" Name", poolopt.Data());
00200 
00201    PDB(kCondor,2) Info("GetVirtualMachines","command: %s", cmd.Data());
00202 
00203    FILE  *pipe = gSystem->OpenPipe(cmd, "r");
00204 
00205    if (!pipe) {
00206       SysError("GetVirtualMachines","cannot run command: %s", cmd.Data());
00207       return 0;
00208    }
00209 
00210    TString line;
00211    TList *l = new TList;
00212    while (line.Gets(pipe)) {
00213       PDB(kCondor,3) Info("GetVirtualMachines","line = %s", line.Data());
00214       if (line != "") l->Add(new TObjString(line));
00215    }
00216 
00217    Int_t r = gSystem->ClosePipe(pipe);
00218    if (r) {
00219       delete l;
00220       Error("GetVirtualMachines","command: %s returned %d", cmd.Data(), r);
00221       return 0;
00222    } else {
00223       PDB(kCondor,1) Info("GetVirtualMachines","command: %s returned %d", cmd.Data(), r);
00224    }
00225 
00226    return l;
00227 }
00228 
00229 
00230 //______________________________________________________________________________
00231 TList *TCondor::Claim(Int_t n, const char *cmd)
00232 {
00233    // Claim n virtual machines
00234    // This function figures out the image and performance index before returning
00235    // the list of condor slaves
00236 
00237    if (fState != kFree) {
00238       Error("Claim","not in state Free");
00239       return 0;
00240    }
00241 
00242    TList *vms = GetVirtualMachines();
00243    TIter next(vms);
00244    TObjString *vm;
00245    for(Int_t i=0; i < n && (vm = (TObjString*) next()) != 0; i++ ) {
00246       TCondorSlave *claim = ClaimVM(vm->GetName(), cmd);
00247       if (claim != 0) {
00248          if ( !GetVmInfo(vm->GetName(), claim->fImage, claim->fPerfIdx) ) {
00249             // assume vm is gone
00250             delete claim;
00251          } else {
00252             fClaims->Add(claim);
00253             fState = kActive;
00254          }
00255       }
00256    }
00257 
00258    return fClaims;
00259 }
00260 
00261 
00262 //______________________________________________________________________________
00263 TCondorSlave *TCondor::Claim(const char *vmname, const char *cmd)
00264 {
00265    // Claim virtual machine with name vmname
00266    // This function does not figure out the image and performance index before
00267    // returning the condor slave
00268 
00269    if (fState != kFree && fState != kActive) {
00270       Error("Claim","not in state Free or Active");
00271       return 0;
00272    }
00273 
00274    TCondorSlave *claim = ClaimVM(vmname, cmd);
00275    if (claim != 0) {
00276       fClaims->Add(claim);
00277       fState = kActive;
00278    }
00279 
00280    return claim;
00281 }
00282 
00283 
00284 //______________________________________________________________________________
00285 Bool_t TCondor::SetState(EState state)
00286 {
00287    // Set the state of workers
00288 
00289    PDB(kCondor,1) Info("SetState","state: %s (%lld)",
00290                        state == kSuspended ? "kSuspended" : "kActive", Long64_t(gSystem->Now()));
00291    TIter next(fClaims);
00292    TCondorSlave *claim;
00293    while((claim = (TCondorSlave*) next()) != 0) {
00294       TString cmd = Form("condor_cod %s -id '%s'",
00295                          state == kSuspended ? "suspend" : "resume",
00296                          claim->fClaimID.Data());
00297 
00298       PDB(kCondor,2) Info("SetState","command: %s", cmd.Data());
00299       FILE  *pipe = gSystem->OpenPipe(cmd, "r");
00300 
00301       if (!pipe) {
00302          SysError("SetState","cannot run command: %s", cmd.Data());
00303          return kFALSE;
00304       }
00305 
00306       TString line;
00307       while (line.Gets(pipe)) {
00308          PDB(kCondor,3) Info("SetState","line = %s", line.Data());
00309       }
00310 
00311       Int_t r = gSystem->ClosePipe(pipe);
00312       if (r) {
00313          Error("SetState","command: %s returned %d", cmd.Data(), r);
00314          return kFALSE;
00315       } else {
00316          PDB(kCondor,1) Info("SetState","command: %s returned %d", cmd.Data(), r);
00317       }
00318    }
00319 
00320    fState = state;
00321    return kTRUE;
00322 }
00323 
00324 
00325 //______________________________________________________________________________
00326 Bool_t TCondor::Suspend()
00327 {
00328    // Suspend worker
00329 
00330    if (fState != kActive) {
00331       Error("Suspend","not in state Active");
00332       return kFALSE;
00333    }
00334 
00335    return SetState(kSuspended);
00336 }
00337 
00338 
00339 //______________________________________________________________________________
00340 Bool_t TCondor::Resume()
00341 {
00342    // Resume worker
00343 
00344    if (fState != kSuspended) {
00345       Error("Suspend","not in state Suspended");
00346       return kFALSE;
00347    }
00348 
00349    return SetState(kActive);
00350 }
00351 
00352 
00353 //______________________________________________________________________________
00354 Bool_t TCondor::Release()
00355 {
00356    // Release worker
00357 
00358    if (fState == kFree) {
00359       Error("Suspend","not in state Active or Suspended");
00360       return kFALSE;
00361    }
00362 
00363    TCondorSlave *claim;
00364    while((claim = (TCondorSlave*) fClaims->First()) != 0) {
00365       TString cmd = Form("condor_cod release -id '%s'", claim->fClaimID.Data());
00366 
00367       PDB(kCondor,2) Info("SetState","command: %s", cmd.Data());
00368       FILE  *pipe = gSystem->OpenPipe(cmd, "r");
00369 
00370       if (!pipe) {
00371          SysError("Release","cannot run command: %s", cmd.Data());
00372          return kFALSE;
00373       }
00374 
00375       TString line;
00376       while (line.Gets(pipe)) {
00377          PDB(kCondor,3) Info("Release","line = %s", line.Data());
00378       }
00379 
00380       Int_t r = gSystem->ClosePipe(pipe);
00381       if (r) {
00382          Error("Release","command: %s returned %d", cmd.Data(), r);
00383          return kFALSE;
00384       } else {
00385          PDB(kCondor,1) Info("Release","command: %s returned %d", cmd.Data(), r);
00386       }
00387 
00388       fClaims->Remove(claim);
00389       delete claim;
00390    }
00391 
00392    fState = kFree;
00393    return kTRUE;
00394 }
00395 
00396 
00397 //______________________________________________________________________________
00398 Bool_t TCondor::GetVmInfo(const char *vm, TString &image, Int_t &perfidx) const
00399 {
00400    // Get info about worker status
00401 
00402    TString cmd = Form("condor_status -format \"%%d:\" Mips -format \"%%s\\n\" FileSystemDomain "
00403                       "-const 'Name==\"%s\"'", vm);
00404 
00405    PDB(kCondor,2) Info("GetVmInfo","command: %s", cmd.Data());
00406    FILE  *pipe = gSystem->OpenPipe(cmd, "r");
00407 
00408    if (!pipe) {
00409       SysError("GetVmInfo","cannot run command: %s", cmd.Data());
00410       return kFALSE;
00411    }
00412 
00413    TString line;
00414    while (line.Gets(pipe)) {
00415       PDB(kCondor,3) Info("GetVmInfo","line = %s", line.Data());
00416       if (line != "") {
00417          TString amips = line(TRegexp("^[0-9]*"));
00418          perfidx = atoi(amips);
00419          image = line(TRegexp("[^:]+$"));
00420          break;
00421       }
00422    }
00423 
00424    Int_t r = gSystem->ClosePipe(pipe);
00425    if (r) {
00426       Error("GetVmInfo","command: %s returned %d", cmd.Data(), r);
00427       return kFALSE;
00428    } else {
00429       PDB(kCondor,1) Info("GetVmInfo","command: %s returned %d", cmd.Data(), r);
00430    }
00431 
00432    return kTRUE;
00433 }
00434 
00435 
00436 //______________________________________________________________________________
00437 TString TCondor::GetImage(const char *host) const
00438 {
00439    // Get image of the worker
00440 
00441    TString cmd = Form("condor_status -direct %s -format \"Image:%%s\\n\" "
00442                       "FileSystemDomain", host);
00443 
00444    PDB(kCondor,2) Info("GetImage","command: %s", cmd.Data());
00445 
00446    FILE  *pipe = gSystem->OpenPipe(cmd, "r");
00447 
00448    if (!pipe) {
00449       SysError("GetImage","cannot run command: %s", cmd.Data());
00450       return "";
00451    }
00452 
00453    TString image;
00454    TString line;
00455    while (line.Gets(pipe)) {
00456       PDB(kCondor,3) Info("GetImage","line = %s", line.Data());
00457       if (line != "") {
00458          image = line(TRegexp("[^:]+$"));
00459          break;
00460       }
00461    }
00462 
00463    Int_t r = gSystem->ClosePipe(pipe);
00464    if (r) {
00465       Error("GetImage","command: %s returned %d", cmd.Data(), r);
00466       return "";
00467    } else {
00468       PDB(kCondor,1) Info("GetImage","command: %s returned %d", cmd.Data(), r);
00469    }
00470 
00471    return image;
00472 }
00473 
00474 
00475 //______________________________________________________________________________
00476 void TCondorSlave::Print(Option_t * /*opt*/ ) const
00477 {
00478    // Print worker status
00479 
00480    cout << "OBJ: " << IsA()->GetName()
00481       << " " << fHostname << ":" << fPort
00482       << "  Perf: " << fPerfIdx
00483       << "  Image: " << fImage << endl;
00484 }

Generated on Tue Jul 5 14:51:35 2011 for ROOT_528-00b_version by  doxygen 1.5.1