TMVAMultipleBackgroundExample.cxx

// @(#)root/tmva $Id: TMVAMultipleBackgroundExample.cxx 37030 2010-11-28 09:02:35Z evt $
/**********************************************************************************
 * Project   : TMVA - a Root-integrated toolkit for multivariate data analysis    *
 * Package   : TMVA                                                               *
 * Executable: TMVAGAexample                                                      *
 *                                                                                *
 * This executable gives an example of a very simple use of the genetic           *
 * algorithm of TMVA.                                                             *
 *                                                                                *
 **********************************************************************************/

#include <iostream> // Stream declarations
#include <vector>
#include <limits>

#include "TChain.h"
#include "TCut.h"
#include "TDirectory.h"
#include "TH1F.h"
#include "TH1.h"
#include "TMath.h"
#include "TFile.h"
#include "TStopwatch.h"
#include "TROOT.h"

#include "TMVA/GeneticAlgorithm.h"
#include "TMVA/GeneticFitter.h"
#include "TMVA/IFitterTarget.h"
#include "TMVA/Factory.h"
#include "TMVA/Reader.h"

using namespace std;

namespace TMVA {

// =========== Description =============
// This example shows the training of a signal sample against three different backgrounds.
// In the application step a combined tree is then created, containing all signal and
// background events together with the true class ID and the three classifier outputs.
// Finally, using this application tree, the significance is maximized with the help of
// the TMVA genetic algorithm.
//

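// The three steps are implemented below in Training(), ApplicationCreateCombinedTree()
// and MaximizeSignificance(); main() at the end of this file runs them in that order.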

// ------------------------------ Training ----------------------------------------------------------------
void Training(){
   std::string factoryOptions( "!V:!Silent:Transformations=I;D;P;G,D:AnalysisType=Classification" );
   TString fname = "./tmva_example_multiple_background.root";

   TFile *input(0);
   input = TFile::Open( fname );

   TTree *signal      = (TTree*)input->Get("TreeS");
   TTree *background0 = (TTree*)input->Get("TreeB0");
   TTree *background1 = (TTree*)input->Get("TreeB1");
   TTree *background2 = (TTree*)input->Get("TreeB2");

   /// global event weights per tree (see below for setting event-wise weights)
   Double_t signalWeight      = 1.0;
   Double_t background0Weight = 1.0;
   Double_t background1Weight = 1.0;
   Double_t background2Weight = 1.0;

   // Create a new root output file.
   TString outfileName( "TMVASignalBackground0.root" );
   TFile* outputFile = TFile::Open( outfileName, "RECREATE" );



   // ===================== background 0
   TMVA::Factory *factory = new TMVA::Factory( "TMVAMultiBkg0", outputFile, factoryOptions );
   factory->AddVariable( "var1", "Variable 1", "", 'F' );
   factory->AddVariable( "var2", "Variable 2", "", 'F' );
   factory->AddVariable( "var3", "Variable 3", "units", 'F' );
   factory->AddVariable( "var4", "Variable 4", "units", 'F' );

   factory->AddSignalTree    ( signal,     signalWeight       );
   factory->AddBackgroundTree( background0, background0Weight );

//   factory->SetBackgroundWeightExpression("weight");
   TCut mycuts = ""; // for example: TCut mycuts = "abs(var1)<0.5 && abs(var2-0.5)<1";
   TCut mycutb = ""; // for example: TCut mycutb = "abs(var1)<0.5";

   // tell the factory to use all remaining events in the trees after training for testing:
   factory->PrepareTrainingAndTestTree( mycuts, mycutb,
                                        "nTrain_Signal=0:nTrain_Background=0:SplitMode=Random:NormMode=NumEvents:!V" );

   // Boosted Decision Trees
   factory->BookMethod( TMVA::Types::kBDT, "BDTG",
                        "!H:!V:NTrees=1000:BoostType=Grad:Shrinkage=0.30:UseBaggedGrad:GradBaggingFraction=0.6:SeparationType=GiniIndex:nCuts=20:NNodesMax=5" );
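   // A note on the BDTG options above (the bookings for the other two backgrounds below
   // use nearly identical settings): 1000 trees are built with gradient boosting
   // (BoostType=Grad) at learning rate Shrinkage=0.30; UseBaggedGrad with
   // GradBaggingFraction=0.6 trains each tree on a random subsample of the events
   // (stochastic gradient boosting); node splits use the Gini index, scanning nCuts=20
   // cut values per variable, and each tree is limited to NNodesMax=5 nodes.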
   factory->TrainAllMethods();
   factory->TestAllMethods();
   factory->EvaluateAllMethods();

   outputFile->Close();

   delete factory;



   //  ===================== background 1

   outfileName = "TMVASignalBackground1.root";
   outputFile = TFile::Open( outfileName, "RECREATE" );

   factory = new TMVA::Factory( "TMVAMultiBkg1", outputFile, factoryOptions );
   factory->AddVariable( "var1", "Variable 1", "", 'F' );
   factory->AddVariable( "var2", "Variable 2", "", 'F' );
   factory->AddVariable( "var3", "Variable 3", "units", 'F' );
   factory->AddVariable( "var4", "Variable 4", "units", 'F' );

   factory->AddSignalTree    ( signal,     signalWeight       );
   factory->AddBackgroundTree( background1, background1Weight );

//   factory->SetBackgroundWeightExpression("weight");

   // tell the factory to use all remaining events in the trees after training for testing:
   factory->PrepareTrainingAndTestTree( mycuts, mycutb,
                                        "nTrain_Signal=0:nTrain_Background=0:SplitMode=Random:NormMode=NumEvents:!V" );

   // Boosted Decision Trees
   factory->BookMethod( TMVA::Types::kBDT, "BDTG",
                        "!H:!V:NTrees=1000:BoostType=Grad:Shrinkage=0.30:UseBaggedGrad:GradBaggingFraction=0.6:SeparationType=GiniIndex:nCuts=20:NNodesMax=5" );
   factory->TrainAllMethods();
   factory->TestAllMethods();
   factory->EvaluateAllMethods();

   outputFile->Close();

   delete factory;




   //  ===================== background 2
   outfileName = "TMVASignalBackground2.root";
   outputFile = TFile::Open( outfileName, "RECREATE" );

   factory = new TMVA::Factory( "TMVAMultiBkg2", outputFile, factoryOptions );
   factory->AddVariable( "var1", "Variable 1", "", 'F' );
   factory->AddVariable( "var2", "Variable 2", "", 'F' );
   factory->AddVariable( "var3", "Variable 3", "units", 'F' );
   factory->AddVariable( "var4", "Variable 4", "units", 'F' );

   factory->AddSignalTree    ( signal,     signalWeight       );
   factory->AddBackgroundTree( background2, background2Weight );

//   factory->SetBackgroundWeightExpression("weight");

   // tell the factory to use all remaining events in the trees after training for testing:
   factory->PrepareTrainingAndTestTree( mycuts, mycutb,
                                        "nTrain_Signal=0:nTrain_Background=0:SplitMode=Random:NormMode=NumEvents:!V" );

   // Boosted Decision Trees
   factory->BookMethod( TMVA::Types::kBDT, "BDTG",
                        "!H:!V:NTrees=1000:BoostType=Grad:Shrinkage=0.30:UseBaggedGrad:GradBaggingFraction=0.5:SeparationType=GiniIndex:nCuts=20:NNodesMax=5" );
   factory->TrainAllMethods();
   factory->TestAllMethods();
   factory->EvaluateAllMethods();

   outputFile->Close();

   delete factory;

}




// ------------------------------ Application ----------------------------------------------------------------
// create a summary tree with all signal and background events; for each event store the true classID and the three classifier values
void ApplicationCreateCombinedTree(){

   // Create a new root output file.
   TString outfileName( "tmva_example_multiple_backgrounds__applied.root" );
   TFile* outputFile = TFile::Open( outfileName, "RECREATE" );
   TTree* outputTree = new TTree("multiBkg","multiple backgrounds tree");

   Float_t var1, var2;
   Float_t var3, var4;
   Int_t   classID = 0;
   Float_t weight = 1.f;

   Float_t classifier0, classifier1, classifier2;

   outputTree->Branch("classID", &classID, "classID/I");
   outputTree->Branch("var1", &var1, "var1/F");
   outputTree->Branch("var2", &var2, "var2/F");
   outputTree->Branch("var3", &var3, "var3/F");
   outputTree->Branch("var4", &var4, "var4/F");
   outputTree->Branch("weight", &weight, "weight/F");
   outputTree->Branch("cls0", &classifier0, "cls0/F");
   outputTree->Branch("cls1", &classifier1, "cls1/F");
   outputTree->Branch("cls2", &classifier2, "cls2/F");


   // ===== create three readers for the three different signal/background classifications, one for each background
   TMVA::Reader *reader0 = new TMVA::Reader( "!Color:!Silent" );
   TMVA::Reader *reader1 = new TMVA::Reader( "!Color:!Silent" );
   TMVA::Reader *reader2 = new TMVA::Reader( "!Color:!Silent" );

   reader0->AddVariable( "var1", &var1 );
   reader0->AddVariable( "var2", &var2 );
   reader0->AddVariable( "var3", &var3 );
   reader0->AddVariable( "var4", &var4 );

   reader1->AddVariable( "var1", &var1 );
   reader1->AddVariable( "var2", &var2 );
   reader1->AddVariable( "var3", &var3 );
   reader1->AddVariable( "var4", &var4 );

   reader2->AddVariable( "var1", &var1 );
   reader2->AddVariable( "var2", &var2 );
   reader2->AddVariable( "var3", &var3 );
   reader2->AddVariable( "var4", &var4 );

   // ====== load the weight files for the readers
   TString method =  "BDT method";
   reader0->BookMVA( "BDT method", "weights/TMVAMultiBkg0_BDTG.weights.xml" );
   reader1->BookMVA( "BDT method", "weights/TMVAMultiBkg1_BDTG.weights.xml" );
   reader2->BookMVA( "BDT method", "weights/TMVAMultiBkg2_BDTG.weights.xml" );
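   // Note: the first argument of BookMVA() is only a label (stored in 'method' above) and
   // must match the one used in EvaluateMVA() below; the weight files are the ones written
   // by Training() into the local weights/ directory.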

   // ===== load the input file
   TFile *input(0);
   TString fname = "./tmva_example_multiple_background.root";
   input = TFile::Open( fname );

   TTree* theTree = NULL;

   // ===== loop through signal and all background trees
   for( int treeNumber = 0; treeNumber < 4; ++treeNumber ) {
      if( treeNumber == 0 ){
         theTree = (TTree*)input->Get("TreeS");
         std::cout << "--- Select signal sample" << std::endl;
//       theTree->SetBranchAddress( "weight", &weight );
         weight = 1;
         classID = 0;
      }else if( treeNumber == 1 ){
         theTree = (TTree*)input->Get("TreeB0");
         std::cout << "--- Select background 0 sample" << std::endl;
//       theTree->SetBranchAddress( "weight", &weight );
         weight = 1;
         classID = 1;
      }else if( treeNumber == 2 ){
         theTree = (TTree*)input->Get("TreeB1");
         std::cout << "--- Select background 1 sample" << std::endl;
//       theTree->SetBranchAddress( "weight", &weight );
         weight = 1;
         classID = 2;
      }else if( treeNumber == 3 ){
         theTree = (TTree*)input->Get("TreeB2");
         std::cout << "--- Select background 2 sample" << std::endl;
//       theTree->SetBranchAddress( "weight", &weight );
         weight = 1;
         classID = 3;
      }


      theTree->SetBranchAddress( "var1", &var1 );
      theTree->SetBranchAddress( "var2", &var2 );
      theTree->SetBranchAddress( "var3", &var3 );
      theTree->SetBranchAddress( "var4", &var4 );


      std::cout << "--- Processing: " << theTree->GetEntries() << " events" << std::endl;
      TStopwatch sw;
      sw.Start();
      Int_t nEvent = theTree->GetEntries();
//      Int_t nEvent = 100;
      for (Long64_t ievt=0; ievt<nEvent; ievt++) {

         if (ievt%1000 == 0){
            std::cout << "--- ... Processing event: " << ievt << std::endl;
         }

         theTree->GetEntry(ievt);

         // ==== get the classifiers for each of the signal/background classifications
         classifier0 = reader0->EvaluateMVA( method );
         classifier1 = reader1->EvaluateMVA( method );
         classifier2 = reader2->EvaluateMVA( method );

         outputTree->Fill();
      }


      // get elapsed time
      sw.Stop();
      std::cout << "--- End of event loop: "; sw.Print();
   }
   input->Close();


   // write output tree
//   outputTree->SetDirectory(outputFile);
//   outputTree->Write();
   outputFile->Write();

   outputFile->Close();

   std::cout << "--- Created root file: \"" << outfileName.Data() << "\" containing the combined tree" << std::endl;

   delete reader0;
   delete reader1;
   delete reader2;

   std::cout << "==> Application of readers is done! Combined tree created" << std::endl << std::endl;

}




// ------------------------------ Genetic Algorithm Fitness definition ----------------------------------------------------------------
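// MyFitness implements TMVA's IFitterTarget interface: the genetic fitter repeatedly calls
// EstimatorFunction() with a candidate set of cut values (one per classifier) and searches
// for the set that minimizes the returned value, here 1/(efficiency*purity).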
class MyFitness : public IFitterTarget {
public:
   // constructor
   MyFitness( TChain* _chain ) : IFitterTarget() {
      chain = _chain;

      hSignal = new TH1F("hsignal","hsignal",100,-1,1);
      hFP = new TH1F("hfp","hfp",100,-1,1);
      hTP = new TH1F("htp","htp",100,-1,1);

      TString cutsAndWeightSignal  = "weight*(classID==0)";
      nSignal = chain->Draw("Entry$/Entries$>>hsignal",cutsAndWeightSignal,"goff");
      weightsSignal = hSignal->Integral();
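      // The drawn expression (Entry$/Entries$) is not important here: the Draw() call fills
      // hSignal with the event weights of all classID==0 events, so its integral is the
      // total signal weight used as the efficiency denominator in EstimatorFunction().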

   }

   // the output of this function will be minimized
   Double_t EstimatorFunction( std::vector<Double_t> & factors ){

      TString cutsAndWeightTruePositive  = Form("weight*((classID==0) && cls0>%f && cls1>%f && cls2>%f )",factors.at(0), factors.at(1), factors.at(2));
      TString cutsAndWeightFalsePositive = Form("weight*((classID >0) && cls0>%f && cls1>%f && cls2>%f )",factors.at(0), factors.at(1), factors.at(2));

      // Entry$/Entries$ just draws something reasonable; it could in principle be anything
      Float_t nTP = chain->Draw("Entry$/Entries$>>htp",cutsAndWeightTruePositive,"goff");
      Float_t nFP = chain->Draw("Entry$/Entries$>>hfp",cutsAndWeightFalsePositive,"goff");

      weightsTruePositive = hTP->Integral();
      weightsFalsePositive = hFP->Integral();

      efficiency = 0;
      if( weightsSignal > 0 )
         efficiency = weightsTruePositive/weightsSignal;

      purity = 0;
      if( weightsTruePositive+weightsFalsePositive > 0 )
         purity = weightsTruePositive/(weightsTruePositive+weightsFalsePositive);

      Float_t effTimesPur = efficiency*purity;

      Float_t toMinimize = std::numeric_limits<float>::max(); // start from the largest representable float
      if( effTimesPur > 0 ) // if larger than 0, take 1/x; this is the value to minimize
         toMinimize = 1./(effTimesPur); // minimizing 1/(efficiency*purity) maximizes efficiency*purity

      // Print();

      return toMinimize;
   }


   void Print(){
      std::cout << std::endl;
      std::cout << "======================" << std::endl
                << "Efficiency : " << efficiency << std::endl
                << "Purity     : " << purity << std::endl << std::endl
                << "True positive weights : " << weightsTruePositive << std::endl
                << "False positive weights: " << weightsFalsePositive << std::endl
                << "Signal weights        : " << weightsSignal << std::endl;
   }

   Float_t nSignal;

   Float_t efficiency;
   Float_t purity;
   Float_t weightsTruePositive;
   Float_t weightsFalsePositive;
   Float_t weightsSignal;


private:
   TChain* chain;
   TH1F* hSignal;
   TH1F* hFP;
   TH1F* hTP;

};








// ------------------------------ call of Genetic algorithm  ----------------------------------------------------------------
void MaximizeSignificance(){

        // define all the parameters by their minimum and maximum value
        // in this example 3 parameters (=cuts on the classifiers) are defined.
        vector<Interval*> ranges;
        ranges.push_back( new Interval(-1,1) ); // for some classifiers (especially LD) the ranges have to be taken larger
        ranges.push_back( new Interval(-1,1) );
        ranges.push_back( new Interval(-1,1) );

        std::cout << "Classifier ranges (defined by the user)" << std::endl;
        for( std::vector<Interval*>::iterator it = ranges.begin(); it != ranges.end(); it++ ){
           std::cout << " range: " << (*it)->GetMin() << "   " << (*it)->GetMax() << std::endl;
        }

        TChain* chain = new TChain("multiBkg");
        chain->Add("tmva_example_multiple_backgrounds__applied.root");

        IFitterTarget* myFitness = new MyFitness( chain );

        // prepare the genetic algorithm (here with an initial population size of 100, see PopSize below)
        // mind: large population sizes help in searching the domain space of the solution,
        // but you have to weigh this against the number of generations:
        // the extreme case of 1 generation and population size n is equivalent to
        // a Monte Carlo calculation with n tries

        const TString name( "multipleBackgroundGA" );
        const TString opts( "PopSize=100:Steps=30" );

        GeneticFitter mg( *myFitness, name, ranges, opts);
        // mg.SetParameters( 4, 30, 200, 10,5, 0.95, 0.001 );

        std::vector<Double_t> result;
        Double_t estimator = mg.Run(result);
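        // Run() performs the genetic optimization: it returns the best (smallest) estimator
        // value found and fills 'result' with the corresponding cut values, one per classifier.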

        dynamic_cast<MyFitness*>(myFitness)->Print();
        std::cout << std::endl;

        int n = 0;
        for( std::vector<Double_t>::iterator it = result.begin(); it<result.end(); it++ ){
           std::cout << "  cutValue[" << n << "] = " << (*it) << ";"<< std::endl;
           n++;
        }
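        // Optional follow-up (a sketch, not part of the original example): the optimized
        // cut values in 'result' could be applied back to the combined tree to count the
        // surviving signal and background weights, mirroring the Draw() calls in MyFitness:
        //
        //   TString selSig = Form("weight*((classID==0) && cls0>%f && cls1>%f && cls2>%f)",
        //                         result.at(0), result.at(1), result.at(2));
        //   TString selBkg = Form("weight*((classID >0) && cls0>%f && cls1>%f && cls2>%f)",
        //                         result.at(0), result.at(1), result.at(2));
        //   chain->Draw("Entry$/Entries$>>hSelSig(100,0,1)", selSig, "goff");
        //   chain->Draw("Entry$/Entries$>>hSelBkg(100,0,1)", selBkg, "goff");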


}


} // namespace TMVA


// ------------------------------ Run all ----------------------------------------------------------------
int main( int argc, char** argv )
{
   cout << "Start Test TMVAGAexample" << endl
        << "========================" << endl
        << endl;

   gROOT->ProcessLine(".L createData.C+");
   gROOT->ProcessLine("create_MultipleBackground(2000)");
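   // createData.C (expected in the working directory) generates the toy input file
   // tmva_example_multiple_background.root with the signal tree TreeS and the background
   // trees TreeB0, TreeB1 and TreeB2 used by Training() and ApplicationCreateCombinedTree();
   // the argument presumably sets the number of generated events.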


   cout << endl;
   cout << "========================" << endl;
   cout << "--- Training" << endl;
   TMVA::Training();

   cout << endl;
   cout << "========================" << endl;
   cout << "--- Application & create combined tree" << endl;
   TMVA::ApplicationCreateCombinedTree();

   cout << endl;
   cout << "========================" << endl;
   cout << "--- Maximize significance" << endl;
   TMVA::MaximizeSignificance();
}
