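//_______________________________________________________________________
//
// MethodBDT (Boosted Decision Trees)
//
// Implementation of the TMVA boosted-decision-tree classifier: a forest of
// decision trees is trained on successively reweighted versions of the
// training sample (AdaBoost, AdaBoostR2, gradient boost or bagging), and the
// individual tree responses are combined into one MVA output for
// classification, multiclass and regression problems.
//_______________________________________________________________________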
#include <algorithm>
#include <limits>      // std::numeric_limits, used in the MVA evaluation

#include <math.h>
#include <fstream>

#include "Riostream.h"
#include "TRandom3.h"
#include "TMath.h"
#include "TObjString.h"

#include "TMVA/ClassifierFactory.h"
#include "TMVA/MethodBDT.h"
#include "TMVA/Tools.h"
#include "TMVA/Timer.h"
#include "TMVA/Ranking.h"
#include "TMVA/SdivSqrtSplusB.h"
#include "TMVA/BinarySearchTree.h"
#include "TMVA/SeparationBase.h"
#include "TMVA/GiniIndex.h"
#include "TMVA/GiniIndexWithLaplace.h"
#include "TMVA/CrossEntropy.h"
#include "TMVA/MisClassificationError.h"
#include "TMVA/Results.h"
#include "TMVA/ResultsMulticlass.h"
#include "TMVA/Interval.h"

using std::vector;
using std::pair;        // pair/make_pair are used unqualified below
using std::make_pair;
00134
00135 REGISTER_METHOD(BDT)
00136
00137 ClassImp(TMVA::MethodBDT)
00138
00139 const Int_t TMVA::MethodBDT::fgDebugLevel = 0;
00140
00141
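//_______________________________________________________________________
// Standard constructor used when training the classifier: all tuning members
// are zero-initialised here and configured later from the option string.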
00142 TMVA::MethodBDT::MethodBDT( const TString& jobName,
00143 const TString& methodTitle,
00144 DataSetInfo& theData,
00145 const TString& theOption,
00146 TDirectory* theTargetDir ) :
00147 TMVA::MethodBase( jobName, Types::kBDT, methodTitle, theData, theOption, theTargetDir )
00148 , fNTrees(0)
00149 , fRenormByClass(0)
00150 , fAdaBoostBeta(0)
00151 , fTransitionPoint(0)
00152 , fShrinkage(0)
00153 , fBaggedGradBoost(kFALSE)
00154 , fSampleFraction(0)
00155 , fSumOfWeights(0)
00156 , fNodeMinEvents(0)
00157 , fNCuts(0)
00158 , fUseFisherCuts(0)
00159 , fMinLinCorrForFisher(.8)
00160 , fUseExclusiveVars(0)
00161 , fUseYesNoLeaf(kFALSE)
00162 , fNodePurityLimit(0)
00163 , fUseWeightedTrees(kFALSE)
00164 , fNNodesMax(0)
00165 , fMaxDepth(0)
00166 , fPruneMethod(DecisionTree::kNoPruning)
00167 , fPruneStrength(0)
00168 , fPruneBeforeBoost(kFALSE)
00169 , fFValidationEvents(0)
00170 , fAutomatic(kFALSE)
00171 , fRandomisedTrees(kFALSE)
00172 , fUseNvars(0)
00173 , fUsePoissonNvars(0)
00174 , fUseNTrainEvents(0)
00175 , fSampleSizeFraction(0)
00176 , fNoNegWeightsInTraining(kFALSE)
00177 , fITree(0)
00178 , fBoostWeight(0)
00179 , fErrorFraction(0)
00180 {
00181
00182 fMonitorNtuple = NULL;
00183 fSepType = NULL;
00184 }
00185
00186
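//_______________________________________________________________________
// Constructor used when the method is instantiated from an existing weight
// file (testing/application only, no training).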
00187 TMVA::MethodBDT::MethodBDT( DataSetInfo& theData,
00188 const TString& theWeightFile,
00189 TDirectory* theTargetDir )
00190 : TMVA::MethodBase( Types::kBDT, theData, theWeightFile, theTargetDir )
00191 , fNTrees(0)
00192 , fRenormByClass(0)
00193 , fAdaBoostBeta(0)
00194 , fTransitionPoint(0)
00195 , fShrinkage(0)
00196 , fBaggedGradBoost(kFALSE)
00197 , fSampleFraction(0)
00198 , fSumOfWeights(0)
00199 , fNodeMinEvents(0)
00200 , fNCuts(0)
00201 , fUseFisherCuts(0)
00202 , fMinLinCorrForFisher(.8)
00203 , fUseExclusiveVars(0)
00204 , fUseYesNoLeaf(kFALSE)
00205 , fNodePurityLimit(0)
00206 , fUseWeightedTrees(kFALSE)
00207 , fNNodesMax(0)
00208 , fMaxDepth(0)
00209 , fPruneMethod(DecisionTree::kNoPruning)
00210 , fPruneStrength(0)
00211 , fPruneBeforeBoost(kFALSE)
00212 , fFValidationEvents(0)
00213 , fAutomatic(kFALSE)
00214 , fRandomisedTrees(kFALSE)
00215 , fUseNvars(0)
00216 , fUsePoissonNvars(0)
00217 , fUseNTrainEvents(0)
00218 , fSampleSizeFraction(0)
00219 , fNoNegWeightsInTraining(kFALSE)
00220 , fITree(0)
00221 , fBoostWeight(0)
00222 , fErrorFraction(0)
00223 {
00224 fMonitorNtuple = NULL;
00225 fSepType = NULL;
00226
00227
00228
00229
00230 }
00231
00232
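//_______________________________________________________________________
// BDT can handle two-class classification, multiclass classification, and
// regression with one regression target.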
00233 Bool_t TMVA::MethodBDT::HasAnalysisType( Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets )
00234 {
00235
00236 if (type == Types::kClassification && numberClasses == 2) return kTRUE;
00237 if (type == Types::kMulticlass ) return kTRUE;
00238 if( type == Types::kRegression && numberTargets == 1 ) return kTRUE;
00239 return kFALSE;
00240 }
00241
00242
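//_______________________________________________________________________
// Define the options (their key words) that can be set in the option string.
//
// As a hedged usage sketch (not part of this file): with the option names
// declared below, booking the classifier from a TMVA::Factory would
// typically look like
//
//    factory->BookMethod( TMVA::Types::kBDT, "BDT",
//                         "NTrees=400:MaxDepth=3:BoostType=AdaBoost:AdaBoostBeta=1.0:"
//                         "SeparationType=GiniIndex:nCuts=20:PruneMethod=NoPruning" );
//
// where every token left of '=' is one of the options declared in this method.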
00243 void TMVA::MethodBDT::DeclareOptions()
00244 {
00245
00246
00247
00248
00249
00250
00251
00252
00253
00254
00255
00256
00257
00258
00259
00260
00261
00262
00263
00264
00265
00266
00267
00268
00269
00270
00271
00272
00273
00274
00275
00276
00277
00278
00279
00280
00281
00282
00283
00284 DeclareOptionRef(fNTrees, "NTrees", "Number of trees in the forest");
00285 DeclareOptionRef(fRenormByClass=kFALSE,"RenormByClass","individually re-normalize each event class to the original size after boosting");
00286 DeclareOptionRef(fBoostType, "BoostType", "Boosting type for the trees in the forest");
00287 AddPreDefVal(TString("AdaBoost"));
00288 AddPreDefVal(TString("Bagging"));
00289 AddPreDefVal(TString("RegBoost"));
00290 AddPreDefVal(TString("AdaBoostR2"));
00291 AddPreDefVal(TString("Grad"));
00292 if (DoRegression()) {
00293 fBoostType = "AdaBoostR2";
00294 }else{
00295 fBoostType = "AdaBoost";
00296 }
   DeclareOptionRef(fAdaBoostR2Loss="Quadratic", "AdaBoostR2Loss", "Type of loss function used by AdaBoostR2 (Linear, Quadratic or Exponential)");
00298 AddPreDefVal(TString("Linear"));
00299 AddPreDefVal(TString("Quadratic"));
00300 AddPreDefVal(TString("Exponential"));
00301
00302 DeclareOptionRef(fBaggedGradBoost=kFALSE, "UseBaggedGrad","Use only a random subsample of all events for growing the trees in each iteration. (Only valid for GradBoost)");
00303 DeclareOptionRef(fSampleFraction=0.6, "GradBaggingFraction","Defines the fraction of events to be used in each iteration when UseBaggedGrad=kTRUE. (Only valid for GradBoost)");
00304 DeclareOptionRef(fShrinkage=1.0, "Shrinkage", "Learning rate for GradBoost algorithm");
00305 DeclareOptionRef(fAdaBoostBeta=1.0, "AdaBoostBeta", "Parameter for AdaBoost algorithm");
   DeclareOptionRef(fRandomisedTrees,"UseRandomisedTrees","Choose a random subset of variables at each node split");
00307 DeclareOptionRef(fUseNvars,"UseNvars","Number of variables used if randomised tree option is chosen");
   DeclareOptionRef(fUsePoissonNvars,"UsePoissonNvars", "use *UseNvars* not as a fixed number but as the mean of a Poisson distribution in each split");
00309 DeclareOptionRef(fUseNTrainEvents,"UseNTrainEvents","number of randomly picked training events used in randomised (and bagged) trees");
00310
00311 DeclareOptionRef(fUseWeightedTrees=kTRUE, "UseWeightedTrees",
00312 "Use weighted trees or simple average in classification from the forest");
00313 DeclareOptionRef(fUseYesNoLeaf=kTRUE, "UseYesNoLeaf",
00314 "Use Sig or Bkg categories, or the purity=S/(S+B) as classification of the leaf node");
00315 if (DoRegression()) {
00316 fUseYesNoLeaf = kFALSE;
00317 }
00318
00319
00320 DeclareOptionRef(fNodePurityLimit=0.5, "NodePurityLimit", "In boosting/pruning, nodes with purity > NodePurityLimit are signal; background otherwise.");
00321 DeclareOptionRef(fSepTypeS, "SeparationType", "Separation criterion for node splitting");
00322 AddPreDefVal(TString("CrossEntropy"));
00323 AddPreDefVal(TString("GiniIndex"));
00324 AddPreDefVal(TString("GiniIndexWithLaplace"));
00325 AddPreDefVal(TString("MisClassificationError"));
00326 AddPreDefVal(TString("SDivSqrtSPlusB"));
00327 AddPreDefVal(TString("RegressionVariance"));
00328 if (DoRegression()) {
00329 fSepTypeS = "RegressionVariance";
00330 }else{
00331 fSepTypeS = "GiniIndex";
00332 }
   DeclareOptionRef(fNodeMinEvents, "nEventsMin", "Minimum number of events required in a leaf node (default: Classification: max(40, N_train/(Nvar^2)/10), Regression: 10)");
00334 DeclareOptionRef(fNCuts, "nCuts", "Number of steps during node cut optimisation");
   DeclareOptionRef(fUseFisherCuts=kFALSE, "UseFisherCuts", "use multivariate splits using the Fisher criterion");
   DeclareOptionRef(fMinLinCorrForFisher=.8,"MinLinCorrForFisher", "the minimum linear correlation between two variables demanded for use in the Fisher criterion in node splitting");
   DeclareOptionRef(fUseExclusiveVars=kFALSE,"UseExclusiveVars","individual variables already used in the Fisher criterion are not analysed individually any more for node splitting");
00338
00339 DeclareOptionRef(fPruneStrength, "PruneStrength", "Pruning strength");
00340 DeclareOptionRef(fPruneMethodS, "PruneMethod", "Method used for pruning (removal) of statistically insignificant branches");
00341 AddPreDefVal(TString("NoPruning"));
00342 AddPreDefVal(TString("ExpectedError"));
00343 AddPreDefVal(TString("CostComplexity"));
00344 DeclareOptionRef(fPruneBeforeBoost=kFALSE, "PruneBeforeBoost", "Flag to prune the tree before applying boosting algorithm");
00345 DeclareOptionRef(fFValidationEvents=0.5, "PruningValFraction", "Fraction of events to use for optimizing automatic pruning.");
00346 DeclareOptionRef(fNNodesMax=100000,"NNodesMax","Max number of nodes in tree");
00347 if (DoRegression()) {
00348 DeclareOptionRef(fMaxDepth=50,"MaxDepth","Max depth of the decision tree allowed");
00349 }else{
00350 DeclareOptionRef(fMaxDepth=3,"MaxDepth","Max depth of the decision tree allowed");
00351 }
00352 }
00353
00354 void TMVA::MethodBDT::DeclareCompatibilityOptions() {
00355 MethodBase::DeclareCompatibilityOptions();
00356 DeclareOptionRef(fSampleSizeFraction=1.0,"SampleSizeFraction","Relative size of bagged event sample to original size of the data sample" );
00357 DeclareOptionRef(fNoNegWeightsInTraining,"NoNegWeightsInTraining","Ignore negative event weights in the training process" );
00358 }
00359
00360
00361
00362
00363
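//_______________________________________________________________________
// Translate the option strings into the internal configuration (separation
// criterion, pruning method, boost type, ...) and check their consistency.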
00364 void TMVA::MethodBDT::ProcessOptions()
00365 {
00366
00367
00368 fSepTypeS.ToLower();
00369 if (fSepTypeS == "misclassificationerror") fSepType = new MisClassificationError();
00370 else if (fSepTypeS == "giniindex") fSepType = new GiniIndex();
00371 else if (fSepTypeS == "giniindexwithlaplace") fSepType = new GiniIndexWithLaplace();
00372 else if (fSepTypeS == "crossentropy") fSepType = new CrossEntropy();
00373 else if (fSepTypeS == "sdivsqrtsplusb") fSepType = new SdivSqrtSplusB();
00374 else if (fSepTypeS == "regressionvariance") fSepType = NULL;
00375 else {
00376 Log() << kINFO << GetOptions() << Endl;
00377 Log() << kFATAL << "<ProcessOptions> unknown Separation Index option " << fSepTypeS << " called" << Endl;
00378 }
00379
00380 fPruneMethodS.ToLower();
00381 if (fPruneMethodS == "expectederror") fPruneMethod = DecisionTree::kExpectedErrorPruning;
00382 else if (fPruneMethodS == "costcomplexity") fPruneMethod = DecisionTree::kCostComplexityPruning;
00383 else if (fPruneMethodS == "nopruning") fPruneMethod = DecisionTree::kNoPruning;
00384 else {
00385 Log() << kINFO << GetOptions() << Endl;
00386 Log() << kFATAL << "<ProcessOptions> unknown PruneMethod " << fPruneMethodS << " option called" << Endl;
00387 }
00388 if (fPruneStrength < 0 && (fPruneMethod != DecisionTree::kNoPruning) && fBoostType!="Grad") fAutomatic = kTRUE;
00389 else fAutomatic = kFALSE;
00390 if (fAutomatic && fPruneMethod==DecisionTree::kExpectedErrorPruning){
00391 Log() << kFATAL
00392 << "Sorry autmoatic pruning strength determination is not implemented yet for ExpectedErrorPruning" << Endl;
00393 }
00394 fAdaBoostR2Loss.ToLower();
00395
00396 if (fBoostType!="Grad") fBaggedGradBoost=kFALSE;
00397 else fPruneMethod = DecisionTree::kNoPruning;
00398 if (fFValidationEvents < 0.0) fFValidationEvents = 0.0;
00399 if (fAutomatic && fFValidationEvents > 0.5) {
00400 Log() << kWARNING << "You have chosen to use more than half of your training sample "
00401 << "to optimize the automatic pruning algorithm. This is probably wasteful "
00402 << "and your overall results will be degraded. Are you sure you want this?"
00403 << Endl;
00404 }
00405
00406
00407 if (this->Data()->HasNegativeEventWeights()){
00408 Log() << kINFO << " You are using a Monte Carlo that has also negative weights. "
00409 << "That should in principle be fine as long as on average you end up with "
00410 << "something positive. For this you have to make sure that the minimal number "
00411 << "of (un-weighted) events demanded for a tree node (currently you use: nEventsMin="
00412 <<fNodeMinEvents<<", you can set this via the BDT option string when booking the "
00413 << "classifier) is large enough to allow for reasonable averaging!!! "
00414 << " If this does not help.. maybe you want to try the option: IgnoreNegWeightsInTraining "
00415 << "which ignores events with negative weight in the training. " << Endl
00416 << Endl << "Note: You'll get a WARNING message during the training if that should ever happen" << Endl;
00417 }
00418
00419 if (DoRegression()) {
00420 if (fUseYesNoLeaf && !IsConstructedFromWeightFile()){
00421 Log() << kWARNING << "Regression Trees do not work with fUseYesNoLeaf=TRUE --> I will set it to FALSE" << Endl;
00422 fUseYesNoLeaf = kFALSE;
00423 }
00424
00425 if (fSepType != NULL){
00426 Log() << kWARNING << "Regression Trees do not work with Separation type other than <RegressionVariance> --> I will use it instead" << Endl;
00427 fSepType = NULL;
00428 }
00429 }
00430 if (fRandomisedTrees){
00431 Log() << kINFO << " Randomised trees use no pruning" << Endl;
00432 fPruneMethod = DecisionTree::kNoPruning;
00433
00434 }
00435
00436
00437
00438
00439
00440
00441
00442 if (fNTrees==0){
00443 Log() << kERROR << " Zero Decision Trees demanded... that does not work !! "
00444 << " I set it to 1 .. just so that the program does not crash"
00445 << Endl;
00446 fNTrees = 1;
00447 }
00448
00449 }
00450
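//_______________________________________________________________________
// Common initialisation with default values for the BDT method.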
00451 void TMVA::MethodBDT::Init( void )
00452 {
00453
00454
00455 fNTrees = 400;
00456 if (fAnalysisType == Types::kClassification || fAnalysisType == Types::kMulticlass ) {
00457 fMaxDepth = 3;
00458 fBoostType = "AdaBoost";
00459 if(DataInfo().GetNClasses()!=0)
00460 fNodeMinEvents = TMath::Max( Int_t(40), Int_t( Data()->GetNTrainingEvents() / (10*GetNvar()*GetNvar())) );
00461 }else {
00462 fMaxDepth = 50;
00463 fBoostType = "AdaBoostR2";
00464 fAdaBoostR2Loss = "Quadratic";
00465 if(DataInfo().GetNClasses()!=0)
00466 fNodeMinEvents = 10;
00467 }
00468
00469 fNCuts = 20;
00470 fPruneMethodS = "NoPruning";
00471 fPruneMethod = DecisionTree::kNoPruning;
00472 fPruneStrength = 0;
00473 fAutomatic = kFALSE;
00474 fFValidationEvents = 0.5;
00475 fRandomisedTrees = kFALSE;
00476
00477 fUseNvars = UInt_t(TMath::Sqrt(GetNvar())+0.6);
00478 fUsePoissonNvars = kTRUE;
00479 if(DataInfo().GetNClasses()!=0)
00480 fUseNTrainEvents = Data()->GetNTrainingEvents();
00481 fNNodesMax = 1000000;
00482 fShrinkage = 1.0;
00483 fSumOfWeights = 0.0;
00484
00485
00486 SetSignalReferenceCut( 0 );
00487
00488 }
00489
00490
00491
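//_______________________________________________________________________
// Reset the method as if it had just been instantiated: delete the forest and
// clear boost weights, residuals, variable importances and monitoring data.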
00492 void TMVA::MethodBDT::Reset( void )
00493 {
00494
00495
00496
00497
00498
00499
00500 for (UInt_t i=0; i<fForest.size(); i++) delete fForest[i];
00501 fForest.clear();
00502
00503 fBoostWeights.clear();
   if (fMonitorNtuple) fMonitorNtuple->Delete();
   fMonitorNtuple = NULL;
00505 fVariableImportance.clear();
00506 fResiduals.clear();
00507
00508
00509
00510 if (Data()) Data()->DeleteResults(GetMethodName(), Types::kTraining, GetAnalysisType());
   Log() << kDEBUG << " successfully(?) reset the method " << Endl;
00512 }
00513
00514
00515
00516 TMVA::MethodBDT::~MethodBDT( void )
00517 {
00518
00519 for (UInt_t i=0; i<fEventSample.size(); i++) delete fEventSample[i];
00520 for (UInt_t i=0; i<fValidationSample.size(); i++) delete fValidationSample[i];
00521 for (UInt_t i=0; i<fForest.size(); i++) delete fForest[i];
00522 }
00523
00524
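//_______________________________________________________________________
// Copy the training events into fEventSample; if automatic pruning is used, a
// fraction of them is split off into fValidationSample. Events with negative
// weights are optionally skipped.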
00525 void TMVA::MethodBDT::InitEventSample( void )
00526 {
00527
00528
00529
00530 if (!HasTrainingTree()) Log() << kFATAL << "<Init> Data().TrainingTree() is zero pointer" << Endl;
00531
00532 if (fEventSample.size() > 0) {
00533
00534 for (UInt_t iev=0; iev<fEventSample.size(); iev++) fEventSample[iev]->SetBoostWeight(1.);
00535 } else {
00536
00537 UInt_t nevents = Data()->GetNTrainingEvents();
00538 Bool_t first=kTRUE;
00539
00540 for (UInt_t ievt=0; ievt<nevents; ievt++) {
00541
00542 Event* event = new Event( *GetTrainingEvent(ievt) );
00543
00544 if (!IgnoreEventsWithNegWeightsInTraining() || event->GetWeight() > 0) {
00545 if (first && event->GetWeight() < 0) {
00546 first = kFALSE;
00547 Log() << kWARNING << "Events with negative event weights are USED during "
00548 << "the BDT training. This might cause problems with small node sizes "
00549 << "or with the boosting. Please remove negative events from training "
00550 << "using the option *IgnoreEventsWithNegWeightsInTraining* in case you "
00551 << "observe problems with the boosting"
00552 << Endl;
00553 }
00554
00555 if (fAutomatic) {
00556 Double_t modulo = 1.0/(fFValidationEvents);
00557 Int_t imodulo = static_cast<Int_t>( fmod(modulo,1.0) > 0.5 ? ceil(modulo) : floor(modulo) );
00558 if (ievt % imodulo == 0) fValidationSample.push_back( event );
00559 else fEventSample.push_back( event );
00560 }
00561 else {
00562 fEventSample.push_back(event);
00563 }
00564 } else {
00565 delete event;
00566 }
      }

      // report the training/validation split once, after the loop over all events
      if (fAutomatic) {
         Log() << kINFO << "<InitEventSample> Internally I use " << fEventSample.size()
               << " for Training and " << fValidationSample.size()
               << " for Pruning Validation (" << ((Float_t)fValidationSample.size())/((Float_t)fEventSample.size()+fValidationSample.size())*100.0
               << "% of training used for validation)" << Endl;
      }
   }
00575 }
00576
00577
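//_______________________________________________________________________
// Call the parameter optimizer with the BDT parameters and ranges that are
// meant to be tuned (MaxDepth, NodeMinEvents, NTrees).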
00578 std::map<TString,Double_t> TMVA::MethodBDT::OptimizeTuningParameters(TString fomType, TString fitType)
00579 {
00580
00581
00582
00583
00584 std::map<TString,TMVA::Interval> tuneParameters;
00585 std::map<TString,Double_t> tunedParameters;
00586
00587
00588
00589
00590
00591
00592
00593
00594 Int_t N = Int_t( Data()->GetNEvtSigTrain()) ;
00595 Int_t min = TMath::Max( 20, ( ( N/10000 - (N/10000)%10) ) );
00596 Int_t max = TMath::Max( min*10, TMath::Min( 10000, ( ( N/10 - (N/10) %100) ) ) );
00597
00598 tuneParameters.insert(std::pair<TString,Interval>("MaxDepth", Interval(1,10,10)));
00599 tuneParameters.insert(std::pair<TString,Interval>("NodeMinEvents", Interval(min,max,10)));
00600 tuneParameters.insert(std::pair<TString,Interval>("NTrees", Interval(50,1000,20)));
00601
00602
00603
00604 Log() << kINFO << "Automatic optimisation of tuning parameters in BDT uses:" << Endl;
00605
00606 std::map<TString,TMVA::Interval>::iterator it;
00607
00608 for (it=tuneParameters.begin(); it!=tuneParameters.end();it++) {
00609 Log() << kINFO << it->first
00610 << " in range from: " << it->second.GetMin()
00611 << " to: " << it->second.GetMax()
00612 << " in : " << it->second.GetNbins() << " steps"
00613 << Endl;
00614 }
00615 Log() << kINFO << " using the options: " << fomType << " and " << fitType << Endl;
00616 OptimizeConfigParameters optimize(this, tuneParameters, fomType, fitType);
00617 tunedParameters=optimize.optimize();
00618
00619 return tunedParameters;
00620
00621 }
00622
00623
00624 void TMVA::MethodBDT::SetTuneParameters(std::map<TString,Double_t> tuneParameters)
00625 {
00626
00627
00628 std::map<TString,Double_t>::iterator it;
00629 for(it=tuneParameters.begin(); it!= tuneParameters.end(); it++){
00630 if (it->first == "MaxDepth" ) SetMaxDepth ((Int_t)it->second);
00631 if (it->first == "NodeMinEvents" ) SetNodeMinEvents ((Int_t)it->second);
00632 if (it->first == "NTrees" ) SetNTrees ((Int_t)it->second);
00633 if (it->first == "NodePurityLimit") SetNodePurityLimit (it->second);
00634 if (it->first == "AdaBoostBeta" ) SetAdaBoostBeta (it->second);
00635 }
00636
00637 }
00638
00639
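//_______________________________________________________________________
// BDT training: grow fNTrees decision trees, boost the event weights after
// each tree, and fill the monitoring histograms and ntuple.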
00640 void TMVA::MethodBDT::Train()
00641 {
00642
00643 TMVA::DecisionTreeNode::fgIsTraining=true;
00644
00645
00646
00647
00648 InitEventSample();
00649
00650 if (fNTrees==0){
00651 Log() << kERROR << " Zero Decision Trees demanded... that does not work !! "
00652 << " I set it to 1 .. just so that the program does not crash"
00653 << Endl;
00654 fNTrees = 1;
00655 }
00656
00657
00658
00659 if (IsNormalised()) Log() << kFATAL << "\"Normalise\" option cannot be used with BDT; "
00660 << "please remove the option from the configuration string, or "
00661 << "use \"!Normalise\""
00662 << Endl;
00663
00664 Log() << kINFO << "Training "<< fNTrees << " Decision Trees ... patience please" << Endl;
00665
00666 Log() << kDEBUG << "Training with maximal depth = " <<fMaxDepth
00667 << ", NodeMinEvents=" << fNodeMinEvents
00668 << ", NTrees="<<fNTrees
00669 << ", NodePurityLimit="<<fNodePurityLimit
00670 << ", AdaBoostBeta="<<fAdaBoostBeta
00671 << Endl;
00672
00673
00674 Int_t nBins;
00675 Double_t xMin,xMax;
   TString hname = "AdaBoost weight distribution";
00677
00678 nBins= 100;
00679 xMin = 0;
00680 xMax = 30;
00681
00682 if (DoRegression()) {
00683 nBins= 100;
00684 xMin = 0;
00685 xMax = 1;
00686 hname="Boost event weights distribution";
00687 }
00688
00689
00690
00691 TH1* h = new TH1F("BoostWeight",hname,nBins,xMin,xMax);
00692 TH1* nodesBeforePruningVsTree = new TH1I("NodesBeforePruning","nodes before pruning",fNTrees,0,fNTrees);
00693 TH1* nodesAfterPruningVsTree = new TH1I("NodesAfterPruning","nodes after pruning",fNTrees,0,fNTrees);
00694
00695 if(!DoMulticlass()){
00696 Results* results = Data()->GetResults(GetMethodName(), Types::kTraining, GetAnalysisType());
00697
00698 h->SetXTitle("boost weight");
00699 results->Store(h, "BoostWeights");
00700
00701
00702 h = new TH1F("BoostWeightVsTree","Boost weights vs tree",fNTrees,0,fNTrees);
00703 h->SetXTitle("#tree");
00704 h->SetYTitle("boost weight");
00705 results->Store(h, "BoostWeightsVsTree");
00706
00707
00708 h = new TH1F("ErrFractHist","error fraction vs tree number",fNTrees,0,fNTrees);
00709 h->SetXTitle("#tree");
00710 h->SetYTitle("error fraction");
00711 results->Store(h, "ErrorFrac");
00712
00713
00714 nodesBeforePruningVsTree->SetXTitle("#tree");
00715 nodesBeforePruningVsTree->SetYTitle("#tree nodes");
00716 results->Store(nodesBeforePruningVsTree);
00717
00718
00719 nodesAfterPruningVsTree->SetXTitle("#tree");
00720 nodesAfterPruningVsTree->SetYTitle("#tree nodes");
00721 results->Store(nodesAfterPruningVsTree);
00722
00723 }
00724
00725 fMonitorNtuple= new TTree("MonitorNtuple","BDT variables");
00726 fMonitorNtuple->Branch("iTree",&fITree,"iTree/I");
00727 fMonitorNtuple->Branch("boostWeight",&fBoostWeight,"boostWeight/D");
00728 fMonitorNtuple->Branch("errorFraction",&fErrorFraction,"errorFraction/D");
00729
00730 Timer timer( fNTrees, GetName() );
00731 Int_t nNodesBeforePruningCount = 0;
00732 Int_t nNodesAfterPruningCount = 0;
00733
00734 Int_t nNodesBeforePruning = 0;
00735 Int_t nNodesAfterPruning = 0;
00736
00737 if(fBoostType=="Grad"){
00738 InitGradBoost(fEventSample);
00739 }
00740
00741 for (int itree=0; itree<fNTrees; itree++) {
00742 timer.DrawProgressBar( itree );
00743 if(DoMulticlass()){
00744 if (fBoostType!="Grad"){
00745 Log() << kFATAL << "Multiclass is currently only supported by gradient boost. "
00746 << "Please change boost option accordingly (GradBoost)."
00747 << Endl;
00748 }
00749 UInt_t nClasses = DataInfo().GetNClasses();
00750 for (UInt_t i=0;i<nClasses;i++){
00751 fForest.push_back( new DecisionTree( fSepType, fNodeMinEvents, fNCuts, i,
00752 fRandomisedTrees, fUseNvars, fUsePoissonNvars, fNNodesMax, fMaxDepth,
00753 itree*nClasses+i, fNodePurityLimit, itree*nClasses+i));
00754 if (fUseFisherCuts) {
00755 fForest.back()->SetUseFisherCuts();
00756 fForest.back()->SetMinLinCorrForFisher(fMinLinCorrForFisher);
00757 fForest.back()->SetUseExclusiveVars(fUseExclusiveVars);
00758 }
00759
00760
00761 if (fBaggedGradBoost){
00762 nNodesBeforePruning = fForest.back()->BuildTree(fSubSample);
00763 fBoostWeights.push_back(this->Boost(fSubSample, fForest.back(), itree, i));
00764 }
00765 else{
00766 nNodesBeforePruning = fForest.back()->BuildTree(fEventSample);
00767 fBoostWeights.push_back(this->Boost(fEventSample, fForest.back(), itree, i));
00768 }
00769 }
00770 }
00771 else{
00772
00773 fForest.push_back( new DecisionTree( fSepType, fNodeMinEvents, fNCuts, 0,
00774 fRandomisedTrees, fUseNvars, fUsePoissonNvars, fNNodesMax, fMaxDepth,
00775 itree, fNodePurityLimit, itree));
00776 if (fUseFisherCuts) {
00777 fForest.back()->SetUseFisherCuts();
00778 fForest.back()->SetMinLinCorrForFisher(fMinLinCorrForFisher);
00779 fForest.back()->SetUseExclusiveVars(fUseExclusiveVars);
00780 }
00781 if (fBaggedGradBoost) nNodesBeforePruning = fForest.back()->BuildTree(fSubSample);
00782 else nNodesBeforePruning = fForest.back()->BuildTree(fEventSample);
00783
00784 if (fBoostType!="Grad")
00785 if (fUseYesNoLeaf && !DoRegression() ){
00786 nNodesBeforePruning = fForest.back()->CleanTree();
00787 }
00788 nNodesBeforePruningCount += nNodesBeforePruning;
00789 nodesBeforePruningVsTree->SetBinContent(itree+1,nNodesBeforePruning);
00790
00791 fForest.back()->SetPruneMethod(fPruneMethod);
00792 fForest.back()->SetPruneStrength(fPruneStrength);
00793
00794 std::vector<Event*> * validationSample = NULL;
00795 if(fAutomatic) validationSample = &fValidationSample;
00796
00797 if(fBoostType=="Grad"){
00798 if(fBaggedGradBoost)
00799 fBoostWeights.push_back(this->Boost(fSubSample, fForest.back(), itree));
00800 else
00801 fBoostWeights.push_back(this->Boost(fEventSample, fForest.back(), itree));
00802 }
00803 else {
00804 if(!fPruneBeforeBoost) {
00805 fBoostWeights.push_back( this->Boost(fEventSample, fForest.back(), itree) );
00806
00807
00808
00809 fForest.back()->PruneTree(validationSample);
00810 }
00811 else {
00812 fForest.back()->PruneTree(validationSample);
00813 fBoostWeights.push_back( this->Boost(fEventSample, fForest.back(), itree) );
00814 }
00815
00816 if (fUseYesNoLeaf && !DoRegression() ){
00817 fForest.back()->CleanTree();
00818 }
00819 }
00820 nNodesAfterPruning = fForest.back()->GetNNodes();
00821 nNodesAfterPruningCount += nNodesAfterPruning;
00822 nodesAfterPruningVsTree->SetBinContent(itree+1,nNodesAfterPruning);
00823
00824 fITree = itree;
00825 fMonitorNtuple->Fill();
00826 }
00827 }
00828
00829
00830 Log() << kINFO << "<Train> elapsed time: " << timer.GetElapsedTime()
00831 << " " << Endl;
00832 if (fPruneMethod == DecisionTree::kNoPruning) {
00833 Log() << kINFO << "<Train> average number of nodes (w/o pruning) : "
00834 << nNodesBeforePruningCount/fNTrees << Endl;
00835 }
00836 else {
00837 Log() << kINFO << "<Train> average number of nodes before/after pruning : "
00838 << nNodesBeforePruningCount/fNTrees << " / "
00839 << nNodesAfterPruningCount/fNTrees
00840 << Endl;
00841 }
00842 TMVA::DecisionTreeNode::fgIsTraining=false;
00843 }
00844
00845
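//_______________________________________________________________________
// Fill fSubSample with a random subset of fEventSample: each event is kept
// with probability fSampleFraction (used for bagged gradient boost).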
00846 void TMVA::MethodBDT::GetRandomSubSample()
00847 {
00848
00849 UInt_t nevents = fEventSample.size();
00850
00851 if (fSubSample.size()!=0) fSubSample.clear();
00852 TRandom3 *trandom = new TRandom3(fForest.size()+1);
00853
00854 for (UInt_t ievt=0; ievt<nevents; ievt++) {
00855 if(trandom->Rndm()<fSampleFraction)
00856 fSubSample.push_back(fEventSample[ievt]);
00857 }
00858 }
00859
00860
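//_______________________________________________________________________
// Gradient-boost MVA value: the raw sum of tree responses F is mapped to the
// interval [-1,1] via 2/(1+exp(-2F)) - 1 (a rescaled sigmoid).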
00861 Double_t TMVA::MethodBDT::GetGradBoostMVA(TMVA::Event& e, UInt_t nTrees)
00862 {
00863
00864 Double_t sum=0;
00865 for (UInt_t itree=0; itree<nTrees; itree++) {
00866
00867 sum += fForest[itree]->CheckEvent(e,kFALSE);
00868
00869 }
00870 return 2.0/(1.0+exp(-2.0*sum))-1;
00871 }
00872
00873
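//_______________________________________________________________________
// Recompute the per-event targets (pseudo-residuals of the logistic or
// multinomial loss) after the latest tree has been added to the forest.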
00874 void TMVA::MethodBDT::UpdateTargets(vector<TMVA::Event*> eventSample, UInt_t cls)
00875 {
00876
00877
00878 if(DoMulticlass()){
00879 UInt_t nClasses = DataInfo().GetNClasses();
00880 for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
00881 fResiduals[*e].at(cls)+=fForest.back()->CheckEvent(*(*e),kFALSE);
00882 if(cls == nClasses-1){
00883 for(UInt_t i=0;i<nClasses;i++){
00884 Double_t norm = 0.0;
00885 for(UInt_t j=0;j<nClasses;j++){
00886 if(i!=j)
00887 norm+=exp(fResiduals[*e].at(j)-fResiduals[*e].at(i));
00888 }
00889 Double_t p_cls = 1.0/(1.0+norm);
00890 Double_t res = ((*e)->GetClass()==i)?(1.0-p_cls):(-p_cls);
00891 (*e)->SetTarget(i,res);
00892 }
00893 }
00894 }
00895 }
00896 else{
00897 for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
00898 fResiduals[*e].at(0)+=fForest.back()->CheckEvent(*(*e),kFALSE);
00899 Double_t p_sig=1.0/(1.0+exp(-2.0*fResiduals[*e].at(0)));
00900 Double_t res = (DataInfo().IsSignal(*e)?1:0)-p_sig;
00901 (*e)->SetTarget(0,res);
00902 }
00903 }
00904 }
00905
00906
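//_______________________________________________________________________
// Recompute the regression targets: the residuals are reduced by the latest
// tree response and then clipped at the 70% weighted quantile of their
// absolute values (fTransitionPoint), as used by the Huber-like loss.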
00907 void TMVA::MethodBDT::UpdateTargetsRegression(vector<TMVA::Event*> eventSample, Bool_t first)
00908 {
00909
00910 for (vector<TMVA::Event*>::iterator e=fEventSample.begin(); e!=fEventSample.end();e++) {
00911 if(!first){
00912 fWeightedResiduals[*e].first -= fForest.back()->CheckEvent(*(*e),kFALSE);
00913 }
00914
00915 }
00916
00917 fSumOfWeights = 0;
00918 vector< pair<Double_t, Double_t> > temp;
00919 for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++){
00920 temp.push_back(make_pair(fabs(fWeightedResiduals[*e].first),fWeightedResiduals[*e].second));
00921 fSumOfWeights += (*e)->GetWeight();
00922 }
00923 fTransitionPoint = GetWeightedQuantile(temp,0.7,fSumOfWeights);
00924
00925 Int_t i=0;
00926 for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
00927
00928 if(temp[i].first<=fTransitionPoint)
00929 (*e)->SetTarget(0,fWeightedResiduals[*e].first);
00930 else
00931 (*e)->SetTarget(0,fTransitionPoint*(fWeightedResiduals[*e].first<0?-1.0:1.0));
00932 i++;
00933 }
00934 }
00935
00936
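//_______________________________________________________________________
// Return the value below which the fraction "quantile" of the total weight
// "norm" is accumulated; vec holds (value, weight) pairs.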
Double_t TMVA::MethodBDT::GetWeightedQuantile(vector< pair<Double_t, Double_t> > vec, const Double_t quantile, const Double_t norm){

   if (vec.empty()) return 0;               // nothing to do for an empty sample

   Double_t temp = 0.0;
   std::sort(vec.begin(), vec.end());
   UInt_t i = 0;
   while (i < vec.size() && temp <= norm*quantile) {
      temp += vec[i].second;
      i++;
   }
   if (i >= vec.size()) i = vec.size() - 1; // guard against stepping past the last element

   return vec[i].first;
}
00949
00950
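//_______________________________________________________________________
// Assign the response of each leaf of the new tree: a Newton-step-like
// estimate sum(target*w) / sum(|target|*(1-|target|)*w^2), scaled by the
// learning rate fShrinkage (divided by the number of classes).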
00951 Double_t TMVA::MethodBDT::GradBoost( vector<TMVA::Event*> eventSample, DecisionTree *dt, UInt_t cls)
00952 {
00953
00954 std::map<TMVA::DecisionTreeNode*,vector<Double_t> > leaves;
00955 for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
00956 Double_t weight = (*e)->GetWeight();
00957 TMVA::DecisionTreeNode* node = dt->GetEventNode(*(*e));
00958 if ((leaves[node]).size()==0){
00959 (leaves[node]).push_back((*e)->GetTarget(cls)* weight);
00960 (leaves[node]).push_back(fabs((*e)->GetTarget(cls))*(1.0-fabs((*e)->GetTarget(cls))) * weight* weight);
00961 }
00962 else {
00963 (leaves[node])[0]+=((*e)->GetTarget(cls)* weight);
00964 (leaves[node])[1]+=fabs((*e)->GetTarget(cls))*(1.0-fabs((*e)->GetTarget(cls))) * weight* weight;
00965 }
00966 }
00967 for (std::map<TMVA::DecisionTreeNode*,vector<Double_t> >::iterator iLeave=leaves.begin();
00968 iLeave!=leaves.end();++iLeave){
00969 if ((iLeave->second)[1]<1e-30) (iLeave->second)[1]=1e-30;
00970
00971 (iLeave->first)->SetResponse(fShrinkage/DataInfo().GetNClasses()*(iLeave->second)[0]/((iLeave->second)[1]));
00972 }
00973
00974
00975 if (fBaggedGradBoost){
00976 GetRandomSubSample();
00977 }
00978 DoMulticlass() ? UpdateTargets(fEventSample, cls) : UpdateTargets(fEventSample);
00979 return 1;
00980 }
00981
00982
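//_______________________________________________________________________
// Leaf responses for gradient-boosted regression: each leaf gets the weighted
// median of its residuals plus a mean correction clipped at fTransitionPoint
// (Huber-type loss), scaled by fShrinkage.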
00983 Double_t TMVA::MethodBDT::GradBoostRegression( vector<TMVA::Event*> eventSample, DecisionTree *dt )
00984 {
00985
00986 std::map<TMVA::DecisionTreeNode*,Double_t > leaveWeights;
00987 std::map<TMVA::DecisionTreeNode*,vector< pair<Double_t, Double_t> > > leaves;
00988 UInt_t i =0;
00989 for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
00990 TMVA::DecisionTreeNode* node = dt->GetEventNode(*(*e));
00991 (leaves[node]).push_back(make_pair(fWeightedResiduals[*e].first,(*e)->GetWeight()));
00992 (leaveWeights[node]) += (*e)->GetWeight();
00993 i++;
00994 }
00995
00996 for (std::map<TMVA::DecisionTreeNode*,vector< pair<Double_t, Double_t> > >::iterator iLeave=leaves.begin();
00997 iLeave!=leaves.end();++iLeave){
00998 Double_t shift=0,diff= 0;
00999 Double_t ResidualMedian = GetWeightedQuantile(iLeave->second,0.5,leaveWeights[iLeave->first]);
01000 for(UInt_t j=0;j<((iLeave->second).size());j++){
01001 diff = (iLeave->second)[j].first-ResidualMedian;
01002 shift+=1.0/((iLeave->second).size())*((diff<0)?-1.0:1.0)*TMath::Min(fTransitionPoint,fabs(diff));
01003 }
01004 (iLeave->first)->SetResponse(fShrinkage*(ResidualMedian+shift));
01005 }
01006
01007 if (fBaggedGradBoost){
01008 GetRandomSubSample();
01009 UpdateTargetsRegression(fSubSample);
01010 }
01011 else
01012 UpdateTargetsRegression(fEventSample);
01013 return 1;
01014 }
01015
01016
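//_______________________________________________________________________
// Initialise the gradient boost: set the initial residuals/targets for the
// regression, multiclass and two-class cases, and draw the first bagged
// subsample if requested.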
01017 void TMVA::MethodBDT::InitGradBoost( vector<TMVA::Event*> eventSample)
01018 {
01019
01020 fSumOfWeights = 0;
01021 fSepType=NULL;
01022 std::vector<std::pair<Double_t, Double_t> > temp;
01023 if(DoRegression()){
01024 for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
01025 fWeightedResiduals[*e]= make_pair((*e)->GetTarget(0), (*e)->GetWeight());
01026 fSumOfWeights+=(*e)->GetWeight();
01027 temp.push_back(make_pair(fWeightedResiduals[*e].first,fWeightedResiduals[*e].second));
01028 }
01029 Double_t weightedMedian = GetWeightedQuantile(temp,0.5, fSumOfWeights);
01030
01031
01032 fBoostWeights.push_back(weightedMedian);
01033 std::map<TMVA::Event*, std::pair<Double_t, Double_t> >::iterator res = fWeightedResiduals.begin();
01034 for (; res!=fWeightedResiduals.end(); ++res ) {
01035
01036 (*res).second.first -= weightedMedian;
01037 }
01038 if (fBaggedGradBoost){
01039 GetRandomSubSample();
01040 UpdateTargetsRegression(fSubSample,kTRUE);
01041 }
01042 else
01043 UpdateTargetsRegression(fEventSample,kTRUE);
01044 return;
01045 }
01046 else if(DoMulticlass()){
01047 UInt_t nClasses = DataInfo().GetNClasses();
01048 for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
01049 for (UInt_t i=0;i<nClasses;i++){
01050
01051 Double_t r = (*e)->GetClass()==i?(1-1.0/nClasses):(-1.0/nClasses);
01052 (*e)->SetTarget(i,r);
01053 fResiduals[*e].push_back(0);
01054 }
01055 }
01056 }
01057 else{
01058 for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
01059 Double_t r = (DataInfo().IsSignal(*e)?1:0)-0.5;
01060 (*e)->SetTarget(0,r);
01061 fResiduals[*e].push_back(0);
01062 }
01063 }
01064 if (fBaggedGradBoost) GetRandomSubSample();
01065 }
01066
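//_______________________________________________________________________
// Test the quality of a tree on the validation sample: return the weighted
// fraction of correctly classified events.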
01067 Double_t TMVA::MethodBDT::TestTreeQuality( DecisionTree *dt )
01068 {
01069
01070
01071 Double_t ncorrect=0, nfalse=0;
01072 for (UInt_t ievt=0; ievt<fValidationSample.size(); ievt++) {
01073 Bool_t isSignalType= (dt->CheckEvent(*(fValidationSample[ievt])) > fNodePurityLimit ) ? 1 : 0;
01074
01075 if (isSignalType == (DataInfo().IsSignal(fValidationSample[ievt])) ) {
01076 ncorrect += fValidationSample[ievt]->GetWeight();
01077 }
01078 else{
01079 nfalse += fValidationSample[ievt]->GetWeight();
01080 }
01081 }
01082
01083 return ncorrect / (ncorrect + nfalse);
01084 }
01085
01086
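//_______________________________________________________________________
// Dispatch to the boosting algorithm selected via the "BoostType" option and
// return the weight assigned to the new tree.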
01087 Double_t TMVA::MethodBDT::Boost( vector<TMVA::Event*> eventSample, DecisionTree *dt, Int_t iTree, UInt_t cls )
01088 {
01089
01090
01091
01092 if (fBoostType=="AdaBoost") return this->AdaBoost (eventSample, dt);
01093 else if (fBoostType=="Bagging") return this->Bagging (eventSample, iTree);
01094 else if (fBoostType=="RegBoost") return this->RegBoost (eventSample, dt);
01095 else if (fBoostType=="AdaBoostR2") return this->AdaBoostR2(eventSample, dt);
01096 else if (fBoostType=="Grad"){
01097 if(DoRegression())
01098 return this->GradBoostRegression(eventSample, dt);
01099 else if(DoMulticlass())
01100 return this->GradBoost (eventSample, dt, cls);
01101 else
01102 return this->GradBoost (eventSample, dt);
01103 }
01104 else {
01105 Log() << kINFO << GetOptions() << Endl;
01106 Log() << kFATAL << "<Boost> unknown boost option " << fBoostType<< " called" << Endl;
01107 }
01108
01109 return -1;
01110 }
01111
01112
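//_______________________________________________________________________
// AdaBoost: the weighted misclassification fraction "err" of the current tree
// defines the boost weight
//
//    boostWeight = ((1 - err) / err)^AdaBoostBeta
//
// Misclassified events get their weight multiplied by boostWeight (events with
// negative weight are divided by it), the sum of weights is renormalised to
// its previous value, and the tree enters the forest with weight
// log(boostWeight).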
01113 Double_t TMVA::MethodBDT::AdaBoost( vector<TMVA::Event*> eventSample, DecisionTree *dt )
01114 {
01115
01116
01117
01118
01119
01120
01121
01122
01123
01124
01125 Double_t err=0, sumGlobalw=0, sumGlobalwfalse=0, sumGlobalwfalse2=0;
01126
01127 vector<Double_t> sumw;
01128
01129 Double_t maxDev=0;
01130 for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
01131 Double_t w = (*e)->GetWeight();
01132 sumGlobalw += w;
01133 UInt_t iclass=(*e)->GetClass();
01134 if (iclass+1 > sumw.size()) sumw.resize(iclass+1,0);
01135 sumw[iclass] += w;
01136
01137 if ( DoRegression() ) {
01138 Double_t tmpDev = TMath::Abs(dt->CheckEvent(*(*e),kFALSE) - (*e)->GetTarget(0) );
01139 sumGlobalwfalse += w * tmpDev;
01140 sumGlobalwfalse2 += w * tmpDev*tmpDev;
01141 if (tmpDev > maxDev) maxDev = tmpDev;
01142 }else{
01143 Bool_t isSignalType = (dt->CheckEvent(*(*e),fUseYesNoLeaf) > fNodePurityLimit );
01144
01145 if (!(isSignalType == DataInfo().IsSignal(*e))) {
01146 sumGlobalwfalse+= w;
01147 }
01148 }
01149 }
01150 err = sumGlobalwfalse/sumGlobalw ;
01151 if ( DoRegression() ) {
01152
01153 if (fAdaBoostR2Loss=="linear"){
01154 err = sumGlobalwfalse/maxDev/sumGlobalw ;
01155 }
01156 else if (fAdaBoostR2Loss=="quadratic"){
01157 err = sumGlobalwfalse2/maxDev/maxDev/sumGlobalw ;
01158 }
01159 else if (fAdaBoostR2Loss=="exponential"){
01160 err = 0;
01161 for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
01162 Double_t w = (*e)->GetWeight();
01163 Double_t tmpDev = TMath::Abs(dt->CheckEvent(*(*e),kFALSE) - (*e)->GetTarget(0) );
01164 err += w * (1 - exp (-tmpDev/maxDev)) / sumGlobalw;
01165 }
01166
01167 }
01168 else {
01169 Log() << kFATAL << " you've chosen a Loss type for Adaboost other than linear, quadratic or exponential "
01170 << " namely " << fAdaBoostR2Loss << "\n"
01171 << "and this is not implemented... a typo in the options ??" <<Endl;
01172 }
01173 }
01174 Double_t newSumGlobalw=0;
01175 vector<Double_t> newSumw(sumw.size(),0);
01176
01177 Double_t boostWeight=1.;
01178 if (err >= 0.5) {
01179
01180
01181 if (dt->GetNNodes() == 1){
01182 Log() << kERROR << " YOUR tree has only 1 Node... kind of a funny *tree*. I cannot "
01183 << "boost such a thing... if after 1 step the error rate is == 0.5"
01184 << Endl
01185 << "please check why this happens, maybe too many events per node requested ?"
01186 << Endl;
01187
01188 }else{
01189 Log() << kERROR << " The error rate in the BDT boosting is > 0.5. ("<< err
01190 << ") That should not happen, please check your code (i.e... the BDT code), I "
01191 << " set it to 0.5.. just to continue.." << Endl;
01192 }
01193 err = 0.5;
01194 } else if (err < 0) {
01195 Log() << kERROR << " The error rate in the BDT boosting is < 0. That can happen"
01196 << " due to improper treatment of negative weights in a Monte Carlo.. (if you have"
01197 << " an idea on how to do it in a better way, please let me know (Helge.Voss@cern.ch)"
01198 << " for the time being I set it to its absolute value.. just to continue.." << Endl;
01199 err = TMath::Abs(err);
01200 }
01201 if (fAdaBoostBeta == 1) {
01202 boostWeight = (1.-err)/err;
01203 }
01204 else {
01205 boostWeight = TMath::Power((1.0 - err)/err, fAdaBoostBeta);
01206 }
01207
01208 Results* results = Data()->GetResults(GetMethodName(),Types::kTraining, Types::kMaxAnalysisType);
01209
01210 for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
01211
01212 if ((!( (dt->CheckEvent(*(*e),fUseYesNoLeaf) > fNodePurityLimit ) == DataInfo().IsSignal(*e))) || DoRegression()) {
01213 Double_t boostfactor = boostWeight;
01214 if (DoRegression()) boostfactor = TMath::Power(1/boostWeight,(1.-TMath::Abs(dt->CheckEvent(*(*e),kFALSE) - (*e)->GetTarget(0) )/maxDev ) );
01215 if ( (*e)->GetWeight() > 0 ){
01216 (*e)->SetBoostWeight( (*e)->GetBoostWeight() * boostfactor);
01217
01218 if (DoRegression()) results->GetHist("BoostWeights")->Fill(boostfactor);
01219 } else {
01220 (*e)->ScaleBoostWeight( 1. / boostfactor);
01221 }
01222 }
01223 newSumGlobalw+=(*e)->GetWeight();
01224 newSumw[(*e)->GetClass()] += (*e)->GetWeight();
01225 }
01226
01227
01228 Double_t globalNormWeight=sumGlobalw/newSumGlobalw;
01229 vector<Double_t> normWeightByClass;
01230 for (UInt_t i=0; i<sumw.size(); i++) normWeightByClass.push_back(sumw[i]/newSumw[i]);
01231
01232 for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
01233 if (fRenormByClass) (*e)->ScaleBoostWeight( normWeightByClass[(*e)->GetClass()] );
01234 else (*e)->ScaleBoostWeight( globalNormWeight );
01235 }
01236
01237 if (!(DoRegression()))results->GetHist("BoostWeights")->Fill(boostWeight);
01238 results->GetHist("BoostWeightsVsTree")->SetBinContent(fForest.size(),boostWeight);
01239 results->GetHist("ErrorFrac")->SetBinContent(fForest.size(),err);
01240
01241 fBoostWeight = boostWeight;
01242 fErrorFraction = err;
01243
01244 return TMath::Log(boostWeight);
01245 }
01246
01247
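//_______________________________________________________________________
// Bagging ("bootstrap aggregating"): resample the training events by drawing a
// new Poisson-distributed weight for each event, then renormalise so that the
// sum of weights equals the number of events.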
01248 Double_t TMVA::MethodBDT::Bagging( vector<TMVA::Event*> eventSample, Int_t iTree )
01249 {
01250
01251
01252 Double_t newSumw=0;
01253 Double_t newWeight;
01254 TRandom3 *trandom = new TRandom3(iTree);
   Double_t eventFraction = static_cast<Double_t>(fUseNTrainEvents) / Data()->GetNTrainingEvents(); // cast to avoid integer division
01256 for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
01257 newWeight = trandom->PoissonD(eventFraction);
01258 (*e)->SetBoostWeight(newWeight);
01259 newSumw+=(*e)->GetBoostWeight();
01260 }
01261 Double_t normWeight = eventSample.size() / newSumw ;
01262 for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
01263 (*e)->SetBoostWeight( (*e)->GetBoostWeight() * normWeight );
01264
01265 }
01266 delete trandom;
01267 return 1.;
01268 }
01269
01270
01271 Double_t TMVA::MethodBDT::RegBoost( vector<TMVA::Event*> , DecisionTree* )
01272 {
01273
01274
01275
01276 return 1;
01277 }
01278
01279
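//_______________________________________________________________________
// AdaBoost.R2: adaptation of AdaBoost to regression. The deviation of each
// event from its target, relative to the maximal deviation, defines the error
// and the per-event boost factor; the tree weight is log(1/boostWeight).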
01280 Double_t TMVA::MethodBDT::AdaBoostR2( vector<TMVA::Event*> eventSample, DecisionTree *dt )
01281 {
01282
01283
01284 if ( !DoRegression() ) Log() << kFATAL << "Somehow you chose a regression boost method for a classification job" << Endl;
01285
01286 Double_t err=0, sumw=0, sumwfalse=0, sumwfalse2=0;
01287 Double_t maxDev=0;
01288 for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
01289 Double_t w = (*e)->GetWeight();
01290 sumw += w;
01291
01292 Double_t tmpDev = TMath::Abs(dt->CheckEvent(*(*e),kFALSE) - (*e)->GetTarget(0) );
01293 sumwfalse += w * tmpDev;
01294 sumwfalse2 += w * tmpDev*tmpDev;
01295 if (tmpDev > maxDev) maxDev = tmpDev;
01296 }
01297
01298
01299 if (fAdaBoostR2Loss=="linear"){
01300 err = sumwfalse/maxDev/sumw ;
01301 }
01302 else if (fAdaBoostR2Loss=="quadratic"){
01303 err = sumwfalse2/maxDev/maxDev/sumw ;
01304 }
01305 else if (fAdaBoostR2Loss=="exponential"){
01306 err = 0;
01307 for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
01308 Double_t w = (*e)->GetWeight();
01309 Double_t tmpDev = TMath::Abs(dt->CheckEvent(*(*e),kFALSE) - (*e)->GetTarget(0) );
01310 err += w * (1 - exp (-tmpDev/maxDev)) / sumw;
01311 }
01312
01313 }
01314 else {
01315 Log() << kFATAL << " you've chosen a Loss type for Adaboost other than linear, quadratic or exponential "
01316 << " namely " << fAdaBoostR2Loss << "\n"
01317 << "and this is not implemented... a typo in the options ??" <<Endl;
01318 }
01319
01320
01321 if (err >= 0.5) {
01322
01323
01324 if (dt->GetNNodes() == 1){
01325 Log() << kERROR << " YOUR tree has only 1 Node... kind of a funny *tree*. I cannot "
01326 << "boost such a thing... if after 1 step the error rate is == 0.5"
01327 << Endl
01328 << "please check why this happens, maybe too many events per node requested ?"
01329 << Endl;
01330
01331 }else{
01332 Log() << kERROR << " The error rate in the BDT boosting is > 0.5. ("<< err
01333 << ") That should not happen, but is possible for regression trees, and"
01334 << " should trigger a stop for the boosting. please check your code (i.e... the BDT code), I "
01335 << " set it to 0.5.. just to continue.." << Endl;
01336 }
01337 err = 0.5;
01338 } else if (err < 0) {
01339 Log() << kERROR << " The error rate in the BDT boosting is < 0. That can happen"
01340 << " due to improper treatment of negative weights in a Monte Carlo.. (if you have"
01341 << " an idea on how to do it in a better way, please let me know (Helge.Voss@cern.ch)"
01342 << " for the time being I set it to its absolute value.. just to continue.." << Endl;
01343 err = TMath::Abs(err);
01344 }
01345
01346 Double_t boostWeight = err / (1.-err);
01347 Double_t newSumw=0;
01348
01349 Results* results = Data()->GetResults(GetMethodName(), Types::kTraining, Types::kMaxAnalysisType);
01350
01351 for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
01352 Double_t boostfactor = TMath::Power(boostWeight,(1.-TMath::Abs(dt->CheckEvent(*(*e),kFALSE) - (*e)->GetTarget(0) )/maxDev ) );
01353 results->GetHist("BoostWeights")->Fill(boostfactor);
01354
01355 if ( (*e)->GetWeight() > 0 ){
01356 Float_t newBoostWeight = (*e)->GetBoostWeight() * boostfactor;
01357 Float_t newWeight = (*e)->GetWeight() * (*e)->GetBoostWeight() * boostfactor;
01358 if (newWeight == 0) {
01359 Log() << kINFO << "Weight= " << (*e)->GetWeight() << Endl;
01360 Log() << kINFO << "BoostWeight= " << (*e)->GetBoostWeight() << Endl;
01361 Log() << kINFO << "boostweight="<<boostWeight << " err= " <<err << Endl;
01362 Log() << kINFO << "NewBoostWeight= " << newBoostWeight << Endl;
01363 Log() << kINFO << "boostfactor= " << boostfactor << Endl;
01364 Log() << kINFO << "maxDev = " << maxDev << Endl;
01365 Log() << kINFO << "tmpDev = " << TMath::Abs(dt->CheckEvent(*(*e),kFALSE) - (*e)->GetTarget(0) ) << Endl;
01366 Log() << kINFO << "target = " << (*e)->GetTarget(0) << Endl;
01367 Log() << kINFO << "estimate = " << dt->CheckEvent(*(*e),kFALSE) << Endl;
01368 }
01369 (*e)->SetBoostWeight( newBoostWeight );
01370
01371 } else {
01372 (*e)->SetBoostWeight( (*e)->GetBoostWeight() / boostfactor);
01373 }
01374 newSumw+=(*e)->GetWeight();
01375 }
01376
01377
01378 Double_t normWeight = sumw / newSumw;
01379 for (vector<TMVA::Event*>::iterator e=eventSample.begin(); e!=eventSample.end();e++) {
01380
01381
01382 (*e)->SetBoostWeight( (*e)->GetBoostWeight() * normWeight );
01383 }
01384
01385
01386 results->GetHist("BoostWeightsVsTree")->SetBinContent(fForest.size(),1./boostWeight);
01387 results->GetHist("ErrorFrac")->SetBinContent(fForest.size(),err);
01388
01389 fBoostWeight = boostWeight;
01390 fErrorFraction = err;
01391
01392 return TMath::Log(1./boostWeight);
01393 }
01394
01395
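//_______________________________________________________________________
// Write the forest (all decision trees and their boost weights) to XML.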
01396 void TMVA::MethodBDT::AddWeightsXMLTo( void* parent ) const
01397 {
01398
01399 void* wght = gTools().AddChild(parent, "Weights");
01400 gTools().AddAttr( wght, "NTrees", fForest.size() );
01401 gTools().AddAttr( wght, "AnalysisType", fForest.back()->GetAnalysisType() );
01402
01403 for (UInt_t i=0; i< fForest.size(); i++) {
01404 void* trxml = fForest[i]->AddXMLTo(wght);
01405 gTools().AddAttr( trxml, "boostWeight", fBoostWeights[i] );
01406 gTools().AddAttr( trxml, "itree", i );
01407 }
01408 }
01409
01410
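//_______________________________________________________________________
// Rebuild the forest from an XML weight file.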
01411 void TMVA::MethodBDT::ReadWeightsFromXML(void* parent) {
01412
01413
01414 UInt_t i;
01415 for (i=0; i<fForest.size(); i++) delete fForest[i];
01416 fForest.clear();
01417 fBoostWeights.clear();
01418
01419 UInt_t ntrees;
01420 UInt_t analysisType;
01421 Float_t boostWeight;
01422
01423 gTools().ReadAttr( parent, "NTrees", ntrees );
01424
01425 if(gTools().HasAttr(parent, "TreeType")) {
01426 gTools().ReadAttr( parent, "TreeType", analysisType );
01427 } else {
01428 gTools().ReadAttr( parent, "AnalysisType", analysisType );
01429 }
01430
01431 void* ch = gTools().GetChild(parent);
01432 i=0;
01433 while(ch) {
01434 fForest.push_back( dynamic_cast<DecisionTree*>( DecisionTree::CreateFromXML(ch, GetTrainingTMVAVersionCode()) ) );
01435 fForest.back()->SetAnalysisType(Types::EAnalysisType(analysisType));
01436 fForest.back()->SetTreeID(i++);
01437 gTools().ReadAttr(ch,"boostWeight",boostWeight);
01438 fBoostWeights.push_back(boostWeight);
01439 ch = gTools().GetNextChild(ch);
01440 }
01441 }
01442
01443
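//_______________________________________________________________________
// Read the forest from a plain-text weight stream (legacy format).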
01444 void TMVA::MethodBDT::ReadWeightsFromStream( istream& istr )
01445 {
01446
01447 TString dummy;
01448
01449 Int_t analysisType(0);
01450
01451
01452 istr >> dummy >> fNTrees;
01453 Log() << kINFO << "Read " << fNTrees << " Decision trees" << Endl;
01454
01455 for (UInt_t i=0;i<fForest.size();i++) delete fForest[i];
01456 fForest.clear();
01457 fBoostWeights.clear();
01458 Int_t iTree;
01459 Double_t boostWeight;
01460 for (int i=0;i<fNTrees;i++) {
01461 istr >> dummy >> iTree >> dummy >> boostWeight;
01462 if (iTree != i) {
01463 fForest.back()->Print( cout );
01464 Log() << kFATAL << "Error while reading weight file; mismatch iTree="
01465 << iTree << " i=" << i
01466 << " dummy " << dummy
01467 << " boostweight " << boostWeight
01468 << Endl;
01469 }
01470 fForest.push_back( new DecisionTree() );
01471 fForest.back()->SetAnalysisType(Types::EAnalysisType(analysisType));
01472 fForest.back()->SetTreeID(i);
01473 fForest.back()->Read(istr, GetTrainingTMVAVersionCode());
01474 fBoostWeights.push_back(boostWeight);
01475 }
01476 }
01477
01478
01479 Double_t TMVA::MethodBDT::GetMvaValue( Double_t* err, Double_t* errUpper ){
01480 return this->GetMvaValue( err, errUpper, 0 );
01481 }
01482
01483
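//_______________________________________________________________________
// Return the MVA value evaluated with (at most) useNTrees trees: for gradient
// boost the sigmoid-mapped sum of the tree responses, otherwise the
// (boost-weighted) average of the individual tree responses.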
01484 Double_t TMVA::MethodBDT::GetMvaValue( Double_t* err, Double_t* errUpper, UInt_t useNTrees )
01485 {
01486
01487
01488
01489
01490
01491 NoErrorCalc(err, errUpper);
01492
01493
01494
01495 UInt_t nTrees = fForest.size();
01496 if (useNTrees > 0 ) nTrees = useNTrees;
01497
01498 if (fBoostType=="Grad") return GetGradBoostMVA(const_cast<TMVA::Event&>(*GetEvent()),nTrees);
01499
01500 Double_t myMVA = 0;
01501 Double_t norm = 0;
01502 for (UInt_t itree=0; itree<nTrees; itree++) {
01503
01504 if (fUseWeightedTrees) {
01505 myMVA += fBoostWeights[itree] * fForest[itree]->CheckEvent(*GetEvent(),fUseYesNoLeaf);
01506 norm += fBoostWeights[itree];
01507 }
01508 else {
01509 myMVA += fForest[itree]->CheckEvent(*GetEvent(),fUseYesNoLeaf);
01510 norm += 1;
01511 }
01512 }
   return ( norm > std::numeric_limits<double>::epsilon() ) ? myMVA/norm : 0;
01514 }
01515
01516
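//_______________________________________________________________________
// Multiclass response: per-class sums of tree responses F_i, mapped through
// the softmax-like normalisation 1/(1 + sum_{j!=i} exp(F_j - F_i)).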
01517 const std::vector<Float_t>& TMVA::MethodBDT::GetMulticlassValues()
01518 {
01519
01520
01521 const TMVA::Event& e = *GetEvent();
01522 if (fMulticlassReturnVal == NULL) fMulticlassReturnVal = new std::vector<Float_t>();
01523 fMulticlassReturnVal->clear();
01524
01525 std::vector<double> temp;
01526
01527 UInt_t nClasses = DataInfo().GetNClasses();
01528 for(UInt_t iClass=0; iClass<nClasses; iClass++){
01529 temp.push_back(0.0);
01530 for(UInt_t itree = iClass; itree<fForest.size(); itree+=nClasses){
01531 temp[iClass] += fForest[itree]->CheckEvent(e,kFALSE);
01532 }
01533 }
01534
01535 for(UInt_t iClass=0; iClass<nClasses; iClass++){
01536 Double_t norm = 0.0;
01537 for(UInt_t j=0;j<nClasses;j++){
01538 if(iClass!=j)
01539 norm+=exp(temp[j]-temp[iClass]);
01540 }
01541 (*fMulticlassReturnVal).push_back(1.0/(1.0+norm));
01542 }
01543
01544
01545 return *fMulticlassReturnVal;
01546 }
01547
01548
01549
01550
01551
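//_______________________________________________________________________
// Regression value: for AdaBoostR2 an average of the tree responses around
// their weighted median, for gradient boost the sum of responses plus the
// initial offset, otherwise the (boost-weighted) average; finally the inverse
// target transformation is applied.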
01552 const std::vector<Float_t> & TMVA::MethodBDT::GetRegressionValues()
01553 {
01554
01555
01556
01557 if (fRegressionReturnVal == NULL) fRegressionReturnVal = new std::vector<Float_t>();
01558 fRegressionReturnVal->clear();
01559
01560 const Event * ev = GetEvent();
01561 Event * evT = new Event(*ev);
01562
01563 Double_t myMVA = 0;
01564 Double_t norm = 0;
01565 if (fBoostType=="AdaBoostR2") {
01566
01567
01568
01569
01570
01571
01572
01573
01574
01575
01576 vector< Double_t > response(fForest.size());
01577 vector< Double_t > weight(fForest.size());
01578 Double_t totalSumOfWeights = 0;
01579
01580 for (UInt_t itree=0; itree<fForest.size(); itree++) {
01581 response[itree] = fForest[itree]->CheckEvent(*ev,kFALSE);
01582 weight[itree] = fBoostWeights[itree];
01583 totalSumOfWeights += fBoostWeights[itree];
01584 }
01585
01586 vector< vector<Double_t> > vtemp;
01587 vtemp.push_back( response );
01588 vtemp.push_back( weight );
01589 gTools().UsefulSortAscending( vtemp );
01590
01591 Int_t t=0;
01592 Double_t sumOfWeights = 0;
01593 while (sumOfWeights <= totalSumOfWeights/2.) {
01594 sumOfWeights += vtemp[1][t];
01595 t++;
01596 }
01597
01598 Double_t rVal=0;
01599 Int_t count=0;
      // average the tree responses in a window around the weighted median;
      // use signed arithmetic so that t - fForest.size()/6 cannot wrap around
      Int_t iLow  = TMath::Max( 0, Int_t(t - Int_t(fForest.size()/6) - 0.5) );
      Int_t iHigh = TMath::Min( Int_t(fForest.size()), Int_t(t + Int_t(fForest.size()/6) + 0.5) );
      for (Int_t i = iLow; i < iHigh; i++) {
01602 count++;
01603 rVal+=vtemp[0][i];
01604 }
01605
01606 evT->SetTarget(0, rVal/Double_t(count) );
01607 }
01608 else if(fBoostType=="Grad"){
01609 for (UInt_t itree=0; itree<fForest.size(); itree++) {
01610 myMVA += fForest[itree]->CheckEvent(*ev,kFALSE);
01611 }
01612
01613 evT->SetTarget(0, myMVA+fBoostWeights[0] );
01614 }
01615 else{
01616 for (UInt_t itree=0; itree<fForest.size(); itree++) {
01617
01618 if (fUseWeightedTrees) {
01619 myMVA += fBoostWeights[itree] * fForest[itree]->CheckEvent(*ev,kFALSE);
01620 norm += fBoostWeights[itree];
01621 }
01622 else {
01623 myMVA += fForest[itree]->CheckEvent(*ev,kFALSE);
01624 norm += 1;
01625 }
01626 }
01627
      evT->SetTarget(0, ( norm > std::numeric_limits<double>::epsilon() ) ? myMVA/norm : 0 );
01629 }
01630
01631
01632
01633 const Event* evT2 = GetTransformationHandler().InverseTransform( evT );
01634 fRegressionReturnVal->push_back( evT2->GetTarget(0) );
01635
01636 delete evT;
01637
01638
01639 return *fRegressionReturnVal;
01640 }
01641
01642
01643 void TMVA::MethodBDT::WriteMonitoringHistosToFile( void ) const
01644 {
01645
01646
01647 Log() << kINFO << "Write monitoring histograms to file: " << BaseDir()->GetPath() << Endl;
01648
01649
01650
01651 fMonitorNtuple->Write();
01652 }
01653
01654
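//_______________________________________________________________________
// Return the relative importance of each input variable, summed over all
// trees of the forest and normalised so that the importances add up to one.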
01655 vector< Double_t > TMVA::MethodBDT::GetVariableImportance()
01656 {
01657
01658
01659
01660
01661
01662 fVariableImportance.resize(GetNvar());
01663 Double_t sum=0;
01664 for (int itree = 0; itree < fNTrees; itree++) {
01665 vector<Double_t> relativeImportance(fForest[itree]->GetVariableImportance());
01666 for (UInt_t i=0; i< relativeImportance.size(); i++) {
01667 fVariableImportance[i] += relativeImportance[i];
01668 }
01669 }
01670 for (UInt_t i=0; i< fVariableImportance.size(); i++) sum += fVariableImportance[i];
01671 for (UInt_t i=0; i< fVariableImportance.size(); i++) fVariableImportance[i] /= sum;
01672
01673 return fVariableImportance;
01674 }
01675
01676
01677 Double_t TMVA::MethodBDT::GetVariableImportance( UInt_t ivar )
01678 {
01679
01680
01681
01682
01683 vector<Double_t> relativeImportance = this->GetVariableImportance();
01684 if (ivar < (UInt_t)relativeImportance.size()) return relativeImportance[ivar];
01685 else Log() << kFATAL << "<GetVariableImportance> ivar = " << ivar << " is out of range " << Endl;
01686
01687 return -1;
01688 }
01689
01690
01691 const TMVA::Ranking* TMVA::MethodBDT::CreateRanking()
01692 {
01693
01694
01695
01696 fRanking = new Ranking( GetName(), "Variable Importance" );
01697 vector< Double_t> importance(this->GetVariableImportance());
01698
01699 for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
01700
01701 fRanking->AddRank( Rank( GetInputLabel(ivar), importance[ivar] ) );
01702 }
01703
01704 return fRanking;
01705 }
01706
01707
01708 void TMVA::MethodBDT::GetHelpMessage() const
01709 {
01710
01711
01712
01713
01714 Log() << Endl;
01715 Log() << gTools().Color("bold") << "--- Short description:" << gTools().Color("reset") << Endl;
01716 Log() << Endl;
01717 Log() << "Boosted Decision Trees are a collection of individual decision" << Endl;
01718 Log() << "trees which form a multivariate classifier by (weighted) majority " << Endl;
01719 Log() << "vote of the individual trees. Consecutive decision trees are " << Endl;
01720 Log() << "trained using the original training data set with re-weighted " << Endl;
01721 Log() << "events. By default, the AdaBoost method is employed, which gives " << Endl;
01722 Log() << "events that were misclassified in the previous tree a larger " << Endl;
01723 Log() << "weight in the training of the following tree." << Endl;
01724 Log() << Endl;
01725 Log() << "Decision trees are a sequence of binary splits of the data sample" << Endl;
01726 Log() << "using a single descriminant variable at a time. A test event " << Endl;
01727 Log() << "ending up after the sequence of left-right splits in a final " << Endl;
01728 Log() << "(\"leaf\") node is classified as either signal or background" << Endl;
01729 Log() << "depending on the majority type of training events in that node." << Endl;
01730 Log() << Endl;
01731 Log() << gTools().Color("bold") << "--- Performance optimisation:" << gTools().Color("reset") << Endl;
01732 Log() << Endl;
01733 Log() << "By the nature of the binary splits performed on the individual" << Endl;
01734 Log() << "variables, decision trees do not deal well with linear correlations" << Endl;
01735 Log() << "between variables (they need to approximate the linear split in" << Endl;
01736 Log() << "the two dimensional space by a sequence of splits on the two " << Endl;
01737 Log() << "variables individually). Hence decorrelation could be useful " << Endl;
01738 Log() << "to optimise the BDT performance." << Endl;
01739 Log() << Endl;
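
   // A minimal booking sketch for the decorrelation advice above (assumed,
   // typical TMVA usage rather than code from this file; "factory" stands for
   // a configured TMVA::Factory):
   //
   //    factory->BookMethod( TMVA::Types::kBDT, "BDT_D",
   //                         "!H:!V:NTrees=400:VarTransform=Decorrelate" );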
   Log() << gTools().Color("bold") << "--- Performance tuning via configuration options:" << gTools().Color("reset") << Endl;
   Log() << Endl;
   Log() << "The two most important parameters in the configuration are the" << Endl;
   Log() << "minimal number of events requested by a leaf node (option" << Endl;
   Log() << "\"nEventsMin\") and the pruning strength (option \"PruneStrength\"," << Endl;
   Log() << "discussed below). If the minimal leaf size is too large, detailed" << Endl;
   Log() << "features in the parameter space cannot be modelled. If it is too" << Endl;
   Log() << "small, the risk of overtraining rises." << Endl;
   Log() << " (Imagine the decision tree is split until the leaf node contains" << Endl;
   Log() << " only a single event. In such a case, no training event is " << Endl;
   Log() << " misclassified, while the situation will look very different" << Endl;
   Log() << " for the test sample.)" << Endl;
   Log() << Endl;
   Log() << "The default minimal number is currently set to " << Endl;
   Log() << " max(20, (N_training_events / N_variables^2 / 10)) " << Endl;
   Log() << "and can be changed by the user." << Endl;
   Log() << Endl;
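
   // Worked example of the default quoted above: with 40000 training events
   // and 4 input variables, max(20, 40000 / 4^2 / 10) = max(20, 250) = 250
   // events are required per leaf node.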
   Log() << "The other crucial parameter, the pruning strength (\"PruneStrength\")," << Endl;
   Log() << "is also related to overtraining. It is a regularisation parameter" << Endl;
   Log() << "that is used when determining, after the training, which splits" << Endl;
   Log() << "are considered statistically insignificant and are removed. The" << Endl;
   Log() << "user is advised to carefully watch the BDT screen output for" << Endl;
   Log() << "the comparison between efficiencies obtained on the training and" << Endl;
   Log() << "the independent test sample. They should agree within statistical" << Endl;
   Log() << "errors; a training efficiency that is significantly better than the" << Endl;
   Log() << "test efficiency is a sign of overtraining." << Endl;
}
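
// A minimal booking sketch tying together the two options discussed in the
// help text above (assumed, typical TMVA usage; the numerical values are
// illustrative rather than recommendations, and "factory" stands for a
// configured TMVA::Factory):
//
//    factory->BookMethod( TMVA::Types::kBDT, "BDT",
//                         "NTrees=400:nEventsMin=150:"
//                         "PruneMethod=CostComplexity:PruneStrength=4.5" );
//
// A negative PruneStrength is commonly used to let the method determine the
// pruning strength automatically.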


void TMVA::MethodBDT::MakeClassSpecific( std::ostream& fout, const TString& className ) const
{
   // write the BDT-specific part of the ROOT-independent standalone response
   // class: the data members, GetMvaValue__(), Initialize() (which
   // instantiates the forest) and Clear()
   fout << " std::vector<BDT_DecisionTreeNode*> fForest; // i.e. root nodes of decision trees" << endl;
   fout << " std::vector<double> fBoostWeights; // the weights applied in the individual boosts" << endl;
   fout << "};" << endl << endl;
   fout << "double " << className << "::GetMvaValue__( const std::vector<double>& inputValues ) const" << endl;
   fout << "{" << endl;
   fout << " double myMVA = 0;" << endl;
   if (fBoostType!="Grad"){
      fout << " double norm = 0;" << endl;
   }
   fout << " for (unsigned int itree=0; itree<fForest.size(); itree++){" << endl;
   fout << " BDT_DecisionTreeNode *current = fForest[itree];" << endl;
   fout << " while (current->GetNodeType() == 0) { //intermediate node" << endl;
   fout << " if (current->GoesRight(inputValues)) current=(BDT_DecisionTreeNode*)current->GetRight();" << endl;
   fout << " else current=(BDT_DecisionTreeNode*)current->GetLeft();" << endl;
   fout << " }" << endl;
   if (fBoostType=="Grad"){
      fout << " myMVA += current->GetResponse();" << endl;
   }
   else if (fUseWeightedTrees) {
      if (fUseYesNoLeaf) fout << " myMVA += fBoostWeights[itree] * current->GetNodeType();" << endl;
      else               fout << " myMVA += fBoostWeights[itree] * current->GetPurity();" << endl;
      fout << " norm += fBoostWeights[itree];" << endl;
   }
   else {
      if (fUseYesNoLeaf) fout << " myMVA += current->GetNodeType();" << endl;
      else               fout << " myMVA += current->GetPurity();" << endl;
      fout << " norm += 1.;" << endl;
   }
   fout << " }" << endl;
   if (fBoostType=="Grad"){
      fout << " return 2.0/(1.0+exp(-2.0*myMVA))-1.0;" << endl;
   }
   else fout << " return myMVA /= norm;" << endl;
   fout << "};" << endl << endl;
   fout << "void " << className << "::Initialize()" << endl;
   fout << "{" << endl;
   // write out the forest: one boost weight and one tree per boosting step
   for (int itree=0; itree<fNTrees; itree++) {
      fout << " // itree = " << itree << endl;
      fout << " fBoostWeights.push_back(" << fBoostWeights[itree] << ");" << endl;
      fout << " fForest.push_back( " << endl;
      this->MakeClassInstantiateNode((DecisionTreeNode*)fForest[itree]->GetRoot(), fout, className);
      fout << " );" << endl;
   }
   fout << " return;" << endl;
   fout << "};" << endl;
   fout << " " << endl;
   fout << "// Clean up" << endl;
   fout << "inline void " << className << "::Clear() " << endl;
   fout << "{" << endl;
   fout << " for (unsigned int itree=0; itree<fForest.size(); itree++) { " << endl;
   fout << " delete fForest[itree]; " << endl;
   fout << " }" << endl;
   fout << "}" << endl;
}
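
// A rough usage sketch of the code emitted above. The public interface that
// wraps GetMvaValue__ is produced by the common MethodBase machinery, so the
// class name "ReadBDT" and the constructor signature below are assumptions
// made for illustration only:
//
//    std::vector<std::string> names;           // training variables, in order
//    names.push_back("var1"); names.push_back("var2");
//    ReadBDT reader( names );                  // hypothetical generated class
//    std::vector<double> input;                // one value per variable
//    input.push_back(0.1); input.push_back(-1.2);
//    double mva = reader.GetMvaValue( input );
//
// For gradient boosting the returned value is squashed through
// 2/(1+exp(-2*sum)) - 1, which is simply tanh of the summed tree responses.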


void TMVA::MethodBDT::MakeClassSpecificHeader( std::ostream& fout, const TString& ) const
{
   // write the specific decision-tree node class (BDT_DecisionTreeNode) used
   // by the standalone classifier to the output stream
   fout << "#ifndef NN" << endl;
   fout << "#define NN new BDT_DecisionTreeNode" << endl;
   fout << "#endif" << endl;
   fout << " " << endl;
   fout << "#ifndef BDT_DecisionTreeNode__def" << endl;
   fout << "#define BDT_DecisionTreeNode__def" << endl;
   fout << " " << endl;
   fout << "class BDT_DecisionTreeNode {" << endl;
   fout << " " << endl;
   fout << "public:" << endl;
   fout << " " << endl;
   fout << " // constructor of an essentially \"empty\" node floating in space" << endl;
   fout << " BDT_DecisionTreeNode ( BDT_DecisionTreeNode* left," << endl;
   fout << " BDT_DecisionTreeNode* right," << endl;
   fout << " double cutValue, bool cutType, int selector," << endl;
   fout << " int nodeType, double purity, double response ) :" << endl;
   fout << " fLeft ( left )," << endl;
   fout << " fRight ( right )," << endl;
   fout << " fCutValue( cutValue )," << endl;
   fout << " fCutType ( cutType )," << endl;
   fout << " fSelector( selector )," << endl;
   fout << " fNodeType( nodeType )," << endl;
   fout << " fPurity ( purity )," << endl;
   fout << " fResponse( response ){}" << endl << endl;
   fout << " virtual ~BDT_DecisionTreeNode();" << endl << endl;
   fout << " // test event if it descends the tree at this node to the right" << endl;
   fout << " virtual bool GoesRight( const std::vector<double>& inputValues ) const;" << endl;
   fout << " BDT_DecisionTreeNode* GetRight( void ) {return fRight; };" << endl << endl;
   fout << " // test event if it descends the tree at this node to the left " << endl;
   fout << " virtual bool GoesLeft ( const std::vector<double>& inputValues ) const;" << endl;
   fout << " BDT_DecisionTreeNode* GetLeft( void ) { return fLeft; }; " << endl << endl;
   fout << " // return S/(S+B) (purity) at this node (from training)" << endl << endl;
   fout << " double GetPurity( void ) const { return fPurity; } " << endl;
   fout << " // return the node type" << endl;
   fout << " int GetNodeType( void ) const { return fNodeType; }" << endl;
   fout << " double GetResponse(void) const {return fResponse;}" << endl << endl;
   fout << "private:" << endl << endl;
   fout << " BDT_DecisionTreeNode* fLeft; // pointer to the left daughter node" << endl;
   fout << " BDT_DecisionTreeNode* fRight; // pointer to the right daughter node" << endl;
   fout << " double fCutValue; // cut value applied on this node to discriminate bkg against sig" << endl;
   fout << " bool fCutType; // true: if event variable > cutValue ==> signal , false otherwise" << endl;
   fout << " int fSelector; // index of variable used in node selection (decision tree) " << endl;
   fout << " int fNodeType; // Type of node: -1 == Bkg-leaf, 1 == Signal-leaf, 0 = internal " << endl;
   fout << " double fPurity; // Purity of node from training"<< endl;
   fout << " double fResponse; // Regression response value of node" << endl;
   fout << "}; " << endl;
   fout << " " << endl;
   fout << "//_______________________________________________________________________" << endl;
   fout << "BDT_DecisionTreeNode::~BDT_DecisionTreeNode()" << endl;
   fout << "{" << endl;
   fout << " if (fLeft != NULL) delete fLeft;" << endl;
   fout << " if (fRight != NULL) delete fRight;" << endl;
   fout << "}; " << endl;
   fout << " " << endl;
   fout << "//_______________________________________________________________________" << endl;
   fout << "bool BDT_DecisionTreeNode::GoesRight( const std::vector<double>& inputValues ) const" << endl;
   fout << "{" << endl;
   fout << " // test event if it descends the tree at this node to the right" << endl;
   fout << " bool result = (inputValues[fSelector] > fCutValue );" << endl;
   fout << " if (fCutType == true) return result; //the cuts are selecting Signal ;" << endl;
   fout << " else return !result;" << endl;
   fout << "}" << endl;
   fout << " " << endl;
   fout << "//_______________________________________________________________________" << endl;
   fout << "bool BDT_DecisionTreeNode::GoesLeft( const std::vector<double>& inputValues ) const" << endl;
   fout << "{" << endl;
   fout << " // test event if it descends the tree at this node to the left" << endl;
   fout << " if (!this->GoesRight(inputValues)) return true;" << endl;
   fout << " else return false;" << endl;
   fout << "}" << endl;
   fout << " " << endl;
   fout << "#endif" << endl;
   fout << " " << endl;
}


void TMVA::MethodBDT::MakeClassInstantiateNode( DecisionTreeNode *n, std::ostream& fout, const TString& className ) const
{
   // recursively descend the tree and write out the node instantiation code,
   // i.e. the nested NN(...) constructor calls used in Initialize()
   if (n == NULL) {
      Log() << kFATAL << "MakeClassInstantiateNode: started with undefined node" << Endl;
      return;
   }
   fout << "NN(" << endl;
   if (n->GetLeft() != NULL) {
      this->MakeClassInstantiateNode( (DecisionTreeNode*)n->GetLeft(), fout, className );
   }
   else {
      fout << "0";
   }
   fout << ", " << endl;
   if (n->GetRight() != NULL) {
      this->MakeClassInstantiateNode( (DecisionTreeNode*)n->GetRight(), fout, className );
   }
   else {
      fout << "0";
   }
   fout << ", " << endl
        << setprecision(6)
        << n->GetCutValue() << ", "
        << n->GetCutType() << ", "
        << n->GetSelector() << ", "
        << n->GetNodeType() << ", "
        << n->GetPurity() << ","
        << n->GetResponse() << ") ";
}
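
// For illustration, the recursion above emits nested constructor calls of the
// form sketched below for a single split with two leaf daughters (all numbers
// are made up; the argument order is left, right, cutValue, cutType, selector,
// nodeType, purity, response, as defined in MakeClassSpecificHeader):
//
//    fForest.push_back(
//    NN(
//    NN(0, 0, 0, 0, 0, -1, 0.12, 0),   // left daughter: background leaf
//    NN(0, 0, 0, 0, 0, 1, 0.88, 0),    // right daughter: signal leaf
//    1.2345, 1, 0, 0, 0.55, 0)
//     );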