MethodBDT.h
// @(#)root/tmva $Id: MethodBDT.h 37986 2011-02-04 21:42:15Z pcanal $
// Author: Andreas Hoecker, Joerg Stelzer, Helge Voss, Kai Voss

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis       *
 * Package: TMVA                                                                  *
 * Class  : MethodBDT  (Boosted Decision Trees)                                   *
 * Web    : http://tmva.sourceforge.net                                           *
 *                                                                                *
 * Description:                                                                   *
 *      Analysis of Boosted Decision Trees                                        *
 *                                                                                *
 * Authors (alphabetical):                                                        *
 *      Andreas Hoecker <Andreas.Hocker@cern.ch> - CERN, Switzerland              *
 *      Helge Voss      <Helge.Voss@cern.ch>     - MPI-K Heidelberg, Germany      *
 *      Kai Voss        <Kai.Voss@cern.ch>       - U. of Victoria, Canada         *
 *      Doug Schouten   <dschoute@sfu.ca>        - Simon Fraser U., Canada        *
 *      Jan Therhaag    <jan.therhaag@cern.ch>   - U. of Bonn, Germany            *
 *                                                                                *
 * Copyright (c) 2005:                                                            *
 *      CERN, Switzerland                                                         *
 *      U. of Victoria, Canada                                                    *
 *      MPI-K Heidelberg, Germany                                                 *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without             *
 * modification, are permitted according to the terms listed in LICENSE           *
 * (http://tmva.sourceforge.net/LICENSE)                                          *
 **********************************************************************************/

#ifndef ROOT_TMVA_MethodBDT
#define ROOT_TMVA_MethodBDT

//////////////////////////////////////////////////////////////////////////
//                                                                      //
// MethodBDT                                                            //
//                                                                      //
// Analysis of Boosted Decision Trees                                   //
//                                                                      //
//////////////////////////////////////////////////////////////////////////
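
// Usage (a minimal, hedged sketch; not part of this header): a BDT is normally
// booked through the TMVA Factory rather than constructed directly. The option
// string below is illustrative only; DeclareOptions() defines the
// authoritative set of configuration options.
//
//    TFile* outputFile = TFile::Open( "TMVA.root", "RECREATE" );
//    TMVA::Factory factory( "TMVAClassification", outputFile,
//                           "AnalysisType=Classification" );
//    // ... factory.AddVariable(...), factory.AddSignalTree(...), etc. ...
//    factory.BookMethod( TMVA::Types::kBDT, "BDT",
//                        "NTrees=400:MaxDepth=3:BoostType=AdaBoost:AdaBoostBeta=0.5:"
//                        "SeparationType=GiniIndex:nCuts=20" );
//    factory.TrainAllMethods();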

#include <vector>
#ifndef ROOT_TH2
#include "TH2.h"
#endif
#ifndef ROOT_TTree
#include "TTree.h"
#endif
#ifndef ROOT_TMVA_MethodBase
#include "TMVA/MethodBase.h"
#endif
#ifndef ROOT_TMVA_DecisionTree
#include "TMVA/DecisionTree.h"
#endif
#ifndef ROOT_TMVA_Event
#include "TMVA/Event.h"
#endif

namespace TMVA {

   class SeparationBase;

   class MethodBDT : public MethodBase {

   public:
      // constructor for training and reading
      MethodBDT( const TString& jobName,
                 const TString& methodTitle,
                 DataSetInfo& theData,
                 const TString& theOption = "",
                 TDirectory* theTargetDir = 0 );

      // constructor for calculating the BDT-MVA using previously generated decision trees
      MethodBDT( DataSetInfo& theData,
                 const TString& theWeightFile,
                 TDirectory* theTargetDir = NULL );

      virtual ~MethodBDT( void );

      virtual Bool_t HasAnalysisType( Types::EAnalysisType type, UInt_t numberClasses, UInt_t numberTargets );


      // copy all events from the tree into a vector of Events
      // that is easier to manipulate
      void InitEventSample();

      // optimize tuning parameters
      virtual std::map<TString,Double_t> OptimizeTuningParameters(TString fomType="ROCIntegral", TString fitType="FitGA");
      virtual void SetTuneParameters(std::map<TString,Double_t> tuneParameters);
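
      // Example (a hedged sketch; "bdt" stands for a MethodBDT* obtained from
      // the framework): the returned map associates option names with their
      // optimized values and can be fed back via SetTuneParameters.
      //    std::map<TString,Double_t> tuned =
      //       bdt->OptimizeTuningParameters( "ROCIntegral", "FitGA" );
      //    bdt->SetTuneParameters( tuned );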

      // training method
      void Train( void );

      // revoke training
      void Reset( void );

      using MethodBase::ReadWeightsFromStream;

      // write weights to file
      void AddWeightsXMLTo( void* parent ) const;

      // read weights from file
      void ReadWeightsFromStream( istream& istr );
      void ReadWeightsFromXML(void* parent);

      // write method specific histos to target file
      void WriteMonitoringHistosToFile( void ) const;

      // calculate the MVA value
      Double_t GetMvaValue( Double_t* err = 0, Double_t* errUpper = 0);
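
      // Application example (a hedged sketch; file and variable names are
      // illustrative): the MVA value is usually obtained through TMVA::Reader
      // rather than by calling GetMvaValue() directly. Variables must be
      // registered with the same names and order as used in training.
      //    TMVA::Reader reader( "!Color" );
      //    Float_t var1; reader.AddVariable( "var1", &var1 );
      //    reader.BookMVA( "BDT", "weights/TMVAClassification_BDT.weights.xml" );
      //    Double_t mva = reader.EvaluateMVA( "BDT" );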

   private:
      Double_t GetMvaValue( Double_t* err, Double_t* errUpper, UInt_t useNTrees );

   public:
      const std::vector<Float_t>& GetMulticlassValues();

      // regression response
      const std::vector<Float_t>& GetRegressionValues();

      // apply the boost algorithm to a tree in the collection
      Double_t Boost( std::vector<TMVA::Event*>, DecisionTree *dt, Int_t iTree, UInt_t cls = 0);

      // ranking of input variables
      const Ranking* CreateRanking();

      // the option handling methods
      void DeclareOptions();
      void ProcessOptions();
      void SetMaxDepth(Int_t d){fMaxDepth = d;}
      void SetNodeMinEvents(Int_t d){fNodeMinEvents = d;}
      void SetNTrees(Int_t d){fNTrees = d;}
      void SetAdaBoostBeta(Double_t b){fAdaBoostBeta = b;}
      void SetNodePurityLimit(Double_t l){fNodePurityLimit = l;}
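
      // Example (sketch): these setters mirror the corresponding option-string
      // parameters and allow programmatic (re)configuration, e.g. when
      // applying tuned parameters:
      //    SetNTrees( 800 );
      //    SetMaxDepth( 4 );
      //    SetAdaBoostBeta( 0.3 );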


      // get the forest
      inline const std::vector<TMVA::DecisionTree*> & GetForest() const;

      // get the training events
      inline const std::vector<TMVA::Event*> & GetTrainingEvents() const;

      inline const std::vector<double> & GetBoostWeights() const;

      // return the individual relative variable importance
      std::vector<Double_t> GetVariableImportance();
      Double_t GetVariableImportance(UInt_t ivar);

      Double_t TestTreeQuality( DecisionTree *dt );

      // make ROOT-independent C++ class for classifier response (classifier-specific implementation)
      void MakeClassSpecific( std::ostream&, const TString& ) const;

      // header and auxiliary classes
      void MakeClassSpecificHeader( std::ostream&, const TString& ) const;

      void MakeClassInstantiateNode( DecisionTreeNode *n, std::ostream& fout,
                                     const TString& className ) const;

      void GetHelpMessage() const;

      virtual Bool_t        IsSignalLike() { return GetMvaValue() > 0; }
   protected:
      void DeclareCompatibilityOptions();

   private:
      // Init used in the various constructors
      void Init( void );

      // boosting algorithm (adaptive boosting)
      Double_t AdaBoost( std::vector<TMVA::Event*>, DecisionTree *dt );

      // boosting as a random re-weighting
      Double_t Bagging( std::vector<TMVA::Event*>, Int_t iTree );

      // boosting special for regression
      Double_t RegBoost( std::vector<TMVA::Event*>, DecisionTree *dt );

      // AdaBoost adapted to regression
      Double_t AdaBoostR2( std::vector<TMVA::Event*>, DecisionTree *dt );

      // binomial likelihood gradient boost for classification
      // (see Friedman: "Greedy Function Approximation: a Gradient Boosting Machine",
      // Technical Report, Dept. of Statistics, Stanford University)
      Double_t GradBoost( std::vector<TMVA::Event*>, DecisionTree *dt, UInt_t cls = 0);
      Double_t GradBoostRegression(std::vector<TMVA::Event*>, DecisionTree *dt );
      void InitGradBoost( std::vector<TMVA::Event*>);
      void UpdateTargets( std::vector<TMVA::Event*>, UInt_t cls = 0);
      void UpdateTargetsRegression( std::vector<TMVA::Event*>,Bool_t first=kFALSE);
      Double_t GetGradBoostMVA(TMVA::Event& e, UInt_t nTrees);
      void GetRandomSubSample();
      Double_t GetWeightedQuantile(std::vector<std::pair<Double_t, Double_t> > vec, const Double_t quantile, const Double_t SumOfWeights = 0.0);
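
      // Sketch of the gradient-boost update (following Friedman's two-class
      // logistic formulation, not a verbatim transcription of this class):
      // with labels y_i in {-1,+1} and current ensemble response F(x_i), each
      // iteration fits a tree to the pseudo-residuals
      //    r_i = 2*y_i / ( 1 + exp( 2*y_i*F(x_i) ) )
      // and the leaf responses, scaled by the learning rate fShrinkage, are
      // added to F; UpdateTargets() recomputes the r_i after each tree.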

      std::vector<TMVA::Event*>       fEventSample;     // the training events
      std::vector<TMVA::Event*>       fValidationSample;// the validation events
      std::vector<TMVA::Event*>       fSubSample;       // subsample for bagged grad boost
      Int_t                           fNTrees;          // number of decision trees requested
      std::vector<DecisionTree*>      fForest;          // the collection of decision trees
      std::vector<double>             fBoostWeights;    // the weights applied in the individual boosts
      Bool_t                          fRenormByClass;   // individually re-normalize each event class to the original size after boosting
      TString                         fBoostType;       // string specifying the boost type
      Double_t                        fAdaBoostBeta;    // beta parameter for the AdaBoost algorithm
      TString                         fAdaBoostR2Loss;  // loss type used in AdaBoostR2 (Linear, Quadratic or Exponential)
      Double_t                        fTransitionPoint; // break-down point for gradient regression
      Double_t                        fShrinkage;       // learning rate for gradient boost
      Bool_t                          fBaggedGradBoost; // turn bagging in combination with grad boost on/off
      Double_t                        fSampleFraction;  // fraction of events used for bagged grad boost
      Double_t                        fSumOfWeights;    // sum of all event weights
      std::map< TMVA::Event*, std::pair<Double_t, Double_t> >       fWeightedResiduals;  // weighted regression residuals
      std::map< TMVA::Event*,std::vector<double> > fResiduals; // individual event residuals for gradient boost

      // options for the decision tree
      SeparationBase                 *fSepType;         // the separation criterion used in node splitting
      TString                         fSepTypeS;        // the separation criterion (option string) used in node splitting
      Int_t                           fNodeMinEvents;   // min number of events in a node

      Int_t                           fNCuts;           // number of grid points used for the cut scan in node splitting
      Bool_t                          fUseFisherCuts;   // use multivariate splits based on the Fisher criterion
      Double_t                        fMinLinCorrForFisher; // the minimum linear correlation between two variables demanded for use in the Fisher criterion in node splitting
      Bool_t                          fUseExclusiveVars; // variables already used in the Fisher criterion are no longer analysed individually for node splitting
      Bool_t                          fUseYesNoLeaf;    // use a binary sig/bkg classification in leaf nodes, or the sig/bkg purity
      Double_t                        fNodePurityLimit; // purity limit for sig/bkg nodes
      Bool_t                          fUseWeightedTrees;// use the average classification from the trees, or weight the individual trees in the forest (e.g. by log(boostweight) from AdaBoost)
      UInt_t                          fNNodesMax;       // max number of nodes
      UInt_t                          fMaxDepth;        // max tree depth

      DecisionTree::EPruneMethod       fPruneMethod;     // method used for pruning
      TString                          fPruneMethodS;    // prune method option string
      Double_t                         fPruneStrength;   // a parameter setting the "amount" of pruning; needs to be adjusted
      Bool_t                           fPruneBeforeBoost;// flag to prune before boosting
      Double_t                         fFValidationEvents;    // fraction of events to use for pruning
      Bool_t                           fAutomatic;       // determine the prune strength automatically from a validation sample rather than using the user-given value
      Bool_t                           fRandomisedTrees; // choose a random subset of possible cut variables at each node during training
      UInt_t                           fUseNvars;        // the number of variables used in the randomised tree splitting
      Bool_t                           fUsePoissonNvars; // use "fUseNvars" not as a fixed number but as the mean of a Poisson distribution in each split
      UInt_t                           fUseNTrainEvents; // number of randomly picked training events used in randomised (and bagged) trees

      Double_t                         fSampleSizeFraction; // relative size of the bagged event sample to the original sample size
      Bool_t                           fNoNegWeightsInTraining; // ignore negative event weights in the training



      // some histograms for monitoring
      TTree*                           fMonitorNtuple;   // monitoring ntuple
      Int_t                            fITree;           // ntuple var: ith tree
      Double_t                         fBoostWeight;     // ntuple var: boost weight
      Double_t                         fErrorFraction;   // ntuple var: misclassification error fraction

      std::vector<Double_t>            fVariableImportance; // the relative importance of the different variables

      // debugging flags
      static const Int_t               fgDebugLevel;     // debug level determining some printout/control plots etc.

      // for backward compatibility

      ClassDef(MethodBDT,0)  // Analysis of Boosted Decision Trees
   };

} // namespace TMVA

const std::vector<TMVA::DecisionTree*>& TMVA::MethodBDT::GetForest()         const { return fForest; }
const std::vector<TMVA::Event*>&        TMVA::MethodBDT::GetTrainingEvents() const { return fEventSample; }
const std::vector<double>&              TMVA::MethodBDT::GetBoostWeights()   const { return fBoostWeights; }
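
// Example (sketch; "bdt" stands for a trained MethodBDT*): inspecting the
// forest via the inline accessors above.
//    const std::vector<TMVA::DecisionTree*>& forest  = bdt->GetForest();
//    const std::vector<double>&              weights = bdt->GetBoostWeights();
//    for (UInt_t i = 0; i < forest.size(); ++i)
//       std::cout << "tree " << i << ": boost weight " << weights[i] << std::endl;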

#endif
