MethodRuleFit.cxx

// @(#)root/tmva $Id: MethodRuleFit.cxx 36966 2010-11-26 09:50:13Z evt $
// Author: Fredrik Tegenfeldt

/**********************************************************************************
 * Project: TMVA - a Root-integrated toolkit for multivariate data analysis       *
 * Package: TMVA                                                                  *
 * Class  : MethodRuleFit                                                         *
 * Web    : http://tmva.sourceforge.net                                           *
 *                                                                                *
 * Description:                                                                   *
 *      Implementation (see header file for description)                          *
 *                                                                                *
 * Authors (alphabetical):                                                        *
 *      Fredrik Tegenfeldt <Fredrik.Tegenfeldt@cern.ch>  - Iowa State U., USA     *
 *                                                                                *
 * Copyright (c) 2005:                                                            *
 *      CERN, Switzerland                                                         *
 *      Iowa State U.                                                             *
 *      MPI-K Heidelberg, Germany                                                 *
 *                                                                                *
 * Redistribution and use in source and binary forms, with or without             *
 * modification, are permitted according to the terms listed in LICENSE           *
 * (http://tmva.sourceforge.net/LICENSE)                                          *
 **********************************************************************************/

//_______________________________________________________________________
//
// J. Friedman's RuleFit method
//_______________________________________________________________________

#include <algorithm>
#include <list>

#include "Riostream.h"
#include "TRandom3.h"
#include "TMath.h"
#include "TMatrix.h"
#include "TDirectory.h"

#include "TMVA/ClassifierFactory.h"
#include "TMVA/GiniIndex.h"
#include "TMVA/CrossEntropy.h"
#include "TMVA/SdivSqrtSplusB.h"
#include "TMVA/SeparationBase.h"
#include "TMVA/MisClassificationError.h"
#include "TMVA/MethodRuleFit.h"
#include "TMVA/RuleFitAPI.h"
#include "TMVA/Tools.h"
#include "TMVA/Timer.h"
#include "TMVA/Ranking.h"
#include "TMVA/Config.h"
#include "TMVA/MsgLogger.h"

REGISTER_METHOD(RuleFit)

ClassImp(TMVA::MethodRuleFit)

//_______________________________________________________________________
TMVA::MethodRuleFit::MethodRuleFit( const TString& jobName,
                                    const TString& methodTitle,
                                    DataSetInfo& theData,
                                    const TString& theOption,
                                    TDirectory* theTargetDir ) :
   MethodBase( jobName, Types::kRuleFit, methodTitle, theData, theOption, theTargetDir )
   , fSignalFraction(0)
   , fNTImportance(0)
   , fNTCoefficient(0)
   , fNTSupport(0)
   , fNTNcuts(0)
   , fNTNvars(0)
   , fNTPtag(0)
   , fNTPss(0)
   , fNTPsb(0)
   , fNTPbs(0)
   , fNTPbb(0)
   , fNTSSB(0)
   , fNTType(0)
   , fUseRuleFitJF(kFALSE)
   , fRFNrules(0)
   , fRFNendnodes(0)
   , fNTrees(0)
   , fTreeEveFrac(0)
   , fMinFracNEve(0)
   , fMaxFracNEve(0)
   , fNCuts(0)
   , fPruneMethod(TMVA::DecisionTree::kCostComplexityPruning)
   , fPruneStrength(0)
   , fUseBoost(kFALSE)
   , fGDPathEveFrac(0)
   , fGDValidEveFrac(0)
   , fGDTau(0)
   , fGDTauPrec(0)
   , fGDTauMin(0)
   , fGDTauMax(0)
   , fGDTauScan(0)
   , fGDPathStep(0)
   , fGDNPathSteps(0)
   , fGDErrScale(0)
   , fMinimp(0)
   , fRuleMinDist(0)
   , fLinQuantile(0)
{
   // standard constructor
}

//_______________________________________________________________________
TMVA::MethodRuleFit::MethodRuleFit( DataSetInfo& theData,
                                    const TString& theWeightFile,
                                    TDirectory* theTargetDir ) :
   MethodBase( Types::kRuleFit, theData, theWeightFile, theTargetDir )
   , fSignalFraction(0)
   , fNTImportance(0)
   , fNTCoefficient(0)
   , fNTSupport(0)
   , fNTNcuts(0)
   , fNTNvars(0)
   , fNTPtag(0)
   , fNTPss(0)
   , fNTPsb(0)
   , fNTPbs(0)
   , fNTPbb(0)
   , fNTSSB(0)
   , fNTType(0)
   , fUseRuleFitJF(kFALSE)
   , fRFNrules(0)
   , fRFNendnodes(0)
   , fNTrees(0)
   , fTreeEveFrac(0)
   , fMinFracNEve(0)
   , fMaxFracNEve(0)
   , fNCuts(0)
   , fPruneMethod(TMVA::DecisionTree::kCostComplexityPruning)
   , fPruneStrength(0)
   , fUseBoost(kFALSE)
   , fGDPathEveFrac(0)
   , fGDValidEveFrac(0)
   , fGDTau(0)
   , fGDTauPrec(0)
   , fGDTauMin(0)
   , fGDTauMax(0)
   , fGDTauScan(0)
   , fGDPathStep(0)
   , fGDNPathSteps(0)
   , fGDErrScale(0)
   , fMinimp(0)
   , fRuleMinDist(0)
   , fLinQuantile(0)
{
   // constructor from weight file
}

//_______________________________________________________________________
TMVA::MethodRuleFit::~MethodRuleFit( void )
{
   // destructor
   for (UInt_t i=0; i<fEventSample.size(); i++) delete fEventSample[i];
   for (UInt_t i=0; i<fForest.size(); i++)      delete fForest[i];
}

//_______________________________________________________________________
Bool_t TMVA::MethodRuleFit::HasAnalysisType( Types::EAnalysisType type, UInt_t numberClasses, UInt_t /*numberTargets*/ )
{
   // RuleFit can handle classification with 2 classes
   if (type == Types::kClassification && numberClasses == 2) return kTRUE;
   return kFALSE;
}

//_______________________________________________________________________
void TMVA::MethodRuleFit::DeclareOptions()
{
   // define the options (their key words) that can be set in the option string
   // known options:
   //---------
   // general
   //---------
   // RuleFitModule  <string>
   //    available values are:    RFTMVA      - use TMVA implementation
   //                             RFFriedman  - use Friedman's original implementation
   //----------------------
   // Path search (fitting)
   //----------------------
   // GDTau          <float>      gradient-directed path: fit threshold; default -1 (automatic estimate)
   // GDTauPrec      <float>      gradient-directed path: precision of estimated tau
   // GDStep         <float>      gradient-directed path: step size
   // GDNSteps       <int>        gradient-directed path: number of steps
   // GDErrScale     <float>      stop scan when error > scale*errmin
   //-----------------
   // Tree generation
   //-----------------
   // fEventsMin     <float>      minimum fraction of events in a splittable node
   // fEventsMax     <float>      maximum fraction of events in a splittable node
   // nTrees         <int>        number of trees in the forest
   // ForestType     <string>
   //    available values are:    Random    - create forest using a random subsample and only a random subset of variables at each node
   //                             AdaBoost  - create forest with boosted events
   //
   //-----------------
   // Model creation
   //-----------------
   // RuleMinDist    <float>      minimum distance allowed between rules
   // MinImp         <float>      minimum rule importance accepted
   // Model          <string>     model to be used
   //    available values are:    ModRuleLinear <default>
   //                             ModRule
   //                             ModLinear
   //
   //-----------------
   // Friedman's module
   //-----------------
   // RFWorkDir      <string>     directory where Friedman's module (rf_go.exe) is installed
   // RFNrules       <int>        maximum number of rules allowed
   // RFNendnodes    <int>        average number of end nodes in the forest of trees
   //
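   // An illustrative booking call (a sketch only; the "factory" pointer and
   // the particular option values are assumptions -- the option keys are the
   // ones declared below):
   //
   //    factory->BookMethod( TMVA::Types::kRuleFit, "RuleFit",
   //       "Model=ModRuleLinear:RuleFitModule=RFTMVA:nTrees=20:"
   //       "GDTau=-1:GDStep=0.01:GDNSteps=10000:GDErrScale=1.1" );
   //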
   DeclareOptionRef(fGDTau=-1,             "GDTau",          "Gradient-directed (GD) path: default fit cut-off");
   DeclareOptionRef(fGDTauPrec=0.01,       "GDTauPrec",      "GD path: precision of tau");
   DeclareOptionRef(fGDPathStep=0.01,      "GDStep",         "GD path: step size");
   DeclareOptionRef(fGDNPathSteps=10000,   "GDNSteps",       "GD path: number of steps");
   DeclareOptionRef(fGDErrScale=1.1,       "GDErrScale",     "Stop scan when error > scale*errmin");
   DeclareOptionRef(fLinQuantile,          "LinQuantile",    "Quantile of linear terms (removes outliers)");
   DeclareOptionRef(fGDPathEveFrac=0.5,    "GDPathEveFrac",  "Fraction of events used for the path search");
   DeclareOptionRef(fGDValidEveFrac=0.5,   "GDValidEveFrac", "Fraction of events used for the validation");
   // tree options
   DeclareOptionRef(fMinFracNEve=0.1,      "fEventsMin",     "Minimum fraction of events in a splittable node");
   DeclareOptionRef(fMaxFracNEve=0.9,      "fEventsMax",     "Maximum fraction of events in a splittable node");
   DeclareOptionRef(fNTrees=20,            "nTrees",         "Number of trees in forest.");

   DeclareOptionRef(fForestTypeS="AdaBoost",  "ForestType",   "Method to use for forest generation (AdaBoost or Random)");
   AddPreDefVal(TString("AdaBoost"));
   AddPreDefVal(TString("Random"));
   // rule cleanup options
   DeclareOptionRef(fRuleMinDist=0.001,    "RuleMinDist",    "Minimum distance between rules");
   DeclareOptionRef(fMinimp=0.01,          "MinImp",         "Minimum rule importance accepted");
   // rule model option
   DeclareOptionRef(fModelTypeS="ModRuleLinear", "Model",    "Model to be used");
   AddPreDefVal(TString("ModRule"));
   AddPreDefVal(TString("ModRuleLinear"));
   AddPreDefVal(TString("ModLinear"));
   DeclareOptionRef(fRuleFitModuleS="RFTMVA",  "RuleFitModule","Which RuleFit module to use");
   AddPreDefVal(TString("RFTMVA"));
   AddPreDefVal(TString("RFFriedman"));

   DeclareOptionRef(fRFWorkDir="./rulefit", "RFWorkDir",    "Friedman\'s RuleFit module (RFF): working dir");
   DeclareOptionRef(fRFNrules=2000,         "RFNrules",     "RFF: Maximum number of rules");
   DeclareOptionRef(fRFNendnodes=4,         "RFNendnodes",  "RFF: Average number of end nodes");
}

//_______________________________________________________________________
void TMVA::MethodRuleFit::ProcessOptions()
{
   // process the options specified by the user

   if (IgnoreEventsWithNegWeightsInTraining()) {
      Log() << kFATAL << "Mechanism to ignore events with negative weights in training not yet available for method: "
            << GetMethodTypeName()
            << " --> please remove \"IgnoreNegWeightsInTraining\" option from booking string."
            << Endl;
   }

   fRuleFitModuleS.ToLower();
   if      (fRuleFitModuleS == "rftmva")     fUseRuleFitJF = kFALSE;
   else if (fRuleFitModuleS == "rffriedman") fUseRuleFitJF = kTRUE;
   else                                      fUseRuleFitJF = kTRUE;

   fSepTypeS.ToLower();
   if      (fSepTypeS == "misclassificationerror") fSepType = new MisClassificationError();
   else if (fSepTypeS == "giniindex")              fSepType = new GiniIndex();
   else if (fSepTypeS == "crossentropy")           fSepType = new CrossEntropy();
   else                                            fSepType = new SdivSqrtSplusB();

   fModelTypeS.ToLower();
   if      (fModelTypeS == "modlinear" ) fRuleFit.SetModelLinear();
   else if (fModelTypeS == "modrule" )   fRuleFit.SetModelRules();
   else                                  fRuleFit.SetModelFull();

   fPruneMethodS.ToLower();
   if      (fPruneMethodS == "expectederror" )   fPruneMethod  = DecisionTree::kExpectedErrorPruning;
   else if (fPruneMethodS == "costcomplexity" )  fPruneMethod  = DecisionTree::kCostComplexityPruning;
   else                                          fPruneMethod  = DecisionTree::kNoPruning;

   fForestTypeS.ToLower();
   if      (fForestTypeS == "random" )   fUseBoost = kFALSE;
   else if (fForestTypeS == "adaboost" ) fUseBoost = kTRUE;
   else                                  fUseBoost = kTRUE;
   //
   // if the forest is created by boosting the events,
   // the full training sample is used per tree
   // -> only true for the TMVA version of RuleFit
   if (fUseBoost && (!fUseRuleFitJF)) fTreeEveFrac = 1.0;

   // check the event fraction used for tree generation;
   // if <= 0, set it automatically
   if (fTreeEveFrac<=0) {
      Int_t nevents = Data()->GetNTrainingEvents();
      Double_t n = static_cast<Double_t>(nevents);
      fTreeEveFrac = min( 0.5, (100.0 + 6.0*sqrt(n))/n );
   }
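   // Worked example of the automatic fraction (an illustration, not code from
   // this file): with n = 10000 training events the formula gives
   // (100 + 6*sqrt(10000))/10000 = 700/10000 = 0.07 per tree; the 0.5 cap
   // only bites for very small samples (roughly n < 460).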
   // verify ranges of options
   VerifyRange(Log(), "nTrees",        fNTrees,0,100000,20);
   VerifyRange(Log(), "MinImp",        fMinimp,0.0,1.0,0.0);
   VerifyRange(Log(), "GDTauPrec",     fGDTauPrec,1e-5,5e-1);
   VerifyRange(Log(), "GDTauMin",      fGDTauMin,0.0,1.0);
   VerifyRange(Log(), "GDTauMax",      fGDTauMax,fGDTauMin,1.0);
   VerifyRange(Log(), "GDPathStep",    fGDPathStep,0.0,100.0,0.01);
   VerifyRange(Log(), "GDErrScale",    fGDErrScale,1.0,100.0,1.1);
   VerifyRange(Log(), "GDPathEveFrac", fGDPathEveFrac,0.01,0.9,0.5);
   VerifyRange(Log(), "GDValidEveFrac",fGDValidEveFrac,0.01,1.0-fGDPathEveFrac,1.0-fGDPathEveFrac);
   VerifyRange(Log(), "fEventsMin",    fMinFracNEve,0.0,1.0);
   VerifyRange(Log(), "fEventsMax",    fMaxFracNEve,fMinFracNEve,1.0);

   fRuleFit.GetRuleEnsemblePtr()->SetLinQuantile(fLinQuantile);
   fRuleFit.GetRuleFitParamsPtr()->SetGDTauRange(fGDTauMin,fGDTauMax);
   fRuleFit.GetRuleFitParamsPtr()->SetGDTau(fGDTau);
   fRuleFit.GetRuleFitParamsPtr()->SetGDTauPrec(fGDTauPrec);
   fRuleFit.GetRuleFitParamsPtr()->SetGDTauScan(fGDTauScan);
   fRuleFit.GetRuleFitParamsPtr()->SetGDPathStep(fGDPathStep);
   fRuleFit.GetRuleFitParamsPtr()->SetGDNPathSteps(fGDNPathSteps);
   fRuleFit.GetRuleFitParamsPtr()->SetGDErrScale(fGDErrScale);
   fRuleFit.SetImportanceCut(fMinimp);
   fRuleFit.SetRuleMinDist(fRuleMinDist);

   // check if Friedman's module is used,
   // and print a message concerning the options
   if (fUseRuleFitJF) {
      Log() << kINFO << "" << Endl;
      Log() << kINFO << "--------------------------------------" << Endl;
      Log() << kINFO << "Friedman's RuleFit module is selected." << Endl;
      Log() << kINFO << "Only the following options are used:" << Endl;
      Log() << kINFO << Endl;
      Log() << kINFO << gTools().Color("bold") << "   Model"        << gTools().Color("reset") << Endl;
      Log() << kINFO << gTools().Color("bold") << "   RFWorkDir"    << gTools().Color("reset") << Endl;
      Log() << kINFO << gTools().Color("bold") << "   RFNrules"     << gTools().Color("reset") << Endl;
      Log() << kINFO << gTools().Color("bold") << "   RFNendnodes"  << gTools().Color("reset") << Endl;
      Log() << kINFO << gTools().Color("bold") << "   GDNPathSteps" << gTools().Color("reset") << Endl;
      Log() << kINFO << gTools().Color("bold") << "   GDPathStep"   << gTools().Color("reset") << Endl;
      Log() << kINFO << gTools().Color("bold") << "   GDErrScale"   << gTools().Color("reset") << Endl;
      Log() << kINFO << "--------------------------------------" << Endl;
      Log() << kINFO << Endl;
   }

   // Select which weight to use in the 'importance' rule visualisation plots.
   // Note that if UseCoefficientsVisHists() is selected, the following weight is used:
   //    w = rule coefficient * rule support
   // The support is a positive number which is 0 if no events are accepted by the rule.
   // Normally the importance gives more useful information.
   //
   //fRuleFit.UseCoefficientsVisHists();
   fRuleFit.UseImportanceVisHists();

   fRuleFit.SetMsgType( Log().GetMinType() );

   if (HasTrainingTree()) InitEventSample();

}

//_______________________________________________________________________
void TMVA::MethodRuleFit::InitMonitorNtuple()
{
   // initialize the monitoring ntuple
   BaseDir()->cd();
   fMonitorNtuple= new TTree("MonitorNtuple_RuleFit","RuleFit variables");
   fMonitorNtuple->Branch("importance",&fNTImportance,"importance/D");
   fMonitorNtuple->Branch("support",&fNTSupport,"support/D");
   fMonitorNtuple->Branch("coefficient",&fNTCoefficient,"coefficient/D");
   fMonitorNtuple->Branch("ncuts",&fNTNcuts,"ncuts/I");
   fMonitorNtuple->Branch("nvars",&fNTNvars,"nvars/I");
   fMonitorNtuple->Branch("type",&fNTType,"type/I");
   fMonitorNtuple->Branch("ptag",&fNTPtag,"ptag/D");
   fMonitorNtuple->Branch("pss",&fNTPss,"pss/D");
   fMonitorNtuple->Branch("psb",&fNTPsb,"psb/D");
   fMonitorNtuple->Branch("pbs",&fNTPbs,"pbs/D");
   fMonitorNtuple->Branch("pbb",&fNTPbb,"pbb/D");
   fMonitorNtuple->Branch("soversb",&fNTSSB,"soversb/D");
}
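
// After training, the monitoring ntuple can be inspected interactively.
// A sketch (the output file name "TMVA.root" and the in-file directory
// "Method_RuleFit/RuleFit" depend on the job configuration and are
// assumptions here):
//
//    TFile f("TMVA.root");
//    TTree* t = (TTree*)f.Get("Method_RuleFit/RuleFit/MonitorNtuple_RuleFit");
//    if (t) t->Draw("importance");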

//_______________________________________________________________________
void TMVA::MethodRuleFit::Init()
{
   // default initialization

   // the minimum requirement to declare an event signal-like
   SetSignalReferenceCut( 0.0 );

   // set variables that used to be options;
   // any modifications are then made in ProcessOptions()
   fLinQuantile   = 0.025;       // Quantile of linear terms (remove outliers)
   fTreeEveFrac   = -1.0;        // Fraction of events used to train each tree
   fNCuts         = 20;          // Number of steps during node cut optimisation
   fSepTypeS      = "GiniIndex"; // Separation criterion for node splitting; see BDT
   fPruneMethodS  = "NONE";      // Pruning method; see BDT
   fPruneStrength = 3.5;         // Pruning strength; see BDT
   fGDTauMin      = 0.0;         // Gradient-directed path: min fit threshold (tau)
   fGDTauMax      = 1.0;         // Gradient-directed path: max fit threshold (tau)
   fGDTauScan     = 1000;        // Gradient-directed path: number of points scanned for best tau
}

//_______________________________________________________________________
void TMVA::MethodRuleFit::InitEventSample( void )
{
   // write all events from the tree into a vector of Events, which is
   // more easily manipulated.
   // This method should never be called without an existing training tree,
   // as it fills the vector of events from the ROOT training tree.
   if (Data()->GetNEvents()==0) Log() << kFATAL << "<Init> Data().TrainingTree() is zero pointer" << Endl;

   Int_t nevents = Data()->GetNEvents();
   for (Int_t ievt=0; ievt<nevents; ievt++){
      const Event * ev = GetEvent(ievt);
      fEventSample.push_back( new Event(*ev));
   }
   if (fTreeEveFrac<=0) {
      Double_t n = static_cast<Double_t>(nevents);
      fTreeEveFrac = min( 0.5, (100.0 + 6.0*sqrt(n))/n );
   }
   if (fTreeEveFrac>1.0) fTreeEveFrac=1.0;
   //
   std::random_shuffle(fEventSample.begin(), fEventSample.end());
   //
   Log() << kDEBUG << "Set sub-sample fraction to " << fTreeEveFrac << Endl;
}

//_______________________________________________________________________
void TMVA::MethodRuleFit::Train( void )
{
   // training of rules
   TMVA::DecisionTreeNode::fgIsTraining=true;

   InitMonitorNtuple();

   // fill the STL vector with the event sample
   this->InitEventSample();

   if (fUseRuleFitJF) {
      TrainJFRuleFit();
   }
   else {
      TrainTMVARuleFit();
   }
   fRuleFit.GetRuleEnsemblePtr()->ClearRuleMap();
   TMVA::DecisionTreeNode::fgIsTraining=false;
}

//_______________________________________________________________________
void TMVA::MethodRuleFit::TrainTMVARuleFit( void )
{
   // training of rules using the TMVA implementation

   if (IsNormalised()) Log() << kFATAL << "\"Normalise\" option cannot be used with RuleFit; "
                             << "please remove the option from the configuration string, or "
                             << "use \"!Normalise\""
                             << Endl;

   // timer
   Timer timer( 1, GetName() );

   // test tree nmin cut -> for debug purposes
   // the routine will generate trees with a stopping cut on N(eve) given by
   // a fraction between [20,N(eve)-1].
   //
   //   MakeForestRnd();
   //   exit(1);
   //

   // Init RuleFit object and create rule ensemble
   // + make forest & rules
   fRuleFit.Initialize( this );

   // Make forest of decision trees
   //   if (fRuleFit.GetRuleEnsemble().DoRules()) fRuleFit.MakeForest();

   // Fit the rules
   Log() << kDEBUG << "Fitting rule coefficients ..." << Endl;
   fRuleFit.FitCoefficients();

   // Calculate importance
   Log() << kDEBUG << "Computing rule and variable importance" << Endl;
   fRuleFit.CalcImportance();

   // Output results and fill monitor ntuple
   fRuleFit.GetRuleEnsemblePtr()->Print();
   //
   Log() << kDEBUG << "Filling rule ntuple" << Endl;
   UInt_t nrules = fRuleFit.GetRuleEnsemble().GetRulesConst().size();
   const Rule *rule;
   for (UInt_t i=0; i<nrules; i++ ) {
      rule            = fRuleFit.GetRuleEnsemble().GetRulesConst(i);
      fNTImportance   = rule->GetRelImportance();
      fNTSupport      = rule->GetSupport();
      fNTCoefficient  = rule->GetCoefficient();
      fNTType         = (rule->IsSignalRule() ? 1:-1 );
      fNTNvars        = rule->GetRuleCut()->GetNvars();
      fNTNcuts        = rule->GetRuleCut()->GetNcuts();
      fNTPtag         = fRuleFit.GetRuleEnsemble().GetRulePTag(i); // should be identical to the support
      fNTPss          = fRuleFit.GetRuleEnsemble().GetRulePSS(i);
      fNTPsb          = fRuleFit.GetRuleEnsemble().GetRulePSB(i);
      fNTPbs          = fRuleFit.GetRuleEnsemble().GetRulePBS(i);
      fNTPbb          = fRuleFit.GetRuleEnsemble().GetRulePBB(i);
      fNTSSB          = rule->GetSSB();
      fMonitorNtuple->Fill();
   }
   Log() << kDEBUG << "Training done" << Endl;

   fRuleFit.MakeVisHists();

   fRuleFit.MakeDebugHists();
}

//_______________________________________________________________________
void TMVA::MethodRuleFit::TrainJFRuleFit( void )
{
   // training of rules using Jerome Friedman's implementation

   fRuleFit.InitPtrs( this );
   fRuleFit.SetTrainingEvents( GetTrainingEvents() );

   RuleFitAPI *rfAPI = new RuleFitAPI( this, &fRuleFit, Log().GetMinType() );

   rfAPI->WelcomeMessage();

   // timer
   Timer timer( 1, GetName() );

   Log() << kINFO << "Training ..." << Endl;
   rfAPI->TrainRuleFit();

   Log() << kDEBUG << "reading model summary from rf_go.exe output" << Endl;
   rfAPI->ReadModelSum();

   //   fRuleFit.GetRuleEnsemblePtr()->MakeRuleMap();

   Log() << kDEBUG << "calculating rule and variable importance" << Endl;
   fRuleFit.CalcImportance();

   // Output results and fill monitor ntuple
   fRuleFit.GetRuleEnsemblePtr()->Print();
   //
   fRuleFit.MakeVisHists();

   delete rfAPI;

   Log() << kDEBUG << "done training" << Endl;
}

//_______________________________________________________________________
const TMVA::Ranking* TMVA::MethodRuleFit::CreateRanking()
{
   // computes ranking of input variables

   // create the ranking object
   fRanking = new Ranking( GetName(), "Importance" );

   for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
      fRanking->AddRank( Rank( GetInputLabel(ivar), fRuleFit.GetRuleEnsemble().GetVarImportance(ivar) ) );
   }

   return fRanking;
}

//_______________________________________________________________________
void TMVA::MethodRuleFit::AddWeightsXMLTo( void* parent ) const
{
   // add the rules to XML node
   fRuleFit.GetRuleEnsemble().AddXMLTo( parent );
}

//_______________________________________________________________________
void TMVA::MethodRuleFit::ReadWeightsFromStream( istream & istr )
{
   // read rules from an istream

   fRuleFit.GetRuleEnsemblePtr()->ReadRaw( istr );
}

//_______________________________________________________________________
void TMVA::MethodRuleFit::ReadWeightsFromXML( void* wghtnode )
{
   // read rules from XML node
   fRuleFit.GetRuleEnsemblePtr()->ReadFromXML( wghtnode );
}

//_______________________________________________________________________
Double_t TMVA::MethodRuleFit::GetMvaValue( Double_t* err, Double_t* errUpper )
{
   // returns MVA value for given event

   // cannot determine error
   NoErrorCalc(err, errUpper);

   return fRuleFit.EvalEvent( *GetEvent() );
}
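
// Application-phase sketch using the standard TMVA::Reader API (the variable
// name and weight-file path below are assumptions for illustration):
//
//    Float_t var1;
//    TMVA::Reader reader;
//    reader.AddVariable( "var1", &var1 );   // one call per input variable
//    reader.BookMVA( "RuleFit", "weights/TMVAClassification_RuleFit.weights.xml" );
//    Double_t mva = reader.EvaluateMVA( "RuleFit" );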

//_______________________________________________________________________
void  TMVA::MethodRuleFit::WriteMonitoringHistosToFile( void ) const
{
   // write special monitoring histograms to file (here, the ntuple)
   BaseDir()->cd();
   Log() << kINFO << "Write monitoring ntuple to file: " << BaseDir()->GetPath() << Endl;
   fMonitorNtuple->Write();
}

//_______________________________________________________________________
void TMVA::MethodRuleFit::MakeClassSpecific( std::ostream& fout, const TString& className ) const
{
   // write specific classifier response
   Int_t dp = fout.precision();
   fout << "   // not implemented for class: \"" << className << "\"" << std::endl;
   fout << "};" << std::endl;
   fout << "void   " << className << "::Initialize(){}" << std::endl;
   fout << "void   " << className << "::Clear(){}" << std::endl;
   fout << "double " << className << "::GetMvaValue__( const std::vector<double>& inputValues ) const {" << std::endl;
   fout << "   double rval=" << setprecision(10) << fRuleFit.GetRuleEnsemble().GetOffset() << ";" << std::endl;
   MakeClassRuleCuts(fout);
   MakeClassLinear(fout);
   fout << "   return rval;" << std::endl;
   fout << "}" << std::endl;
   fout << std::setprecision(dp);
}

//_______________________________________________________________________
void TMVA::MethodRuleFit::MakeClassRuleCuts( std::ostream& fout ) const
{
   // print out the rule cuts
   Int_t dp = fout.precision();
   if (!fRuleFit.GetRuleEnsemble().DoRules()) {
      fout << "   //" << std::endl;
      fout << "   // ==> MODEL CONTAINS NO RULES <==" << std::endl;
      fout << "   //" << std::endl;
      return;
   }
   const RuleEnsemble *rens = &(fRuleFit.GetRuleEnsemble());
   const std::vector< Rule* > *rules = &(rens->GetRulesConst());
   const RuleCut *ruleCut;
   //
   std::list< std::pair<Double_t,Int_t> > sortedRules;
   for (UInt_t ir=0; ir<rules->size(); ir++) {
      sortedRules.push_back( std::pair<Double_t,Int_t>( (*rules)[ir]->GetImportance()/rens->GetImportanceRef(),ir ) );
   }
   sortedRules.sort();
   //
   fout << "   //" << std::endl;
   fout << "   // here follow all rules, ordered by importance (most important first)" << std::endl;
   fout << "   // at the end of each line, the relative importance of the rule is given" << std::endl;
   fout << "   //" << std::endl;
   //
   for ( std::list< std::pair<double,int> >::reverse_iterator itpair = sortedRules.rbegin();
         itpair != sortedRules.rend(); itpair++ ) {
      UInt_t ir     = itpair->second;
      Double_t impr = itpair->first;
      ruleCut = (*rules)[ir]->GetRuleCut();
      if (impr<rens->GetImportanceCut()) fout << "   //" << std::endl;
      fout << "   if (" << std::flush;
      for (UInt_t ic=0; ic<ruleCut->GetNvars(); ic++) {
         Double_t sel    = ruleCut->GetSelector(ic);
         Double_t valmin = ruleCut->GetCutMin(ic);
         Double_t valmax = ruleCut->GetCutMax(ic);
         Bool_t   domin  = ruleCut->GetCutDoMin(ic);
         Bool_t   domax  = ruleCut->GetCutDoMax(ic);
         //
         if (ic>0) fout << "&&" << std::flush;
         if (domin) {
            fout << "(" << setprecision(10) << valmin << std::flush;
            fout << "<inputValues[" << sel << "])" << std::flush;
         }
         if (domax) {
            if (domin) fout << "&&" << std::flush;
            fout << "(inputValues[" << sel << "]" << std::flush;
            fout << "<" << setprecision(10) << valmax << ")" << std::flush;
         }
      }
      fout << ") rval+=" << setprecision(10) << (*rules)[ir]->GetCoefficient() << ";" << std::flush;
      fout << "   // importance = " << Form("%3.3f",impr) << std::endl;
   }
   fout << std::setprecision(dp);
}
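
// For reference, a rule emitted by MakeClassRuleCuts has the form below
// (all numerical values invented for illustration):
//
//    if ((1.2345678901<inputValues[0])&&(inputValues[1]<2.3456789012)) rval+=0.05;   // importance = 0.420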

//_______________________________________________________________________
void TMVA::MethodRuleFit::MakeClassLinear( std::ostream& fout ) const
{
   // print out the linear terms
   if (!fRuleFit.GetRuleEnsemble().DoLinear()) {
      fout << "   //" << std::endl;
      fout << "   // ==> MODEL CONTAINS NO LINEAR TERMS <==" << std::endl;
      fout << "   //" << std::endl;
      return;
   }
   fout << "   //" << std::endl;
   fout << "   // here follow all linear terms" << std::endl;
   fout << "   // at the end of each line, the relative importance of the term is given" << std::endl;
   fout << "   //" << std::endl;
   const RuleEnsemble *rens = &(fRuleFit.GetRuleEnsemble());
   UInt_t nlin = rens->GetNLinear();
   for (UInt_t il=0; il<nlin; il++) {
      if (rens->IsLinTermOK(il)) {
         Double_t norm = rens->GetLinNorm(il);
         Double_t imp  = rens->GetLinImportance(il)/rens->GetImportanceRef();
         fout << "   rval+="
   //           << setprecision(10) << rens->GetLinCoefficients(il)*norm << "*std::min(" << setprecision(10) << rens->GetLinDP(il)
   //           << ", std::max( inputValues[" << il << "]," << setprecision(10) << rens->GetLinDM(il) << "));"
              << setprecision(10) << rens->GetLinCoefficients(il)*norm
              << "*std::min( double(" << setprecision(10) << rens->GetLinDP(il)
              << "), std::max( double(inputValues[" << il << "]), double(" << setprecision(10) << rens->GetLinDM(il) << ")));"
              << std::flush;
         fout << "   // importance = " << Form("%3.3f",imp) << std::endl;
      }
   }
}
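
// For reference, each accepted linear term is emitted in the clamped form
// below, where the min/max pair truncates the input at the quantile-derived
// bounds (all numerical values invented for illustration):
//
//    rval+=0.1234567890*std::min( double(3.21), std::max( double(inputValues[0]), double(-1.23)));   // importance = 0.150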

//_______________________________________________________________________
void TMVA::MethodRuleFit::GetHelpMessage() const
{
   // get help message text
   //
   // typical length of text line:
   //         "|--------------------------------------------------------------|"
   TString col    = gConfig().WriteOptionsReference() ? "" : gTools().Color("bold");
   TString colres = gConfig().WriteOptionsReference() ? "" : gTools().Color("reset");
   TString brk    = gConfig().WriteOptionsReference() ? "<br>" : "";

   Log() << Endl;
   Log() << col << "--- Short description:" << colres << Endl;
   Log() << Endl;
   Log() << "This method uses a collection of so-called rules to create a" << Endl;
   Log() << "discriminating scoring function. Each rule consists of a series" << Endl;
   Log() << "of cuts in parameter space. The ensemble of rules is created" << Endl;
   Log() << "from a forest of decision trees, trained using the training data." << Endl;
   Log() << "Each node (apart from the root) corresponds to one rule." << Endl;
   Log() << "The scoring function is then obtained by linearly combining" << Endl;
   Log() << "the rules. A fitting procedure is applied to find the optimum" << Endl;
   Log() << "set of coefficients. The goal is to find a model with few rules" << Endl;
   Log() << "but with strong discriminating power." << Endl;
   Log() << Endl;
   Log() << col << "--- Performance optimisation:" << colres << Endl;
   Log() << Endl;
   Log() << "There are two important considerations to make when optimising:" << Endl;
   Log() << Endl;
   Log() << "  1. Topology of the decision tree forest" << brk << Endl;
   Log() << "  2. Fitting of the coefficients" << Endl;
   Log() << Endl;
   Log() << "The maximum complexity of the rules is defined by the size of" << Endl;
   Log() << "the trees. Large trees will yield many complex rules and capture" << Endl;
   Log() << "higher order correlations. On the other hand, small trees will" << Endl;
   Log() << "lead to a smaller ensemble with simple rules, only capable of" << Endl;
   Log() << "modeling simple structures." << Endl;
   Log() << "Several parameters exist for controlling the complexity of the" << Endl;
   Log() << "rule ensemble." << Endl;
   Log() << Endl;
   Log() << "The fitting procedure searches for a minimum using a gradient-" << Endl;
   Log() << "directed path. Apart from step size and number of steps, the" << Endl;
   Log() << "evolution of the path is defined by a cut-off parameter, tau." << Endl;
   Log() << "This parameter is unknown and depends on the training data." << Endl;
   Log() << "A large value will tend to give large weights to a few rules." << Endl;
   Log() << "Similarly, a small value will lead to a large set of rules" << Endl;
   Log() << "with similar weights." << Endl;
   Log() << Endl;
   Log() << "A final point is the model used: rules and/or linear terms." << Endl;
   Log() << "For a given training sample, the result may improve by adding" << Endl;
   Log() << "linear terms. If the best performance is obtained using only linear" << Endl;
   Log() << "terms, it is very likely that the Fisher discriminant would be" << Endl;
   Log() << "a better choice. Ideally the fitting procedure should be able to" << Endl;
   Log() << "make this choice by giving appropriate weights to either type of term." << Endl;
   Log() << Endl;
   Log() << col << "--- Performance tuning via configuration options:" << colres << Endl;
   Log() << Endl;
   Log() << "I.  TUNING OF RULE ENSEMBLE:" << Endl;
   Log() << Endl;
   Log() << "   " << col << "ForestType  " << colres
         << ": It is recommended to use the default \"AdaBoost\"." << brk << Endl;
   Log() << "   " << col << "nTrees      " << colres
         << ": More trees lead to more rules but also slower" << Endl;
   Log() << "                 performance. With too few trees the risk is" << Endl;
   Log() << "                 that the rule ensemble becomes too simple." << brk << Endl;
   Log() << "   " << col << "fEventsMin  " << colres << brk << Endl;
   Log() << "   " << col << "fEventsMax  " << colres
         << ": With a lower min, more large trees will be generated," << Endl;
   Log() << "                 leading to more complex rules." << Endl;
   Log() << "                 With a higher max, more small trees will be" << Endl;
   Log() << "                 generated, leading to simpler rules." << Endl;
   Log() << "                 By changing this range, the average complexity" << Endl;
   Log() << "                 of the rule ensemble can be controlled." << brk << Endl;
   Log() << "   " << col << "RuleMinDist " << colres
         << ": By increasing the minimum distance between" << Endl;
   Log() << "                 rules, fewer and more diverse rules will remain." << Endl;
   Log() << "                 Initially it is a good idea to keep this small" << Endl;
   Log() << "                 or zero and let the fitting do the selection of" << Endl;
   Log() << "                 rules. In order to reduce the ensemble size," << Endl;
   Log() << "                 the value can then be increased." << Endl;
   Log() << Endl;
   //         "|--------------------------------------------------------------|"
   Log() << "II. TUNING OF THE FITTING:" << Endl;
   Log() << Endl;
   Log() << "   " << col << "GDPathEveFrac " << colres
         << ": fraction of events used in the path evaluation" << Endl;
   Log() << "                 Increasing this fraction will improve the path" << Endl;
   Log() << "                 finding. However, a too-high value will leave few" << Endl;
   Log() << "                 unique events available for error estimation." << Endl;
   Log() << "                 It is recommended to use the default = 0.5." << brk << Endl;
   Log() << "   " << col << "GDTau         " << colres
         << ": cut-off parameter tau" << Endl;
   Log() << "                 By default this value is set to -1.0." << Endl;
   //         "|----------------|---------------------------------------------|"
   Log() << "                 This means that the cut-off parameter is" << Endl;
   Log() << "                 automatically estimated. In most cases" << Endl;
   Log() << "                 this should be fine. However, you may want" << Endl;
   Log() << "                 to fix this value if you already know it" << Endl;
   Log() << "                 and want to reduce the training time." << brk << Endl;
   Log() << "   " << col << "GDTauPrec     " << colres
         << ": precision of the estimated tau" << Endl;
   Log() << "                 Increase this precision to find a more" << Endl;
   Log() << "                 optimal cut-off parameter." << brk << Endl;
   Log() << "   " << col << "GDNSteps      " << colres
         << ": number of steps in the path search" << Endl;
   Log() << "                 If the number of steps is too small, the" << Endl;
   Log() << "                 program will give a warning message." << Endl;
   Log() << Endl;
   Log() << "III. WARNING MESSAGES" << Endl;
   Log() << Endl;
   Log() << col << "Risk(i+1)>=Risk(i) in path" << colres << brk << Endl;
   Log() << col << "Chaotic behaviour of risk evolution." << colres << Endl;
   //         "|----------------|---------------------------------------------|"
   Log() << "                 By construction the risk should always decrease." << Endl;
   Log() << "                 However, if the training sample is too small or" << Endl;
   Log() << "                 the model is overtrained, such warnings can" << Endl;
   Log() << "                 occur." << Endl;
   Log() << "                 The warnings can safely be ignored if only a" << Endl;
   Log() << "                 few (<3) occur. If more warnings are generated," << Endl;
   Log() << "                 the fitting fails." << Endl;
   Log() << "                 A remedy may be to increase the value" << brk << Endl;
   Log() << "                 "
         << col << "GDValidEveFrac" << colres
         << " to 1.0 (or a larger value)." << brk << Endl;
   Log() << "                 In addition, if "
         << col << "GDPathEveFrac" << colres
         << " is too high," << Endl;
   Log() << "                 the same warnings may occur since the events" << Endl;
   Log() << "                 used for error estimation are also used for" << Endl;
   Log() << "                 path estimation." << Endl;
   Log() << "                 Another possibility is to modify the model -" << Endl;
   Log() << "                 see above on tuning the rule ensemble." << Endl;
   Log() << Endl;
   Log() << col << "The error rate was still decreasing at the end of the path"
         << colres << Endl;
   Log() << "                 Too few steps in the path! Increase "
         << col << "GDNSteps" << colres << "." << Endl;
   Log() << Endl;
   Log() << col << "Reached minimum early in the search" << colres << Endl;
   Log() << "                 The minimum was found early in the fitting. This" << Endl;
   Log() << "                 may indicate that the step size "
         << col << "GDStep" << colres << Endl;
   Log() << "                 was too large. Reduce it and rerun." << Endl;
   Log() << "                 If the results are still not OK, modify the" << Endl;
   Log() << "                 model, either by modifying the rule ensemble" << Endl;
   Log() << "                 or by adding/removing linear terms." << Endl;
}
