////////////////////////////////////////////////////////////////////////////////
// MethodRuleFit                                                              //
//                                                                            //
// Implementation of J. Friedman's RuleFit method: a discriminating scoring  //
// function is built from an ensemble of rules (cut sequences harvested      //
// from a forest of decision trees) plus optional linear terms; the          //
// coefficients are fitted with a gradient-directed path search.             //
////////////////////////////////////////////////////////////////////////////////
#include <algorithm>
#include <list>
#include <random>   // std::mt19937 / std::shuffle, used in InitEventSample()

#include "Riostream.h"
#include "TRandom3.h"
#include "TMath.h"
#include "TMatrix.h"
#include "TDirectory.h"

#include "TMVA/ClassifierFactory.h"
#include "TMVA/GiniIndex.h"
#include "TMVA/CrossEntropy.h"
#include "TMVA/SdivSqrtSplusB.h"
#include "TMVA/SeparationBase.h"
#include "TMVA/MisClassificationError.h"
#include "TMVA/MethodRuleFit.h"
#include "TMVA/RuleFitAPI.h"
#include "TMVA/Tools.h"
#include "TMVA/Timer.h"
#include "TMVA/Ranking.h"
#include "TMVA/Config.h"
#include "TMVA/MsgLogger.h"

REGISTER_METHOD(RuleFit)

ClassImp(TMVA::MethodRuleFit)

////////////////////////////////////////////////////////////////////////////////
/// standard constructor

TMVA::MethodRuleFit::MethodRuleFit( const TString& jobName,
                                    const TString& methodTitle,
                                    DataSetInfo& theData,
                                    const TString& theOption,
                                    TDirectory* theTargetDir ) :
   MethodBase( jobName, Types::kRuleFit, methodTitle, theData, theOption, theTargetDir )
   , fSignalFraction(0)
   , fNTImportance(0)
   , fNTCoefficient(0)
   , fNTSupport(0)
   , fNTNcuts(0)
   , fNTNvars(0)
   , fNTPtag(0)
   , fNTPss(0)
   , fNTPsb(0)
   , fNTPbs(0)
   , fNTPbb(0)
   , fNTSSB(0)
   , fNTType(0)
   , fUseRuleFitJF(kFALSE)
   , fRFNrules(0)
   , fRFNendnodes(0)
   , fNTrees(0)
   , fTreeEveFrac(0)
   , fMinFracNEve(0)
   , fMaxFracNEve(0)
   , fNCuts(0)
   , fPruneMethod(TMVA::DecisionTree::kCostComplexityPruning)
   , fPruneStrength(0)
   , fUseBoost(kFALSE)
   , fGDPathEveFrac(0)
   , fGDValidEveFrac(0)
   , fGDTau(0)
   , fGDTauPrec(0)
   , fGDTauMin(0)
   , fGDTauMax(0)
   , fGDTauScan(0)
   , fGDPathStep(0)
   , fGDNPathSteps(0)
   , fGDErrScale(0)
   , fMinimp(0)
   , fRuleMinDist(0)
   , fLinQuantile(0)
{
}

////////////////////////////////////////////////////////////////////////////////
/// constructor from weight file

TMVA::MethodRuleFit::MethodRuleFit( DataSetInfo& theData,
                                    const TString& theWeightFile,
                                    TDirectory* theTargetDir ) :
   MethodBase( Types::kRuleFit, theData, theWeightFile, theTargetDir )
   , fSignalFraction(0)
   , fNTImportance(0)
   , fNTCoefficient(0)
   , fNTSupport(0)
   , fNTNcuts(0)
   , fNTNvars(0)
   , fNTPtag(0)
   , fNTPss(0)
   , fNTPsb(0)
   , fNTPbs(0)
   , fNTPbb(0)
   , fNTSSB(0)
   , fNTType(0)
   , fUseRuleFitJF(kFALSE)
   , fRFNrules(0)
   , fRFNendnodes(0)
   , fNTrees(0)
   , fTreeEveFrac(0)
   , fMinFracNEve(0)
   , fMaxFracNEve(0)
   , fNCuts(0)
   , fPruneMethod(TMVA::DecisionTree::kCostComplexityPruning)
   , fPruneStrength(0)
   , fUseBoost(kFALSE)
   , fGDPathEveFrac(0)
   , fGDValidEveFrac(0)
   , fGDTau(0)
   , fGDTauPrec(0)
   , fGDTauMin(0)
   , fGDTauMax(0)
   , fGDTauScan(0)
   , fGDPathStep(0)
   , fGDNPathSteps(0)
   , fGDErrScale(0)
   , fMinimp(0)
   , fRuleMinDist(0)
   , fLinQuantile(0)
{
}

TMVA::MethodRuleFit::~MethodRuleFit( void )
{
   for (UInt_t i=0; i<fEventSample.size(); i++) delete fEventSample[i];
   for (UInt_t i=0; i<fForest.size();      i++) delete fForest[i];
}

////////////////////////////////////////////////////////////////////////////////
/// RuleFit can handle classification with two classes

Bool_t TMVA::MethodRuleFit::HasAnalysisType( Types::EAnalysisType type, UInt_t numberClasses, UInt_t )
{
   if (type == Types::kClassification && numberClasses == 2) return kTRUE;
   return kFALSE;
}

////////////////////////////////////////////////////////////////////////////////
/// define the options (their key words) that can be set in the option string;
/// the first block steers the gradient-directed (GD) path search, the second
/// the forest generation, and the last the rule ensemble and the choice of
/// RuleFit module

void TMVA::MethodRuleFit::DeclareOptions()
{
   // gradient-directed path search
   DeclareOptionRef(fGDTau=-1,           "GDTau",          "Gradient-directed (GD) path: default fit cut-off");
   DeclareOptionRef(fGDTauPrec=0.01,     "GDTauPrec",      "GD path: precision of tau");
   DeclareOptionRef(fGDPathStep=0.01,    "GDStep",         "GD path: step size");
   DeclareOptionRef(fGDNPathSteps=10000, "GDNSteps",       "GD path: number of steps");
   DeclareOptionRef(fGDErrScale=1.1,     "GDErrScale",     "Stop scan when error > scale*errmin");
   DeclareOptionRef(fLinQuantile,        "LinQuantile",    "Quantile of linear terms (removes outliers)");
   DeclareOptionRef(fGDPathEveFrac=0.5,  "GDPathEveFrac",  "Fraction of events used for the path search");
   DeclareOptionRef(fGDValidEveFrac=0.5, "GDValidEveFrac", "Fraction of events used for the validation");

   // tree generation
   DeclareOptionRef(fMinFracNEve=0.1,    "fEventsMin",     "Minimum fraction of events in a splittable node");
   DeclareOptionRef(fMaxFracNEve=0.9,    "fEventsMax",     "Maximum fraction of events in a splittable node");
   DeclareOptionRef(fNTrees=20,          "nTrees",         "Number of trees in forest.");

   DeclareOptionRef(fForestTypeS="AdaBoost", "ForestType", "Method to use for forest generation (AdaBoost or RandomForest)");
   AddPreDefVal(TString("AdaBoost"));
   AddPreDefVal(TString("Random"));

   // rule cleanup
   DeclareOptionRef(fRuleMinDist=0.001,  "RuleMinDist",    "Minimum distance between rules");
   DeclareOptionRef(fMinimp=0.01,        "MinImp",         "Minimum rule importance accepted");

   // model and RuleFit module
   DeclareOptionRef(fModelTypeS="ModRuleLinear", "Model",  "Model to be used");
   AddPreDefVal(TString("ModRule"));
   AddPreDefVal(TString("ModRuleLinear"));
   AddPreDefVal(TString("ModLinear"));

   DeclareOptionRef(fRuleFitModuleS="RFTMVA", "RuleFitModule", "Which RuleFit module to use");
   AddPreDefVal(TString("RFTMVA"));
   AddPreDefVal(TString("RFFriedman"));

   DeclareOptionRef(fRFWorkDir="./rulefit", "RFWorkDir",   "Friedman's RuleFit module (RFF): working dir");
   DeclareOptionRef(fRFNrules=2000,      "RFNrules",       "RFF: Maximum number of rules");
   DeclareOptionRef(fRFNendnodes=4,      "RFNendnodes",    "RFF: Average number of end nodes");
}
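
// Illustrative booking sketch (not part of this file): all option keys below
// are declared in DeclareOptions() above and the values shown are the
// defaults; "factory" stands for an assumed, already configured TMVA::Factory.
//
//    factory->BookMethod( TMVA::Types::kRuleFit, "RuleFit",
//       "Model=ModRuleLinear:RuleFitModule=RFTMVA:ForestType=AdaBoost:nTrees=20:"
//       "fEventsMin=0.1:fEventsMax=0.9:GDTau=-1:GDTauPrec=0.01:GDStep=0.01:"
//       "GDNSteps=10000:GDErrScale=1.1:RuleMinDist=0.001:MinImp=0.01" );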

////////////////////////////////////////////////////////////////////////////////
/// process the options specified by the user

void TMVA::MethodRuleFit::ProcessOptions()
{
   if (IgnoreEventsWithNegWeightsInTraining()) {
      Log() << kFATAL << "Mechanism to ignore events with negative weights in training not yet available for method: "
            << GetMethodTypeName()
            << " --> please remove \"IgnoreNegWeightsInTraining\" option from booking string."
            << Endl;
   }

   fRuleFitModuleS.ToLower();
   if      (fRuleFitModuleS == "rftmva")     fUseRuleFitJF = kFALSE;
   else if (fRuleFitModuleS == "rffriedman") fUseRuleFitJF = kTRUE;
   else                                      fUseRuleFitJF = kTRUE;

   fSepTypeS.ToLower();
   if      (fSepTypeS == "misclassificationerror") fSepType = new MisClassificationError();
   else if (fSepTypeS == "giniindex")              fSepType = new GiniIndex();
   else if (fSepTypeS == "crossentropy")           fSepType = new CrossEntropy();
   else                                            fSepType = new SdivSqrtSplusB();

   fModelTypeS.ToLower();
   if      (fModelTypeS == "modlinear") fRuleFit.SetModelLinear();
   else if (fModelTypeS == "modrule")   fRuleFit.SetModelRules();
   else                                 fRuleFit.SetModelFull();

   fPruneMethodS.ToLower();
   if      (fPruneMethodS == "expectederror")  fPruneMethod = DecisionTree::kExpectedErrorPruning;
   else if (fPruneMethodS == "costcomplexity") fPruneMethod = DecisionTree::kCostComplexityPruning;
   else                                        fPruneMethod = DecisionTree::kNoPruning;

   fForestTypeS.ToLower();
   if      (fForestTypeS == "random")   fUseBoost = kFALSE;
   else if (fForestTypeS == "adaboost") fUseBoost = kTRUE;
   else                                 fUseBoost = kTRUE;

   // when boosting with the TMVA module, each tree is grown on the full sample
   if (fUseBoost && (!fUseRuleFitJF)) fTreeEveFrac = 1.0;

   // if the sub-sample fraction per tree is not set, use the heuristic below
   if (fTreeEveFrac<=0) {
      Int_t nevents = Data()->GetNTrainingEvents();
      Double_t n = static_cast<Double_t>(nevents);
      fTreeEveFrac = std::min( 0.5, (100.0 + 6.0*TMath::Sqrt(n))/n );
   }
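
   // Worked example of the heuristic above (numbers purely illustrative):
   // for n = 10000 training events, (100 + 6*sqrt(10000))/10000 = 700/10000
   // = 0.07, so each tree is grown on roughly 7% of the sample; the min()
   // caps the fraction at 0.5 for small samples. User-set values (>0) are
   // left untouched.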

   VerifyRange(Log(), "nTrees",        fNTrees,0,100000,20);
   VerifyRange(Log(), "MinImp",        fMinimp,0.0,1.0,0.0);
   VerifyRange(Log(), "GDTauPrec",     fGDTauPrec,1e-5,5e-1);
   VerifyRange(Log(), "GDTauMin",      fGDTauMin,0.0,1.0);
   VerifyRange(Log(), "GDTauMax",      fGDTauMax,fGDTauMin,1.0);
   VerifyRange(Log(), "GDPathStep",    fGDPathStep,0.0,100.0,0.01);
   VerifyRange(Log(), "GDErrScale",    fGDErrScale,1.0,100.0,1.1);
   VerifyRange(Log(), "GDPathEveFrac", fGDPathEveFrac,0.01,0.9,0.5);
   VerifyRange(Log(), "GDValidEveFrac",fGDValidEveFrac,0.01,1.0-fGDPathEveFrac,1.0-fGDPathEveFrac);
   VerifyRange(Log(), "fEventsMin",    fMinFracNEve,0.0,1.0);
   VerifyRange(Log(), "fEventsMax",    fMaxFracNEve,fMinFracNEve,1.0);

   fRuleFit.GetRuleEnsemblePtr()->SetLinQuantile(fLinQuantile);
   fRuleFit.GetRuleFitParamsPtr()->SetGDTauRange(fGDTauMin,fGDTauMax);
   fRuleFit.GetRuleFitParamsPtr()->SetGDTau(fGDTau);
   fRuleFit.GetRuleFitParamsPtr()->SetGDTauPrec(fGDTauPrec);
   fRuleFit.GetRuleFitParamsPtr()->SetGDTauScan(fGDTauScan);
   fRuleFit.GetRuleFitParamsPtr()->SetGDPathStep(fGDPathStep);
   fRuleFit.GetRuleFitParamsPtr()->SetGDNPathSteps(fGDNPathSteps);
   fRuleFit.GetRuleFitParamsPtr()->SetGDErrScale(fGDErrScale);
   fRuleFit.SetImportanceCut(fMinimp);
   fRuleFit.SetRuleMinDist(fRuleMinDist);

   // most TMVA-specific options are ignored when Friedman's module is used
   if (fUseRuleFitJF) {
      Log() << kINFO << "" << Endl;
      Log() << kINFO << "--------------------------------------" << Endl;
      Log() << kINFO << "Friedman's RuleFit module is selected." << Endl;
      Log() << kINFO << "Only the following options are used:" << Endl;
      Log() << kINFO << Endl;
      Log() << kINFO << gTools().Color("bold") << "   Model"        << gTools().Color("reset") << Endl;
      Log() << kINFO << gTools().Color("bold") << "   RFWorkDir"    << gTools().Color("reset") << Endl;
      Log() << kINFO << gTools().Color("bold") << "   RFNrules"     << gTools().Color("reset") << Endl;
      Log() << kINFO << gTools().Color("bold") << "   RFNendnodes"  << gTools().Color("reset") << Endl;
      Log() << kINFO << gTools().Color("bold") << "   GDNPathSteps" << gTools().Color("reset") << Endl;
      Log() << kINFO << gTools().Color("bold") << "   GDPathStep"   << gTools().Color("reset") << Endl;
      Log() << kINFO << gTools().Color("bold") << "   GDErrScale"   << gTools().Color("reset") << Endl;
      Log() << kINFO << "--------------------------------------" << Endl;
      Log() << kINFO << Endl;
   }

   fRuleFit.UseImportanceVisHists();

   fRuleFit.SetMsgType( Log().GetMinType() );

   if (HasTrainingTree()) InitEventSample();
}

////////////////////////////////////////////////////////////////////////////////
/// initialize the monitoring ntuple; one entry per rule is filled in
/// TrainTMVARuleFit()

void TMVA::MethodRuleFit::InitMonitorNtuple()
{
   BaseDir()->cd();
   fMonitorNtuple = new TTree("MonitorNtuple_RuleFit","RuleFit variables");
   fMonitorNtuple->Branch("importance",&fNTImportance,"importance/D");
   fMonitorNtuple->Branch("support",&fNTSupport,"support/D");
   fMonitorNtuple->Branch("coefficient",&fNTCoefficient,"coefficient/D");
   fMonitorNtuple->Branch("ncuts",&fNTNcuts,"ncuts/I");
   fMonitorNtuple->Branch("nvars",&fNTNvars,"nvars/I");
   fMonitorNtuple->Branch("type",&fNTType,"type/I");
   fMonitorNtuple->Branch("ptag",&fNTPtag,"ptag/D");
   fMonitorNtuple->Branch("pss",&fNTPss,"pss/D");
   fMonitorNtuple->Branch("psb",&fNTPsb,"psb/D");
   fMonitorNtuple->Branch("pbs",&fNTPbs,"pbs/D");
   fMonitorNtuple->Branch("pbb",&fNTPbb,"pbb/D");
   fMonitorNtuple->Branch("soversb",&fNTSSB,"soversb/D");
}
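
// Sketch of how the monitoring ntuple can be inspected after training. The
// tree name "MonitorNtuple_RuleFit" is the one booked above; the output file
// name and in-file directory are assumptions that depend on the job setup.
//
//    TFile f("TMVA.root");                                             // assumed file name
//    TTree* mon = nullptr;
//    f.GetObject("Method_RuleFit/RuleFit/MonitorNtuple_RuleFit", mon); // assumed path
//    if (mon) mon->Draw("importance:support");                         // importance vs. support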

////////////////////////////////////////////////////////////////////////////////
/// default initializations

void TMVA::MethodRuleFit::Init()
{
   // the minimum MVA value an event must have to be classified as signal-like
   SetSignalReferenceCut( 0.0 );

   // default settings
   fLinQuantile   = 0.025;       // quantile cut removing outliers from the linear terms
   fTreeEveFrac   = -1.0;        // fraction of events used per tree; <=0 means "use the heuristic"
   fNCuts         = 20;          // number of grid points scanned when optimising a node cut
   fSepTypeS      = "GiniIndex";
   fPruneMethodS  = "NONE";
   fPruneStrength = 3.5;
   fGDTauMin      = 0.0;
   fGDTauMax      = 1.0;
   fGDTauScan     = 1000;
}

////////////////////////////////////////////////////////////////////////////////
/// initialize the event sample: copy all training events into fEventSample
/// and shuffle them, so that the sub-samples drawn per tree are unbiased

void TMVA::MethodRuleFit::InitEventSample( void )
{
   if (Data()->GetNEvents()==0) Log() << kFATAL << "<InitEventSample> no training events available" << Endl;

   Int_t nevents = Data()->GetNEvents();
   for (Int_t ievt=0; ievt<nevents; ievt++){
      const Event * ev = GetEvent(ievt);
      fEventSample.push_back( new Event(*ev) );
   }
   if (fTreeEveFrac<=0) {
      Double_t n = static_cast<Double_t>(nevents);
      fTreeEveFrac = std::min( 0.5, (100.0 + 6.0*TMath::Sqrt(n))/n );
   }
   if (fTreeEveFrac>1.0) fTreeEveFrac=1.0;

   // std::random_shuffle was deprecated in C++14 and removed in C++17; use
   // std::shuffle with an explicitly seeded engine instead (the fixed seed
   // keeps trainings reproducible)
   std::mt19937 rng(1);
   std::shuffle(fEventSample.begin(), fEventSample.end(), rng);

   Log() << kDEBUG << "Set sub-sample fraction to " << fTreeEveFrac << Endl;
}

////////////////////////////////////////////////////////////////////////////////
/// training of the rule ensemble

void TMVA::MethodRuleFit::Train( void )
{
   TMVA::DecisionTreeNode::fgIsTraining=true;

   // book the monitoring ntuple and set up the training sample
   InitMonitorNtuple();
   this->InitEventSample();

   if (fUseRuleFitJF) {
      TrainJFRuleFit();
   }
   else {
      TrainTMVARuleFit();
   }
   fRuleFit.GetRuleEnsemblePtr()->ClearRuleMap();
   TMVA::DecisionTreeNode::fgIsTraining=false;
}

////////////////////////////////////////////////////////////////////////////////
/// training of rules using the TMVA implementation

void TMVA::MethodRuleFit::TrainTMVARuleFit( void )
{
   if (IsNormalised()) Log() << kFATAL << "\"Normalise\" option cannot be used with RuleFit; "
                             << "please remove the option from the configuration string, or "
                             << "use \"!Normalise\""
                             << Endl;

   Timer timer( 1, GetName() );

   // grow the forest and extract the rule ensemble
   fRuleFit.Initialize( this );

   // fit the rule coefficients with the gradient-directed path search
   Log() << kDEBUG << "Fitting rule coefficients ..." << Endl;
   fRuleFit.FitCoefficients();

   // compute rule and variable importances
   Log() << kDEBUG << "Computing rule and variable importance" << Endl;
   fRuleFit.CalcImportance();

   // print the ensemble
   fRuleFit.GetRuleEnsemblePtr()->Print();

   // fill the monitoring ntuple, one entry per rule
   Log() << kDEBUG << "Filling rule ntuple" << Endl;
   UInt_t nrules = fRuleFit.GetRuleEnsemble().GetRulesConst().size();
   const Rule *rule;
   for (UInt_t i=0; i<nrules; i++ ) {
      rule           = fRuleFit.GetRuleEnsemble().GetRulesConst(i);
      fNTImportance  = rule->GetRelImportance();
      fNTSupport     = rule->GetSupport();
      fNTCoefficient = rule->GetCoefficient();
      fNTType        = (rule->IsSignalRule() ? 1:-1 );
      fNTNvars       = rule->GetRuleCut()->GetNvars();
      fNTNcuts       = rule->GetRuleCut()->GetNcuts();
      fNTPtag        = fRuleFit.GetRuleEnsemble().GetRulePTag(i);
      fNTPss         = fRuleFit.GetRuleEnsemble().GetRulePSS(i);
      fNTPsb         = fRuleFit.GetRuleEnsemble().GetRulePSB(i);
      fNTPbs         = fRuleFit.GetRuleEnsemble().GetRulePBS(i);
      fNTPbb         = fRuleFit.GetRuleEnsemble().GetRulePBB(i);
      fNTSSB         = rule->GetSSB();
      fMonitorNtuple->Fill();
   }
   Log() << kDEBUG << "Training done" << Endl;

   fRuleFit.MakeVisHists();

   fRuleFit.MakeDebugHists();
}

////////////////////////////////////////////////////////////////////////////////
/// training of rules using Friedman's original RuleFit implementation

void TMVA::MethodRuleFit::TrainJFRuleFit( void )
{
   fRuleFit.InitPtrs( this );
   fRuleFit.SetTrainingEvents( GetTrainingEvents() );

   RuleFitAPI *rfAPI = new RuleFitAPI( this, &fRuleFit, Log().GetMinType() );

   rfAPI->WelcomeMessage();

   Timer timer( 1, GetName() );

   Log() << kINFO  << "Training ..." << Endl;
   rfAPI->TrainRuleFit();

   Log() << kDEBUG << "reading model summary from rf_go.exe output" << Endl;
   rfAPI->ReadModelSum();

   Log() << kDEBUG << "calculating rule and variable importance" << Endl;
   fRuleFit.CalcImportance();

   // print the ensemble
   fRuleFit.GetRuleEnsemblePtr()->Print();

   fRuleFit.MakeVisHists();

   delete rfAPI;

   Log() << kDEBUG << "done training" << Endl;
}

////////////////////////////////////////////////////////////////////////////////
/// computes the ranking of the input variables

const TMVA::Ranking* TMVA::MethodRuleFit::CreateRanking()
{
   // create the ranking object
   fRanking = new Ranking( GetName(), "Importance" );

   for (UInt_t ivar=0; ivar<GetNvar(); ivar++) {
      fRanking->AddRank( Rank( GetInputLabel(ivar), fRuleFit.GetRuleEnsemble().GetVarImportance(ivar) ) );
   }

   return fRanking;
}

////////////////////////////////////////////////////////////////////////////////
/// add the rule ensemble to the weight file

void TMVA::MethodRuleFit::AddWeightsXMLTo( void* parent ) const
{
   fRuleFit.GetRuleEnsemble().AddXMLTo( parent );
}

////////////////////////////////////////////////////////////////////////////////
/// read the rule ensemble from a stream

void TMVA::MethodRuleFit::ReadWeightsFromStream( istream & istr )
{
   fRuleFit.GetRuleEnsemblePtr()->ReadRaw( istr );
}

////////////////////////////////////////////////////////////////////////////////
/// read the rule ensemble from an XML node

void TMVA::MethodRuleFit::ReadWeightsFromXML( void* wghtnode )
{
   fRuleFit.GetRuleEnsemblePtr()->ReadFromXML( wghtnode );
}

////////////////////////////////////////////////////////////////////////////////
/// returns the MVA value of the current event

Double_t TMVA::MethodRuleFit::GetMvaValue( Double_t* err, Double_t* errUpper )
{
   // cannot determine error
   NoErrorCalc(err, errUpper);

   return fRuleFit.EvalEvent( *GetEvent() );
}
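
// The value returned above has the additive RuleFit form (the standalone
// class writer below emits the same structure):
//
//    F(x) = offset + sum_r a_r * r(x) + sum_i b_i * min(dp_i, max(x_i, dm_i))
//
// where each rule r(x) is 1 if the event passes all cuts of the rule and 0
// otherwise, and the second sum runs over the outlier-clamped linear terms.
// Events with F(x) above the signal reference cut (0 by default, see Init())
// are classified as signal-like.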

void TMVA::MethodRuleFit::WriteMonitoringHistosToFile( void ) const
{
   BaseDir()->cd();
   Log() << kINFO << "Write monitoring ntuple to file: " << BaseDir()->GetPath() << Endl;
   fMonitorNtuple->Write();
}

////////////////////////////////////////////////////////////////////////////////
/// write the classifier-specific response as standalone C++ code

void TMVA::MethodRuleFit::MakeClassSpecific( std::ostream& fout, const TString& className ) const
{
   Int_t dp = fout.precision();
   fout << "   // not implemented for class: \"" << className << "\"" << std::endl;
   fout << "};" << std::endl;
   fout << "void " << className << "::Initialize(){}" << std::endl;
   fout << "void " << className << "::Clear(){}" << std::endl;
   fout << "double " << className << "::GetMvaValue__( const std::vector<double>& inputValues ) const {" << std::endl;
   fout << "   double rval=" << std::setprecision(10) << fRuleFit.GetRuleEnsemble().GetOffset() << ";" << std::endl;
   MakeClassRuleCuts(fout);
   MakeClassLinear(fout);
   fout << "   return rval;" << std::endl;
   fout << "}" << std::endl;
   fout << std::setprecision(dp);
}
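
// The generated snippet has the following shape; the rule and linear-term
// lines are filled in by MakeClassRuleCuts() and MakeClassLinear() below
// (offset value illustrative):
//
//    double ClassName::GetMvaValue__( const std::vector<double>& inputValues ) const {
//       double rval=-0.1234567890;
//       // ... one "if (cuts) rval+=coefficient;" line per rule ...
//       // ... one "rval+=..." line per linear term ...
//       return rval;
//    }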

////////////////////////////////////////////////////////////////////////////////
/// print out the rule cuts as part of the standalone C++ class

void TMVA::MethodRuleFit::MakeClassRuleCuts( std::ostream& fout ) const
{
   Int_t dp = fout.precision();
   if (!fRuleFit.GetRuleEnsemble().DoRules()) {
      fout << "   //" << std::endl;
      fout << "   // ==> MODEL CONTAINS NO RULES <==" << std::endl;
      fout << "   //" << std::endl;
      return;
   }
   const RuleEnsemble *rens = &(fRuleFit.GetRuleEnsemble());
   const std::vector< Rule* > *rules = &(rens->GetRulesConst());
   const RuleCut *ruleCut;

   // sort the rules in ascending relative importance; they are printed in
   // reverse order below (most important first)
   std::list< std::pair<Double_t,Int_t> > sortedRules;
   for (UInt_t ir=0; ir<rules->size(); ir++) {
      sortedRules.push_back( std::pair<Double_t,Int_t>( (*rules)[ir]->GetImportance()/rens->GetImportanceRef(), ir ) );
   }
   sortedRules.sort();

   fout << "   //" << std::endl;
   fout << "   // here follow all rules ordered in importance (most important first)" << std::endl;
   fout << "   // at the end of each line, the relative importance of the rule is given" << std::endl;
   fout << "   //" << std::endl;

   for ( std::list< std::pair<Double_t,Int_t> >::reverse_iterator itpair = sortedRules.rbegin();
         itpair != sortedRules.rend(); ++itpair ) {
      UInt_t ir     = itpair->second;
      Double_t impr = itpair->first;
      ruleCut = (*rules)[ir]->GetRuleCut();
      if (impr<rens->GetImportanceCut()) fout << "   //" << std::endl;
      fout << "   if (" << std::flush;
      for (UInt_t ic=0; ic<ruleCut->GetNvars(); ic++) {
         UInt_t sel      = ruleCut->GetSelector(ic);   // index of the input variable
         Double_t valmin = ruleCut->GetCutMin(ic);
         Double_t valmax = ruleCut->GetCutMax(ic);
         Bool_t domin    = ruleCut->GetCutDoMin(ic);
         Bool_t domax    = ruleCut->GetCutDoMax(ic);

         if (ic>0) fout << "&&" << std::flush;
         if (domin) {
            fout << "(" << std::setprecision(10) << valmin << std::flush;
            fout << "<inputValues[" << sel << "])" << std::flush;
         }
         if (domax) {
            if (domin) fout << "&&" << std::flush;
            fout << "(inputValues[" << sel << "]" << std::flush;
            fout << "<" << std::setprecision(10) << valmax << ")" << std::flush;
         }
      }
      fout << ") rval+=" << std::setprecision(10) << (*rules)[ir]->GetCoefficient() << ";" << std::flush;
      fout << "   // importance = " << Form("%3.3f",impr) << std::endl;
   }
   fout << std::setprecision(dp);
}
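
// A rule emitted by the loop above looks like this (values illustrative):
//
//    if ((1.234567890<inputValues[0])&&(inputValues[2]<5.678901234)) rval+=0.4321;   // importance = 0.853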

////////////////////////////////////////////////////////////////////////////////
/// print out the linear terms as part of the standalone C++ class

void TMVA::MethodRuleFit::MakeClassLinear( std::ostream& fout ) const
{
   if (!fRuleFit.GetRuleEnsemble().DoLinear()) {
      fout << "   //" << std::endl;
      fout << "   // ==> MODEL CONTAINS NO LINEAR TERMS <==" << std::endl;
      fout << "   //" << std::endl;
      return;
   }
   fout << "   //" << std::endl;
   fout << "   // here follow all linear terms" << std::endl;
   fout << "   // at the end of each line, the relative importance of the term is given" << std::endl;
   fout << "   //" << std::endl;
   const RuleEnsemble *rens = &(fRuleFit.GetRuleEnsemble());
   UInt_t nlin = rens->GetNLinear();
   for (UInt_t il=0; il<nlin; il++) {
      if (rens->IsLinTermOK(il)) {
         Double_t norm = rens->GetLinNorm(il);
         Double_t imp  = rens->GetLinImportance(il)/rens->GetImportanceRef();
         fout << "   rval+="
              << std::setprecision(10) << rens->GetLinCoefficients(il)*norm
              << "*std::min( double(" << std::setprecision(10) << rens->GetLinDP(il)
              << "), std::max( double(inputValues[" << il << "]), double(" << std::setprecision(10) << rens->GetLinDM(il) << ")));"
              << std::flush;
         fout << "   // importance = " << Form("%3.3f",imp) << std::endl;
      }
   }
}
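
// A linear term emitted above looks like this (values illustrative); the
// nested min/max clamps the input to the range [dm,dp] determined from the
// LinQuantile outlier cut at training time:
//
//    rval+=0.25*std::min( double(3.2), std::max( double(inputValues[1]), double(-1.5)));   // importance = 0.050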

////////////////////////////////////////////////////////////////////////////////
/// get help message text

void TMVA::MethodRuleFit::GetHelpMessage() const
{
   TString col    = gConfig().WriteOptionsReference() ? "" : gTools().Color("bold");
   TString colres = gConfig().WriteOptionsReference() ? "" : gTools().Color("reset");
   TString brk    = gConfig().WriteOptionsReference() ? "<br>" : "";

   Log() << Endl;
   Log() << col << "--- Short description:" << colres << Endl;
   Log() << Endl;
   Log() << "This method uses a collection of so-called rules to create a" << Endl;
   Log() << "discriminating scoring function. Each rule consists of a series" << Endl;
   Log() << "of cuts in parameter space. The ensemble of rules is created" << Endl;
   Log() << "from a forest of decision trees, trained using the training data." << Endl;
   Log() << "Each node (apart from the root) corresponds to one rule." << Endl;
   Log() << "The scoring function is then obtained by linearly combining" << Endl;
   Log() << "the rules. A fitting procedure is applied to find the optimum" << Endl;
   Log() << "set of coefficients. The goal is to find a model with few rules" << Endl;
   Log() << "but with a strong discriminating power." << Endl;
   Log() << Endl;
   Log() << col << "--- Performance optimisation:" << colres << Endl;
   Log() << Endl;
   Log() << "There are two important considerations to make when optimising:" << Endl;
   Log() << Endl;
   Log() << "  1. Topology of the decision tree forest" << brk << Endl;
   Log() << "  2. Fitting of the coefficients" << Endl;
   Log() << Endl;
   Log() << "The maximum complexity of the rules is defined by the size of" << Endl;
   Log() << "the trees. Large trees will yield many complex rules and capture" << Endl;
   Log() << "higher order correlations. On the other hand, small trees will" << Endl;
   Log() << "lead to a smaller ensemble with simple rules, only capable of" << Endl;
   Log() << "modeling simple structures." << Endl;
   Log() << "Several parameters exist for controlling the complexity of the" << Endl;
   Log() << "rule ensemble." << Endl;
   Log() << Endl;
   Log() << "The fitting procedure searches for a minimum using a gradient" << Endl;
   Log() << "directed path. Apart from step size and number of steps, the" << Endl;
   Log() << "evolution of the path is defined by a cut-off parameter, tau." << Endl;
   Log() << "This parameter is unknown and depends on the training data." << Endl;
   Log() << "A large value will tend to give large weights to a few rules." << Endl;
   Log() << "Similarly, a small value will lead to a large set of rules" << Endl;
   Log() << "with similar weights." << Endl;
   Log() << Endl;
   Log() << "A final point is the model used; rules and/or linear terms." << Endl;
   Log() << "For a given training sample, the result may improve by adding" << Endl;
   Log() << "linear terms. If the best performance is obtained using only linear" << Endl;
   Log() << "terms, it is very likely that the Fisher discriminant would be" << Endl;
   Log() << "a better choice. Ideally the fitting procedure should be able to" << Endl;
   Log() << "make this choice by giving appropriate weights to either kind of term." << Endl;
   Log() << Endl;
   Log() << col << "--- Performance tuning via configuration options:" << colres << Endl;
   Log() << Endl;
   Log() << "I.  TUNING OF THE RULE ENSEMBLE:" << Endl;
   Log() << Endl;
   Log() << "   " << col << "ForestType  " << colres
         << ": it is recommended to use the default \"AdaBoost\"." << brk << Endl;
   Log() << "   " << col << "nTrees      " << colres
         << ": more trees lead to more rules but also slower" << Endl;
   Log() << "                 performance. With too few trees the risk is" << Endl;
   Log() << "                 that the rule ensemble becomes too simple." << brk << Endl;
   Log() << "   " << col << "fEventsMin  " << colres << brk << Endl;
   Log() << "   " << col << "fEventsMax  " << colres
         << ": with a lower min, more large trees will be generated," << Endl;
   Log() << "                 leading to more complex rules." << Endl;
   Log() << "                 With a higher max, more small trees will be" << Endl;
   Log() << "                 generated, leading to simpler rules." << Endl;
   Log() << "                 By changing this range, the average complexity" << Endl;
   Log() << "                 of the rule ensemble can be controlled." << brk << Endl;
   Log() << "   " << col << "RuleMinDist " << colres
         << ": by increasing the minimum distance between" << Endl;
   Log() << "                 rules, fewer and more diverse rules will remain." << Endl;
   Log() << "                 Initially it is a good idea to keep this small" << Endl;
   Log() << "                 or zero and let the fitting do the selection of" << Endl;
   Log() << "                 rules. In order to reduce the ensemble size," << Endl;
   Log() << "                 the value can then be increased." << Endl;
   Log() << Endl;
   Log() << "II. TUNING OF THE FITTING:" << Endl;
   Log() << Endl;
   Log() << "   " << col << "GDPathEveFrac " << colres
         << ": fraction of events used in the path evaluation" << Endl;
   Log() << "                 Increasing this fraction will improve the path" << Endl;
   Log() << "                 finding. However, a too high value will give few" << Endl;
   Log() << "                 unique events available for error estimation." << Endl;
   Log() << "                 It is recommended to use the default = 0.5." << brk << Endl;
   Log() << "   " << col << "GDTau         " << colres
         << ": cut-off parameter tau" << Endl;
   Log() << "                 By default this value is set to -1.0." << Endl;
   Log() << "                 This means that the cut-off parameter is" << Endl;
   Log() << "                 automatically estimated. In most cases" << Endl;
   Log() << "                 this should be fine. However, you may want" << Endl;
   Log() << "                 to fix this value if you already know it" << Endl;
   Log() << "                 and want to reduce the training time." << brk << Endl;
   Log() << "   " << col << "GDTauPrec     " << colres
         << ": precision of the estimated tau" << Endl;
   Log() << "                 Increase this precision to find a more" << Endl;
   Log() << "                 optimal cut-off parameter." << brk << Endl;
   Log() << "   " << col << "GDNSteps      " << colres
         << ": number of steps in the path search" << Endl;
   Log() << "                 If the number of steps is too small, the" << Endl;
   Log() << "                 program will give a warning message." << Endl;
   Log() << Endl;
   Log() << "III. WARNING MESSAGES" << Endl;
   Log() << Endl;
   Log() << col << "Risk(i+1)>=Risk(i) in path" << colres << brk << Endl;
   Log() << col << "Chaotic behaviour of risk evolution." << colres << Endl;
   Log() << "    By construction the risk should always decrease." << Endl;
   Log() << "    However, if the training sample is too small or" << Endl;
   Log() << "    the model is overtrained, such warnings can" << Endl;
   Log() << "    occur." << Endl;
   Log() << "    The warnings can safely be ignored if only a" << Endl;
   Log() << "    few (<3) occur. If more warnings are generated," << Endl;
   Log() << "    the fitting fails." << Endl;
   Log() << "    A remedy may be to increase the value" << brk << Endl;
   Log() << "    "
         << col << "GDValidEveFrac" << colres
         << " to 1.0 (or a larger value)." << brk << Endl;
   Log() << "    In addition, if "
         << col << "GDPathEveFrac" << colres
         << " is too high," << Endl;
   Log() << "    the same warnings may occur, since the events" << Endl;
   Log() << "    used for error estimation are also used for" << Endl;
   Log() << "    path estimation." << Endl;
   Log() << "    Another possibility is to modify the model -" << Endl;
   Log() << "    see above on tuning the rule ensemble." << Endl;
   Log() << Endl;
   Log() << col << "The error rate was still decreasing at the end of the path"
         << colres << Endl;
   Log() << "    Too few steps in the path! Increase "
         << col << "GDNSteps" << colres << "." << Endl;
   Log() << Endl;
   Log() << col << "Reached minimum early in the search" << colres << Endl;
   Log() << "    The minimum was found early in the fitting. This" << Endl;
   Log() << "    may indicate that the used step size "
         << col << "GDStep" << colres << Endl;
   Log() << "    was too large. Reduce it and rerun." << Endl;
   Log() << "    If the results are still not OK, modify the" << Endl;
   Log() << "    model, either by modifying the rule ensemble" << Endl;
   Log() << "    or by adding/removing linear terms." << Endl;
}