ROOT_528-00b_version: tmva/inc/RuleFitParams.h Source File

00001 // @(#)root/tmva $Id: RuleFitParams.h 33928 2010-06-15 16:19:31Z stelzer $
00002 // Author: Andreas Hoecker, Joerg Stelzer, Fredrik Tegenfeldt, Helge Voss
00003 
00004 /**********************************************************************************
00005  * Project: TMVA - a Root-integrated toolkit for multivariate data analysis       *
00006  * Package: TMVA                                                                  *
00007  * Class  : RuleFitParams                                                         *
00008  * Web    : http://tmva.sourceforge.net                                           *
00009  *                                                                                *
00010  * Description:                                                                   *
00011  *      A class doing the actual fitting of a linear model using rules as         *
00012  *      base functions.                                                           *
00013  *      Reference paper: 1.Gradient Directed Regularization                       *
00014  *                         Friedman, Popescu, 2004                                *
00015  *                       2.Predictive Learning with Rule Ensembles                *
00016  *                         Friedman, Popescu, 2005                                *
00017  *                                                                                *
00018  *                                                                                *
00019  * Authors (alphabetical):                                                        *
00020  *      Fredrik Tegenfeldt <Fredrik.Tegenfeldt@cern.ch> - Iowa State U., USA      *
00021  *      Helge Voss         <Helge.Voss@cern.ch>         - MPI-KP Heidelberg, Ger. *
00022  *                                                                                *
00023  * Copyright (c) 2005:                                                            *
00024  *      CERN, Switzerland                                                         * 
00025  *      Iowa State U.                                                             *
00026  *      MPI-K Heidelberg, Germany                                                 * 
00027  *                                                                                *
00028  * Redistribution and use in source and binary forms, with or without             *
00029  * modification, are permitted according to the terms listed in LICENSE           *
00030  * (http://tmva.sourceforge.net/LICENSE)                                          *
00031  **********************************************************************************/
00032 
00033 #ifndef ROOT_TMVA_RuleFitParams
00034 #define ROOT_TMVA_RuleFitParams
00035 
00036 #if ROOT_VERSION_CODE >= 364802
00037 #ifndef ROOT_TMathBase
00038 #include "TMathBase.h"
00039 #endif
00040 #else
00041 #ifndef ROOT_TMath
00042 #include "TMath.h"
00043 #endif
00044 #endif
00045 
00046 #ifndef ROOT_TMVA_Event
00047 #include "TMVA/Event.h"
00048 #endif
00049 
00050 class TTree;
00051 
00052 namespace TMVA {
00053 
00054    class RuleEnsemble;
00055    class MsgLogger;
00056    class RuleFit;
00057    class RuleFitParams {
00058 
00059    public:
00060 
00061       RuleFitParams();
00062       virtual ~RuleFitParams();
00063 
00064       void Init();
00065 
00066       // set message type
00067       void SetMsgType( EMsgType t );
00068 
00069       // set RuleFit ptr
00070       void SetRuleFit( RuleFit *rf )    { fRuleFit = rf; }
00071       //
00072       // GD path: set N(path steps)
00073       void SetGDNPathSteps( Int_t np )  { fGDNPathSteps = np; }
00074 
00075       // GD path: set path step size
00076       void SetGDPathStep( Double_t s )  { fGDPathStep = s; }
00077 
00078       // GD path: set tau search range
00079       void SetGDTauRange( Double_t t0, Double_t t1 )
00080       {
00081          fGDTauMin = (t0>1.0 ? 1.0:(t0<0.0 ? 0.0:t0));
00082          fGDTauMax = (t1>1.0 ? 1.0:(t1<0.0 ? 0.0:t1));
00083          if (fGDTauMax<fGDTauMin) fGDTauMax = fGDTauMin;
00084       }
00085 
00086       // GD path: set number of steps in tau search range
00087       void SetGDTauScan( UInt_t n )        { fGDTauScan = n; }
00088 
00089       // GD path: set tau
00090       void SetGDTau( Double_t t ) { fGDTau = t; }
00091 
00092 
00093       void SetGDErrScale( Double_t s ) { fGDErrScale = s; }
00094       void SetGDTauPrec( Double_t p )  { fGDTauPrec=p; CalcGDNTau(); fGDTauVec.resize(fGDNTau); }
00095 
00096       // return type such that +1 = signal and -1 = background
00097       Int_t Type( const Event * e ) const; // return (fRuleFit->GetMethodRuleFit()->DataInfo().IsSignal(e) ? 1:-1); }
00098       //
00099       UInt_t                            GetPathIdx1() const { return fPathIdx1; }
00100       UInt_t                            GetPathIdx2() const { return fPathIdx2; }
00101       UInt_t                            GetPerfIdx1() const { return fPerfIdx1; }
00102       UInt_t                            GetPerfIdx2() const { return fPerfIdx2; }
00103 
00104       // Loss function; Huber loss eq 33
00105       Double_t LossFunction( const Event& e ) const;
00106 
00107       // same but using evt idx (faster)
00108       Double_t LossFunction( UInt_t evtidx ) const;
00109       Double_t LossFunction( UInt_t evtidx, UInt_t itau ) const;
00110 
00111       // Empirical risk
00112       Double_t Risk(UInt_t ind1, UInt_t ind2, Double_t neff) const;
00113       Double_t Risk(UInt_t ind1, UInt_t ind2, Double_t neff, UInt_t itau) const;
00114 
00115       // Risk evaluation for fPathIdx and fPerfInd
00116       Double_t RiskPath() const { return Risk(fPathIdx1,fPathIdx2,fNEveEffPath); }
00117       Double_t RiskPerf() const { return Risk(fPerfIdx1,fPerfIdx2,fNEveEffPerf); }
00118       Double_t RiskPerf( UInt_t itau ) const { return Risk(fPerfIdx1,fPerfIdx2,fNEveEffPerf,itau); }
00119 
00120       // Risk evaluation for all tau
00121       UInt_t RiskPerfTst();
00122     
00123       // Penalty function; Lasso function (eq 8)
00124       Double_t Penalty() const;
00125 
00126       // initialize GD path
00127       void InitGD();
00128 
00129       // find best tau and return the number of scan steps used
00130       Int_t FindGDTau();
00131 
00132       // make path for binary classification (squared-error ramp, sect 6 in ref 1)
00133       void MakeGDPath();
00134 
00135    protected:
00136 
00137       // typedef of an Event const iterator
00138       typedef std::vector<TMVA::Event *>::const_iterator  EventItr;
00139 
00140       // init ntuple
00141       void InitNtuple();
00142 
00143       // calculate N(tau) in scan - limit to 100000.
00144       void CalcGDNTau()  { fGDNTau = static_cast<UInt_t>(1.0/fGDTauPrec)+1; if (fGDNTau>100000) fGDNTau=100000; }
00145 
00146       // fill ntuple with coefficient info
00147       void FillCoefficients();
00148 
00149       // estimate the optimum scoring function
00150       void CalcFStar();
00151 
00152       // estimate of binary error rate
00153       Double_t ErrorRateBin();
00154 
00155       // estimate of scale average error rate
00156       Double_t ErrorRateReg();
00157 
00158       // estimate 1-area under ROC
00159       Double_t ErrorRateRocRaw( std::vector<Double_t> & sFsig, std::vector<Double_t> & sFbkg );
00160       Double_t ErrorRateRoc();
00161       void     ErrorRateRocTst();
00162 
00163       // estimate optimism
00164       Double_t Optimism();
00165 
00166       // make gradient vector (eq 44 in ref 1)
00167       void MakeGradientVector();
00168 
00169       // Calculate the direction in parameter space (eq 25, ref 1) and update coeffs (eq 22, ref 1)
00170       void UpdateCoefficients();
00171 
00172       // calculate average of responses of F
00173       Double_t CalcAverageResponse();
00174       Double_t CalcAverageResponseOLD();
00175 
00176       // calculate average of true response (initial estimate of a0)
00177       Double_t CalcAverageTruth();
00178 
00179       // calculate the average of each variable over the range
00180       void EvaluateAverage(UInt_t ind1, UInt_t ind2,
00181                            std::vector<Double_t> &avsel,
00182                            std::vector<Double_t> &avrul);
00183 
00184       // evaluate using fPathIdx1,2
00185       void EvaluateAveragePath() { EvaluateAverage( fPathIdx1, fPathIdx2, fAverageSelectorPath, fAverageRulePath ); }
00186 
00187       // evaluate using fPerfIdx1,2
00188       void EvaluateAveragePerf() { EvaluateAverage( fPerfIdx1, fPerfIdx2, fAverageSelectorPerf, fAverageRulePerf ); }
00189 
00190       // the same as above but for the various tau
00191       void MakeTstGradientVector();
00192       void UpdateTstCoefficients();
00193       void CalcTstAverageResponse();
00194 
00195 
00196       RuleFit             * fRuleFit;      // rule fit
00197       RuleEnsemble        * fRuleEnsemble; // rule ensemble
00198       //
00199       UInt_t                fNRules;       // number of rules
00200       UInt_t                fNLinear;      // number of linear terms
00201       //
00202       // Event indecis for path/validation - TODO: should let the user decide
00203       // Now it is just a simple one-fold cross validation.
00204       //
00205       UInt_t                fPathIdx1;       // first event index for path search
00206       UInt_t                fPathIdx2;       // last event index for path search
00207       UInt_t                fPerfIdx1;       // first event index for performance evaluation
00208       UInt_t                fPerfIdx2;       // last event index for performance evaluation
00209       Double_t              fNEveEffPath;    // sum of weights for Path events
00210       Double_t              fNEveEffPerf;    // idem for Perf events
00211 
00212       std::vector<Double_t> fAverageSelectorPath; // average of each variable over the range fPathIdx1,2
00213       std::vector<Double_t> fAverageRulePath;     // average of each rule, same range
00214       std::vector<Double_t> fAverageSelectorPerf; // average of each variable over the range fPerfIdx1,2
00215       std::vector<Double_t> fAverageRulePerf;     // average of each rule, same range
00216 
00217       std::vector<Double_t> fGradVec;        // gradient vector - dimension = number of rules in ensemble
00218       std::vector<Double_t> fGradVecLin;     // gradient vector - dimension = number of variables
00219 
00220       std::vector< std::vector<Double_t> > fGradVecTst;    // gradient vector - one per tau
00221       std::vector< std::vector<Double_t> > fGradVecLinTst; // gradient vector, linear terms - one per tau
00222       //
00223       std::vector<Double_t> fGDErrTst;     // error rates per tau
00224       std::vector<Char_t>   fGDErrTstOK;   // error rate is sufficiently low <--- stores boolean
00225       std::vector< std::vector<Double_t> > fGDCoefTst;    // rule coeffs - one per tau
00226       std::vector< std::vector<Double_t> > fGDCoefLinTst; // linear coeffs - one per tau
00227       std::vector<Double_t> fGDOfsTst;       // offset per tau
00228       std::vector< Double_t > fGDTauVec;     // the tau's
00229       UInt_t                fGDNTauTstOK;    // number of tau in the test-phase that are ok
00230       UInt_t                fGDNTau;         // number of tau-paths - calculated in SetGDTauPrec
00231       Double_t              fGDTauPrec;      // precision in tau
00232       UInt_t                fGDTauScan;      // number scan for tau-paths
00233       Double_t              fGDTauMin;       // min threshold parameter (tau in eq 26, ref 1)
00234       Double_t              fGDTauMax;       // max threshold parameter (tau in eq 26, ref 1)
00235       Double_t              fGDTau;          // selected threshold parameter (tau in eq 26, ref 1)
00236       Double_t              fGDPathStep;     // step size along path (delta nu in eq 22, ref 1)
00237       Int_t                 fGDNPathSteps;   // number of path steps
00238       Double_t              fGDErrScale;     // stop scan at error = scale*errmin
00239       //
00240       Double_t              fAverageTruth;   // average truth, ie sum(y)/N, y=+-1
00241       //
00242       std::vector<Double_t> fFstar;          // vector of F*() - filled in CalcFStar()
00243       Double_t              fFstarMedian;    // median value of F*() using 
00244       //
00245       TTree                *fGDNtuple;       // Gradient path ntuple, contains params for each step along the path
00246       Double_t              fNTRisk;         // GD path: risk
00247       Double_t              fNTErrorRate;    // GD path: error rate (or performance)
00248       Double_t              fNTNuval;        // GD path: value of nu
00249       Double_t              fNTCoefRad;      // GD path: 'radius' of all rulecoeffs
00250       Double_t              fNTOffset;       // GD path: model offset
00251       Double_t             *fNTCoeff;        // GD path: rule coefficients
00252       Double_t             *fNTLinCoeff;     // GD path: linear coefficients
00253 
00254       Double_t              fsigave;         // Sigma of current signal score function F(sig)
00255       Double_t              fsigrms;         // Rms of F(sig)
00256       Double_t              fbkgave;         // Average of F(bkg)
00257       Double_t              fbkgrms;         // Rms of F(bkg)
00258 
00259    private:
00260 
00261       mutable MsgLogger*    fLogger;         //! message logger
00262       MsgLogger& Log() const { return *fLogger; }                       
00263 
00264    };
00265 
00266    // --------------------------------------------------------
00267 
00268    class AbsValue {
00269       // binary predicate comparing two values by magnitude: true when |first| < |second|
00270    public:
00271 
00272       Bool_t operator()( Double_t first, Double_t second ) const { const Double_t lhs = TMath::Abs(first); const Double_t rhs = TMath::Abs(second); return lhs < rhs; }
00273    };
00274 }
00275 
00276 
00277 #endif