ROOT_528-00b_version: hist/hist/src/TBinomialEfficiencyFitter.cxx Source File

00001 // @(#)root/hist:$Id: TBinomialEfficiencyFitter.cxx 36026 2010-10-01 16:17:09Z moneta $
00002 // Author: Frank Filthaut, Rene Brun   30/05/2007
00003 
00004 /*************************************************************************
00005  * Copyright (C) 1995-2007, Rene Brun and Fons Rademakers.               *
00006  * All rights reserved.                                                  *
00007  *                                                                       *
00008  * For the licensing terms see $ROOTSYS/LICENSE.                         *
00009  * For the list of contributors see $ROOTSYS/README/CREDITS.             *
00010  *************************************************************************/
00011 
00012 //////////////////////////////////////////////////////////////////////////
00013 //
00014 // TBinomialEfficiencyFitter
00015 //
00016 // Binomial fitter for the division of two histograms.
00017 // Use when you need to calculate a selection's efficiency from two histograms,
00018 // one containing all entries, and one containing the subset of these entries
00019 // that pass the selection, and when you have a parametrization available for
00020 // the efficiency as a function of the variable(s) under consideration.
00021 //
00022 // A very common problem when estimating efficiencies is that of error estimation:
00023 // when no other information is available than the total number of events N and
00024 // the selected number n, the best estimate for the selection efficiency p is n/N.
00025 // Standard binomial statistics dictates that the uncertainty (this presupposes
00026 // sufficiently high statistics that an approximation by a normal distribution is
00027 // reasonable) on p, given N, is
00028 //Begin_Latex
00029 //   #sqrt{#frac{p(1-p)}{N}}.
00030 //End_Latex
00031 // However, when p is estimated as n/N, fluctuations from the true p to its
00032 // estimate become important, especially for low numbers of events, and giving
00033 // rise to biased results.
00034 //
00035 // When fitting a parametrized efficiency, these problems can largely be overcome,
00036 // as a hypothesized true efficiency is available by construction. Even so, simply
00037 // using the corresponding uncertainty still presupposes that Gaussian errors
00038 // yields a reasonable approximation. When using, instead of binned efficiency
00039 // histograms, the original numerator and denominator histograms, a binned maximum
00040 // likelihood can be constructed as the product of bin-by-bin binomial probabilities
00041 // to select n out of N events. Assuming that a correct parametrization of the
00042 // efficiency is provided, this construction in general yields less biased results
00043 // (and is much less sensitive to binning details).
00044 //
00045 // A generic use of this method is given below (note that the method works for 2D
00046 // and 3D histograms as well):
00047 //
00048 // {
00049 //   TH1* denominator;              // denominator histogram
00050 //   TH1* numerator;                // corresponding numerator histogram
00051 //   TF1* eff;                      // efficiency parametrization
00052 //   ....                           // set step sizes and initial parameter
00053 //   ....                           //   values for the fit function
00054 //   ....                           // possibly also set ranges, see TF1::SetRange()
00055 //   TBinomialEfficiencyFitter* f = new TBinomialEfficiencyFitter(
00056 //                                      numerator, denominator);
00057 //   Int_t status = f->Fit(eff, "I");
00058 //   if (status == 0) {
00059 //      // if the fit was successful, display bin-by-bin efficiencies
00060 //      // as well as the result of the fit
00061 //      numerator->Sumw2();
00062 //      TH1* hEff = dynamic_cast<TH1*>(numerator->Clone("heff"));
00063 //      hEff->Divide(hEff, denominator, 1.0, 1.0, "B");
00064 //      hEff->Draw("E");
00065 //      eff->Draw("same");
00066 //   }
00067 // }
00068 //
00069 // Note that this method cannot be expected to yield reliable results when using
00070 // weighted histograms (because the likelihood computation will be incorrect).
00071 //////////////////////////////////////////////////////////////////////////
00072 
00073 #include "TBinomialEfficiencyFitter.h"
00074 
00075 #include "TMath.h"
00076 #include "TPluginManager.h"
00077 #include "TROOT.h"
00078 #include "TH1.h"
00079 #include "TF1.h"
00080 #include "TF2.h"
00081 #include "TF3.h"
00082 #include "TVirtualFitter.h"
00083 #include "TEnv.h"
00084 
00085 #include <limits>
00086 
00087 TVirtualFitter *TBinomialEfficiencyFitter::fgFitter = 0;
00088 
00089 const Double_t kDefaultEpsilon = 1E-12;
00090 
00091 ClassImp(TBinomialEfficiencyFitter)
00092 
00093 
00094 //______________________________________________________________________________
00095 TBinomialEfficiencyFitter::TBinomialEfficiencyFitter() {
00096    // default constructor
00097   
00098    fNumerator   = 0;
00099    fDenominator = 0;
00100    fFunction    = 0;
00101    fFitDone     = kFALSE;
00102    fAverage     = kFALSE;
00103    fRange       = kFALSE;
00104    fEpsilon     = kDefaultEpsilon;
00105 }
00106 
00107 //______________________________________________________________________________
00108 TBinomialEfficiencyFitter::TBinomialEfficiencyFitter(const TH1 *numerator, const TH1 *denominator) {
00109    // Constructor.
00110    //
00111    // Note that no objects are copied, so it is up to the user to ensure that the
00112    // histogram pointers remain valid.
00113    //
00114    // Both histograms need to be "consistent". This is not checked here, but in
00115    // TBinomialEfficiencyFitter::Fit().
00116 
00117    fEpsilon  = kDefaultEpsilon;
00118    fFunction = 0;
00119    Set(numerator,denominator);
00120 }
00121 
00122 //______________________________________________________________________________
00123 TBinomialEfficiencyFitter::~TBinomialEfficiencyFitter() {
00124    // destructor
00125    
00126    delete fgFitter; fgFitter = 0;
00127 }
00128 
00129 //______________________________________________________________________________
00130 void TBinomialEfficiencyFitter::Set(const TH1 *numerator, const TH1 *denominator)
00131 {
00132    // Initialize with a new set of inputs.
00133 
00134    fNumerator   = (TH1*)numerator;
00135    fDenominator = (TH1*)denominator;
00136 
00137    fFitDone     = kFALSE;
00138    fAverage     = kFALSE;
00139    fRange       = kFALSE;
00140 }
00141 
00142 //______________________________________________________________________________
00143 void TBinomialEfficiencyFitter::SetPrecision(Double_t epsilon)
00144 {
00145    // Set the required integration precision, see TF1::Integral()
00146    fEpsilon = epsilon;
00147 }
00148 
00149 //______________________________________________________________________________
00150 TVirtualFitter* TBinomialEfficiencyFitter::GetFitter()
00151 {
00152    // static: Provide access to the underlying fitter object.
00153    // This may be useful e.g. for the retrieval of additional information (such
00154    // as the output covariance matrix of the fit).
00155 
00156    return fgFitter;
00157 }
00158 
00159 //______________________________________________________________________________
00160 Int_t TBinomialEfficiencyFitter::Fit(TF1 *f1, Option_t* option) 
00161 {
00162    // Carry out the fit of the given function to the given histograms.
00163    //
00164    // If option "I" is used, the fit function will be averaged over the
00165    // bin (the default is to evaluate it simply at the bin center).
00166    //
00167    // If option "R" is used, the fit range will be taken from the fit
00168    // function (the default is to use the entire histogram).
00169    //
00170    // Note that all parameter values, limits, and step sizes are copied
00171    // from the input fit function f1 (so they should be set before calling
00172    // this method. This is particularly relevant for the step sizes, taken
00173    // to be the "error" set on input, as a null step size usually fixes the
00174    // corresponding parameter. That is protected against, but in such cases
00175    // an arbitrary starting step size will be used, and the reliability of
00176    // the fit should be questioned). If parameters are to be fixed, this
00177    // should be done by specifying non-null parameter limits, with lower
00178    // limits larger than upper limits.
00179    //
00180    // On output, f1 contains the fitted parameters and errors, as well as
00181    // the number of degrees of freedom, and the goodness-of-fit estimator
00182    // as given by S. Baker and R. Cousins, Nucl. Instr. Meth. A221 (1984) 437.
00183 
00184    TString opt = option;
00185    opt.ToUpper();
00186    fAverage  = opt.Contains("I");
00187    fRange    = opt.Contains("R");
00188    Bool_t verbose    = opt.Contains("V");
00189    if (!f1) return -1;
00190    fFunction = (TF1*)f1;
00191    Int_t i, npar;
00192    npar = f1->GetNpar();
00193    if (npar <= 0) {
00194       Error("Fit", "function %s has illegal number of parameters = %d", 
00195             f1->GetName(), npar);
00196       return -3;
00197    }
00198 
00199    // Check that function has same dimension as histogram
00200    if (!fNumerator || !fDenominator) {
00201       Error("Fit","No numerator or denominator histograms set");
00202       return -5;
00203    }
00204    if (f1->GetNdim() != fNumerator->GetDimension()) {
00205       Error("Fit","function %s dimension, %d, does not match histogram dimension, %d",
00206             f1->GetName(), f1->GetNdim(), fNumerator->GetDimension());
00207       return -4;
00208    }
00209    // Check that the numbers of bins for the histograms match
00210    if (fNumerator->GetNbinsX() != fDenominator->GetNbinsX() ||
00211        (f1->GetNdim() > 1 && fNumerator->GetNbinsY() != fDenominator->GetNbinsY()) ||
00212        (f1->GetNdim() > 2 && fNumerator->GetNbinsZ() != fDenominator->GetNbinsZ())) {
00213       Error("Fit", "numerator and denominator histograms do not have identical numbers of bins");
00214       return -6;
00215    }
00216 
00217    // initialize the fitter
00218 
00219    Int_t maxpar = npar;
00220    if (!fgFitter) {
00221       TPluginHandler *h;
00222       TString fitterName = TVirtualFitter::GetDefaultFitter(); 
00223       if (fitterName == "") 
00224          fitterName = gEnv->GetValue("Root.Fitter","Minuit");
00225       if ((h = gROOT->GetPluginManager()->FindHandler("TVirtualFitter", fitterName ))) {
00226          if (h->LoadPlugin() == -1)
00227             return 0;
00228          fgFitter = (TVirtualFitter*) h->ExecPlugin(1, maxpar);
00229       }
00230       if (!fgFitter) printf("ERROR fgFitter is NULL\n");
00231    }
00232 
00233    fgFitter->SetObjectFit(this);
00234    fgFitter->Clear();
00235    fgFitter->SetFCN(BinomialEfficiencyFitterFCN);
00236    Int_t nfixed = 0;
00237    Double_t al,bl,we,arglist[100];
00238    for (i = 0; i < npar; i++) {
00239       f1->GetParLimits(i,al,bl);
00240       if (al*bl != 0 && al >= bl) {
00241          al = bl = 0;
00242          arglist[nfixed] = i+1;
00243          nfixed++;
00244       }
00245       // assign an ARBITRARY starting error to ensure the parameter won't be fixed!
00246       we = f1->GetParError(i);
00247       if (we <= 0) we = 0.3*TMath::Abs(f1->GetParameter(i));
00248       if (we == 0) we = 0.01;
00249       fgFitter->SetParameter(i, f1->GetParName(i),
00250                                 f1->GetParameter(i),
00251                                 we, al,bl);
00252    }
00253    if (nfixed > 0) fgFitter->ExecuteCommand("FIX",arglist,nfixed); // Otto
00254 
00255    Double_t plist[2];
00256    plist[0] = 0.5;
00257    fgFitter->ExecuteCommand("SET ERRDEF",plist,1);
00258 
00259    if (verbose)   { 
00260       plist[0] = 3;
00261       fgFitter->ExecuteCommand("SET PRINT",plist,1);
00262    }
00263 
00264    // perform the actual fit
00265 
00266    fFitDone = kTRUE;
00267    plist[0] = TVirtualFitter::GetMaxIterations();
00268    plist[1] = TVirtualFitter::GetPrecision();
00269    Int_t result = fgFitter->ExecuteCommand("MINIMIZE",plist,2);
00270    
00271    //Store fit results in fitFunction
00272    char parName[50];
00273    Double_t par;
00274    Double_t eplus,eminus,eparab,globcc,werr;
00275    for (i = 0; i < npar; ++i) {
00276       fgFitter->GetParameter(i,parName, par,we,al,bl);
00277       fgFitter->GetErrors(i,eplus,eminus,eparab,globcc);
00278       if (eplus > 0 && eminus < 0) werr = 0.5*(eplus-eminus);
00279       else                         werr = we;
00280       f1->SetParameter(i,par);
00281       f1->SetParError(i,werr);
00282    }
00283    f1->SetNDF(f1->GetNumberFitPoints()-npar+nfixed);
00284    return result;
00285 }
00286 
00287 //______________________________________________________________________________
00288 void TBinomialEfficiencyFitter::ComputeFCN(Int_t& /*npar*/, Double_t* /* gin */,
00289                                            Double_t& f, Double_t* par, Int_t /*flag*/)
00290 {
00291    // Compute the likelihood.
00292 
00293    int nDim = fDenominator->GetDimension();
00294 
00295    int xlowbin  = fDenominator->GetXaxis()->GetFirst();
00296    int xhighbin = fDenominator->GetXaxis()->GetLast();
00297    int ylowbin = 0, yhighbin = 0, zlowbin = 0, zhighbin = 0;
00298    if (nDim > 1) {
00299       ylowbin  = fDenominator->GetYaxis()->GetFirst();
00300       yhighbin = fDenominator->GetYaxis()->GetLast();
00301       if (nDim > 2) {
00302          zlowbin  = fDenominator->GetZaxis()->GetFirst();
00303          zhighbin = fDenominator->GetZaxis()->GetLast();
00304       }
00305    }
00306 
00307    fFunction->SetParameters(par);
00308 
00309    if (fRange) {
00310       double xmin, xmax, ymin, ymax, zmin, zmax;
00311 
00312       // This way to ensure that a minimum range chosen exactly at a
00313       // bin boundary is far from elegant, but is hopefully adequate.
00314 
00315       if (nDim == 1) {
00316          fFunction->GetRange(xmin, xmax);
00317          xlowbin  = fDenominator->GetXaxis()->FindBin(xmin);
00318          xhighbin = fDenominator->GetXaxis()->FindBin(xmax);
00319       } else if (nDim == 2) {
00320          fFunction->GetRange(xmin, ymin, xmax, ymax);
00321          xlowbin  = fDenominator->GetXaxis()->FindBin(xmin);
00322          xhighbin = fDenominator->GetXaxis()->FindBin(xmax);
00323          ylowbin  = fDenominator->GetYaxis()->FindBin(ymin);
00324          yhighbin = fDenominator->GetYaxis()->FindBin(ymax);
00325       } else if (nDim == 3) {
00326          fFunction->GetRange(xmin, ymin, zmin, xmax, ymax, zmax);
00327          xlowbin  = fDenominator->GetXaxis()->FindBin(xmin);
00328          xhighbin = fDenominator->GetXaxis()->FindBin(xmax);
00329          ylowbin  = fDenominator->GetYaxis()->FindBin(ymin);
00330          yhighbin = fDenominator->GetYaxis()->FindBin(ymax);
00331          zlowbin  = fDenominator->GetZaxis()->FindBin(zmin);
00332          zhighbin = fDenominator->GetZaxis()->FindBin(zmax);
00333       }
00334    }
00335 
00336    // The coding below is perhaps somewhat awkward -- but it is done
00337    // so that 1D, 2D, and 3D cases can be covered in the same loops.
00338 
00339    f = 0.;
00340 
00341    Int_t npoints = 0;
00342    Double_t nmax = 0;
00343    for (int xbin = xlowbin; xbin <= xhighbin; ++xbin) {
00344 
00345       // compute the bin edges
00346       Double_t xlow = fDenominator->GetXaxis()->GetBinLowEdge(xbin);
00347       Double_t xup  = fDenominator->GetXaxis()->GetBinLowEdge(xbin+1);
00348 
00349       for (int ybin = ylowbin; ybin <= yhighbin; ++ybin) {
00350 
00351          // compute the bin edges (if applicable)
00352          Double_t ylow  = (nDim > 1) ? fDenominator->GetYaxis()->GetBinLowEdge(ybin) : 0;
00353          Double_t yup   = (nDim > 1) ? fDenominator->GetYaxis()->GetBinLowEdge(ybin+1) : 0;
00354 
00355          for (int zbin = zlowbin; zbin <= zhighbin; ++zbin) {
00356 
00357             // compute the bin edges (if applicable)
00358             Double_t zlow  = (nDim > 2) ? fDenominator->GetZaxis()->GetBinLowEdge(zbin) : 0;
00359             Double_t zup   = (nDim > 2) ? fDenominator->GetZaxis()->GetBinLowEdge(zbin+1) : 0;
00360 
00361             int bin = fDenominator->GetBin(xbin, ybin, zbin);
00362             Double_t nDen = fDenominator->GetBinContent(bin);
00363             Double_t nNum = fNumerator->GetBinContent(bin);
00364 
00365             // count maximum value to use in the likelihood for inf
00366             // i.e. a number much larger than the other terms  
00367             if (nDen> nmax) nmax = nDen; 
00368             if (nDen <= 0.) continue;
00369             npoints++;
00370       
00371             // mu is the average of the function over the bin OR
00372             // the function evaluated at the bin centre
00373             // As yet, there is nothing to prevent mu from being 
00374             // outside the range <0,1> !!
00375 
00376             Double_t mu = 0;
00377             switch (nDim) {
00378                case 1:
00379                   mu = (fAverage) ?
00380                      fFunction->Integral(xlow, xup, (Double_t*)0, fEpsilon) 
00381                         / (xup-xlow) :
00382                      fFunction->Eval(fDenominator->GetBinCenter(bin));
00383                   break;
00384                case 2:
00385                   {
00386                      mu = (fAverage) ?
00387                      fFunction->Integral(xlow, xup, ylow, yup, fEpsilon) 
00388                         / ((xup-xlow)*(yup-ylow)) :
00389                      fFunction->Eval(fDenominator->GetXaxis()->GetBinCenter(xbin),
00390                      fDenominator->GetYaxis()->GetBinCenter(ybin));
00391                   }
00392                   break;
00393                case 3:
00394                   {
00395                      mu = (fAverage) ?
00396                         fFunction->Integral(xlow, xup, ylow, yup, zlow, zup, fEpsilon)
00397                            / ((xup-xlow)*(yup-ylow)*(zup-zlow)) :
00398                         fFunction->Eval(fDenominator->GetXaxis()->GetBinCenter(xbin),
00399                                  fDenominator->GetYaxis()->GetBinCenter(ybin),
00400                                  fDenominator->GetZaxis()->GetBinCenter(zbin));
00401                   }
00402             }
00403 
00404             // binomial formula (forgetting about the factorials)
00405             if (nNum != 0.) {
00406                if (mu > 0.)
00407                   f -= nNum * TMath::Log(mu*nDen/nNum);
00408                else
00409                   f -= nmax * -1E30; // crossing our fingers
00410             }
00411             if (nDen - nNum != 0.) {
00412                if (1. - mu > 0.)
00413                   f -= (nDen - nNum) * TMath::Log((1. - mu)*nDen/(nDen-nNum));
00414                else 
00415                   f -= nmax * -1E30; // crossing our fingers
00416             }
00417          }
00418       }
00419    }
00420 
00421    fFunction->SetNumberFitPoints(npoints);
00422    fFunction->SetChisquare(2.*f);    // store goodness of fit (Baker&Cousins)
00423 }
00424 
00425 //______________________________________________________________________________
00426 void BinomialEfficiencyFitterFCN(Int_t& npar, Double_t* gin, Double_t& f, 
00427                                  Double_t* par, Int_t flag)
00428 {
00429    // Function called by the minimisation package. The actual functionality is 
00430    // passed on to the TBinomialEfficiencyFitter::ComputeFCN() member function.
00431 
00432    TBinomialEfficiencyFitter* fitter = dynamic_cast<TBinomialEfficiencyFitter*>(TBinomialEfficiencyFitter::GetFitter()->GetObjectFit());
00433    if (!fitter) {
00434       Error("binomialFCN","Invalid fit object encountered!");
00435       return;
00436    }
00437    fitter->ComputeFCN(npar, gin, f, par, flag);
00438 }