ROOT_528-00b_version: hist/hist/src/TLimit.cxx Source File

00001 // @(#)root/hist:$Id: TLimit.cxx 26221 2008-11-17 08:10:26Z brun $
00002 // Author: Christophe.Delaere@cern.ch   21/08/2002
00003 
00004 ///////////////////////////////////////////////////////////////////////////
00005 //
00006 // TLimit
00007 //
00008 // Class to compute 95% CL limits
00009 //
00010 // adapted from the mclimit code from Tom Junk (CLs method)
00011 // see http://root.cern.ch/root/doc/TomJunk.pdf
00012 // see http://cern.ch/thomasj/searchlimits/ecl.html
00013 // see: Tom Junk,NIM A434, p. 435-443, 1999
00014 //
00015 // see also the following interesting references:
00016 //  Alex Read, "Presentation of search results: the CLs technique"
00017 //  Journal of Physics G: Nucl. Part. Phys. 28 2693-2704 (2002).
00018 //  http://www.iop.org/EJ/abstract/0954-3899/28/10/313/
00019 //
00020 // A nice article is also available in the CERN yellow report with the proceeding
00021 // of the 2000 CERN workshop on confidence intervals.
00022 //
00023 //  Alex Read, "Modified Frequentist Analysis of Search Results (The CLs Method)"
00024 //  CERN 2000-005 (30 May 2000)
00025 //
00026 // see note about: "Should I use TRolke, TFeldmanCousins, TLimit?"
00027 //  in the TRolke class description.
00028 //
00029 ///////////////////////////////////////////////////////////////////////////
00030 
00031 #include "TLimit.h"
00032 #include "TArrayD.h"
00033 #include "TVectorD.h"
00034 #include "TOrdCollection.h"
00035 #include "TConfidenceLevel.h"
00036 #include "TLimitDataSource.h"
00037 #include "TRandom3.h"
00038 #include "TH1.h"
00039 #include "TObjArray.h"
00040 #include "TMath.h"
00041 #include "TIterator.h"
00042 #include "TObjString.h"
00043 #include "TClassTable.h"
00044 #include "Riostream.h"
00045 
00046 ClassImp(TLimit)
00047 
00048 TArrayD *TLimit::fgTable = new TArrayD(0);
00049 TOrdCollection *TLimit::fgSystNames = new TOrdCollection();
00050 
00051 TConfidenceLevel *TLimit::ComputeLimit(TLimitDataSource * data,
00052                                        Int_t nmc, bool stat,
00053                                        TRandom * generator)
00054 {
00055    // class TLimit
00056    // ------------
00057    // Algorithm to compute 95% C.L. limits using the Likelihood ratio
00058    // semi-bayesian method.
00059    // It takes signal, background and data histograms wrapped in a
00060    // TLimitDataSource as input and runs a set of Monte Carlo experiments in
00061    // order to compute the limits. If needed, inputs are fluctuated according
00062    // to systematics. The output is a TConfidenceLevel.
00063    //
00064    // class TLimitDataSource
00065    // ----------------------
00066    //
00067    // Takes the signal, background and data histograms as well as different
00068    // systematics sources to form the TLimit input.
00069    //
00070    //  class TConfidenceLevel
00071    //  ----------------------
00072    //
00073    // Final result of the TLimit algorithm. It is created just after the
00074    // time-consuming part and can be stored in a TFile for further processing.
00075    // It contains light methods to return CLs, CLb and other interesting
00076    // quantities.
00077    //
00078    // The actual algorithm...
00079    // From an input (TLimitDataSource) it produces an output TConfidenceLevel.
00080    // For this, nmc Monte Carlo experiments are performed.
00081    // As usual, the larger this number, the longer the compute time,
00082    // but the better the result.
00083    //Begin_Html
00084    /*
00085    <FONT SIZE=+0>
00086    <p>Supposing that there is a plotfile.root file containing 3 histograms
00087            (signal, background and data), you can imagine doing things like:</p>
00088    <p>
00089    <BLOCKQUOTE><PRE>
00090     TFile* infile=new TFile("plotfile.root","READ");
00091     infile->cd();
00092     TH1* sh=(TH1*)infile->Get("signal");
00093     TH1* bh=(TH1*)infile->Get("background");
00094     TH1* dh=(TH1*)infile->Get("data");
00095     TLimitDataSource* mydatasource = new TLimitDataSource(sh,bh,dh);
00096     TConfidenceLevel *myconfidence = TLimit::ComputeLimit(mydatasource,50000);
00097     cout &lt&lt "  CLs    : " &lt&lt myconfidence->CLs()  &lt&lt endl;
00098     cout &lt&lt "  CLsb   : " &lt&lt myconfidence->CLsb() &lt&lt endl;
00099     cout &lt&lt "  CLb    : " &lt&lt myconfidence->CLb()  &lt&lt endl;
00100     cout &lt&lt "&lt CLs &gt  : " &lt&lt myconfidence->GetExpectedCLs_b()  &lt&lt endl;
00101     cout &lt&lt "&lt CLsb &gt : " &lt&lt myconfidence->GetExpectedCLsb_b() &lt&lt endl;
00102     cout &lt&lt "&lt CLb &gt  : " &lt&lt myconfidence->GetExpectedCLb_b()  &lt&lt endl;
00103     delete myconfidence;
00104     delete mydatasource;
00105     infile->Close();
00106    </PRE></BLOCKQUOTE></p>
00107    <p></p>
00108    <p>More informations can still be found on
00109    <a HREF="http://cern.ch/aleph-proj-alphapp/doc/tlimit.html">this</a> page.</p>
00110    </FONT>
00111    */
00112    //End_Html
00113 
00114    // The final object returned...
00115    TConfidenceLevel *result = new TConfidenceLevel(nmc);
00116    // The random generator used...
00117    TRandom *myrandom = generator ? generator : new TRandom3;
00118    // Compute some total quantities on all the channels
00119    Int_t maxbins = 0;
00120    Double_t nsig = 0;
00121    Double_t nbg  = 0;
00122    Int_t ncand   = 0;
00123    Int_t i;
00124    for (i = 0; i <= data->GetSignal()->GetLast(); i++) {
00125       maxbins = (((TH1 *) (data->GetSignal()->At(i)))->GetNbinsX() + 2) > maxbins ?
00126                  (((TH1 *) (data->GetSignal()->At(i)))->GetNbinsX() + 2) : maxbins;
00127       nsig   +=  ((TH1 *) (data->GetSignal()->At(i)))->Integral();
00128       nbg    +=  ((TH1 *) (data->GetBackground()->At(i)))->Integral();
00129       ncand  +=  (Int_t) ((TH1 *) (data->GetCandidates()->At(i)))->Integral();
00130    }
00131    result->SetBtot(nbg);
00132    result->SetStot(nsig);
00133    result->SetDtot(ncand);
00134    Double_t buffer = 0;
00135    fgTable->Set(maxbins * (data->GetSignal()->GetLast() + 1));
00136    for (Int_t channel = 0; channel <= data->GetSignal()->GetLast(); channel++)
00137       for (Int_t bin = 0;
00138            bin <= ((TH1 *) (data->GetSignal()->At(channel)))->GetNbinsX()+1;
00139            bin++) {
00140          Double_t s = (Double_t) ((TH1 *) (data->GetSignal()->At(channel)))->GetBinContent(bin);
00141          Double_t b = (Double_t) ((TH1 *) (data->GetBackground()->At(channel)))->GetBinContent(bin);
00142          Double_t d = (Double_t) ((TH1 *) (data->GetCandidates()->At(channel)))->GetBinContent(bin);
00143          // Compute the value of the "-2lnQ" for the actual data
00144          if ((b == 0) && (s > 0)) {
00145             cout << "WARNING: Ignoring bin " << bin << " of channel "
00146                  << channel << " which has s=" << s << " but b=" << b << endl;
00147             cout << "         Maybe the MC statistic has to be improved..." << endl;
00148          }
00149          if ((s > 0) && (b > 0))
00150             buffer += LogLikelihood(s, b, b, d);
00151          // precompute the log(1+s/b)'s in an array to speed up computation
00152          // background-free bins are set to have a maximum t.s. value
00153          // for protection (corresponding to s/b of about 5E8)
00154          if ((s > 0) && (b > 0))
00155             fgTable->AddAt(LogLikelihood(s, b, b, 1), (channel * maxbins) + bin);
00156          else if ((s > 0) && (b == 0))
00157             fgTable->AddAt(20, (channel * maxbins) + bin);
00158       }
00159    result->SetTSD(buffer);
00160    // accumulate MC experiments.  Hold the test statistic function fixed, but
00161    // fluctuate s and b within syst. errors for computing probabilities of
00162    // having that outcome.  (Alex Read's prescription -- errors are on the ensemble,
00163    // not on the observed test statistic.  This technique does not split outcomes.)
00164    // keep the tstats as sum log(1+s/b). convert to -2lnQ when preparing the results
00165    // (reason -- like to keep the < signs right)
00166    Double_t *tss = new Double_t[nmc];
00167    Double_t *tsb = new Double_t[nmc];
00168    Double_t *lrs = new Double_t[nmc];
00169    Double_t *lrb = new Double_t[nmc];
00170    TLimitDataSource* tmp_src = (TLimitDataSource*)(data->Clone());
00171    TLimitDataSource* tmp_src2 = (TLimitDataSource*)(data->Clone());
00172    for (i = 0; i < nmc; i++) {
00173       tss[i] = 0;
00174       tsb[i] = 0;
00175       lrs[i] = 0;
00176       lrb[i] = 0;
00177       // fluctuate signal and background
00178       TLimitDataSource*  fluctuated = Fluctuate(data, tmp_src, !i, myrandom, stat) ? tmp_src : data;
00179       // make an independent set of fluctuations for reweighting pseudoexperiments
00180       // it turns out using the same fluctuated ones for numerator and denominator
00181       // is biased.
00182       TLimitDataSource*  fluctuated2 = Fluctuate(data, tmp_src2, false, myrandom, stat) ? tmp_src2 : data;
00183 
00184       for (Int_t channel = 0;
00185            channel <= fluctuated->GetSignal()->GetLast(); channel++) {
00186          for (Int_t bin = 0;
00187               bin <=((TH1 *) (fluctuated->GetSignal()->At(channel)))->GetNbinsX()+1;
00188               bin++) {
00189             if ((Double_t) ((TH1 *) (fluctuated->GetSignal()->At(channel)))->GetBinContent(bin) != 0) {
00190                // s+b hypothesis
00191                Double_t rate = (Double_t) ((TH1 *) (fluctuated->GetSignal()->At(channel)))->GetBinContent(bin) +
00192                                (Double_t) ((TH1 *) (fluctuated->GetBackground()->At(channel)))->GetBinContent(bin);
00193                Double_t rand = myrandom->Poisson(rate);
00194                tss[i] += rand * fgTable->At((channel * maxbins) + bin);
00195                Double_t s = (Double_t) ((TH1 *) (fluctuated->GetSignal()->At(channel)))->GetBinContent(bin);
00196                Double_t s2= (Double_t) ((TH1 *) (fluctuated2->GetSignal()->At(channel)))->GetBinContent(bin);
00197                Double_t b = (Double_t) ((TH1 *) (fluctuated->GetBackground()->At(channel)))->GetBinContent(bin);
00198                Double_t b2= (Double_t) ((TH1 *) (fluctuated2->GetBackground()->At(channel)))->GetBinContent(bin);
00199                if ((s > 0) && (b2 > 0))
00200                   lrs[i] += LogLikelihood(s, b, b2, rand) - s - b + b2;
00201                else if ((s > 0) && (b2 == 0))
00202                   lrs[i] += 20 * rand - s;
00203                // b hypothesis
00204                rate = (Double_t) ((TH1 *) (fluctuated->GetBackground()->At(channel)))->GetBinContent(bin);
00205                rand = myrandom->Poisson(rate);
00206                tsb[i] += rand * fgTable->At((channel * maxbins) + bin);
00207                if ((s2 > 0) && (b > 0))
00208                   lrb[i] += LogLikelihood(s2, b2, b, rand) - s2 - b2 + b;
00209                else if ((s > 0) && (b == 0))
00210                   lrb[i] += 20 * rand - s;
00211             }
00212          }
00213       }
00214       lrs[i] = TMath::Exp(lrs[i] < 710 ? lrs[i] : 710);
00215       lrb[i] = TMath::Exp(lrb[i] < 710 ? lrb[i] : 710);
00216    }
00217    delete tmp_src;
00218    delete tmp_src2;
00219    // lrs and lrb are the LR's (no logs) = prob(s+b)/prob(b) for
00220    // that choice of s and b within syst. errors in the ensemble.  These are
00221    // the MC experiment weights for relating the s+b and b PDF's of the unsmeared
00222    // test statistic (in which cas one can use another test statistic if one likes).
00223 
00224    // Now produce the output object.
00225    // The final quantities are computed on-demand form the arrays tss, tsb, lrs and lrb.
00226    result->SetTSS(tss);
00227    result->SetTSB(tsb);
00228    result->SetLRS(lrs);
00229    result->SetLRB(lrb);
00230    if (!generator)
00231       delete myrandom;
00232    return result;
00233 }
00234 
00235 bool TLimit::Fluctuate(TLimitDataSource * input, TLimitDataSource * output,
00236                        bool init, TRandom * generator, bool stat)
00237 {
00238    // initialisation: create a sorted list of all the names of systematics
00239    if (init) {
00240       // create a "map" with the systematics names
00241       TIterator *errornames = input->GetErrorNames()->MakeIterator();
00242       TObjArray *listofnames = 0;
00243       delete fgSystNames;
00244       fgSystNames = new TOrdCollection();
00245       while ((listofnames = ((TObjArray *) errornames->Next()))) {
00246          TObjString *name = 0;
00247          TIterator *loniter = listofnames->MakeIterator();
00248          while ((name = (TObjString *) (loniter->Next())))
00249             if ((fgSystNames->IndexOf(name)) < 0)
00250                fgSystNames->AddLast(name);
00251       }
00252       fgSystNames->Sort();
00253    }
00254    // if the output is not given, create it from the input
00255    if (!output)
00256    output = (TLimitDataSource*)(input->Clone());
00257    // if there are no systematics, just returns the input as "fluctuated" output
00258    if ((fgSystNames->GetSize() <= 0)&&(!stat)) {
00259       return 0;
00260    }
00261    // if there are only stat, just fluctuate stats.
00262    if (fgSystNames->GetSize() <= 0) {
00263       output->SetOwner();
00264       for (Int_t channel = 0; channel <= input->GetSignal()->GetLast(); channel++) {
00265          TH1 *newsignal = (TH1*)(output->GetSignal()->At(channel));
00266          TH1 *oldsignal = (TH1*)(input->GetSignal()->At(channel));
00267          if(stat)
00268             for(int i=1; i<=newsignal->GetNbinsX(); i++) {
00269                newsignal->SetBinContent(i,oldsignal->GetBinContent(i)+generator->Gaus(0,oldsignal->GetBinError(i)));
00270             }
00271             newsignal->SetDirectory(0);
00272             TH1 *newbackground = (TH1*)(output->GetBackground()->At(channel));
00273             TH1 *oldbackground = (TH1*)(input->GetBackground()->At(channel));
00274             if(stat)
00275                for(int i=1; i<=newbackground->GetNbinsX(); i++)
00276                   newbackground->SetBinContent(i,oldbackground->GetBinContent(i)+generator->Gaus(0,oldbackground->GetBinError(i)));
00277             newbackground->SetDirectory(0);
00278       }
00279       return 1;
00280    }
00281    // Find a choice for the random variation and
00282    // re-toss all random numbers if any background or signal
00283    // goes negative.  (background = 0 is bad too, so put a little protection
00284    // around it -- must have at least 10% of the bg estimate).
00285    Bool_t retoss   = kTRUE;
00286    Double_t *serrf = new Double_t[(input->GetSignal()->GetLast()) + 1];
00287    Double_t *berrf = new Double_t[(input->GetSignal()->GetLast()) + 1];
00288    do {
00289       Double_t *toss = new Double_t[fgSystNames->GetSize()];
00290       for (Int_t i = 0; i < fgSystNames->GetSize(); i++)
00291          toss[i] = generator->Gaus(0, 1);
00292       retoss = kFALSE;
00293       for (Int_t channel = 0;
00294            channel <= input->GetSignal()->GetLast();
00295            channel++) {
00296          serrf[channel] = 0;
00297          berrf[channel] = 0;
00298          for (Int_t bin = 0;
00299               bin <((TVectorD *) (input->GetErrorOnSignal()->At(channel)))->GetNrows();
00300               bin++) {
00301             serrf[channel] += ((TVectorD *) (input->GetErrorOnSignal()->At(channel)))->operator[](bin) *
00302                 toss[fgSystNames->BinarySearch((TObjString*) (((TObjArray *) (input->GetErrorNames()->At(channel)))->At(bin)))];
00303             berrf[channel] += ((TVectorD *) (input->GetErrorOnBackground()->At(channel)))->operator[](bin) *
00304                 toss[fgSystNames->BinarySearch((TObjString*) (((TObjArray *) (input->GetErrorNames()->At(channel)))->At(bin)))];
00305          }
00306          if ((serrf[channel] < -1.0) || (berrf[channel] < -0.9)) {
00307             retoss = kTRUE;
00308             continue;
00309          }
00310       }
00311       delete[]toss;
00312    } while (retoss);
00313    // adjust the fluctuated signal and background counts with a legal set
00314    // of random fluctuations above.
00315    output->SetOwner();
00316    for (Int_t channel = 0; channel <= input->GetSignal()->GetLast();
00317         channel++) {
00318       TH1 *newsignal = (TH1*)(output->GetSignal()->At(channel));
00319       TH1 *oldsignal = (TH1*)(input->GetSignal()->At(channel));
00320       if(stat)
00321          for(int i=1; i<=newsignal->GetNbinsX(); i++)
00322             newsignal->SetBinContent(i,oldsignal->GetBinContent(i)+generator->Gaus(0,oldsignal->GetBinError(i)));
00323       else
00324          for(int i=1; i<=newsignal->GetNbinsX(); i++)
00325             newsignal->SetBinContent(i,oldsignal->GetBinContent(i));
00326       newsignal->Scale(1 + serrf[channel]);
00327       newsignal->SetDirectory(0);
00328       TH1 *newbackground = (TH1*)(output->GetBackground()->At(channel));
00329       TH1 *oldbackground = (TH1*)(input->GetBackground()->At(channel));
00330       if(stat)
00331          for(int i=1; i<=newbackground->GetNbinsX(); i++)
00332             newbackground->SetBinContent(i,oldbackground->GetBinContent(i)+generator->Gaus(0,oldbackground->GetBinError(i)));
00333       else
00334          for(int i=1; i<=newbackground->GetNbinsX(); i++)
00335             newbackground->SetBinContent(i,oldbackground->GetBinContent(i));
00336       newbackground->Scale(1 + berrf[channel]);
00337       newbackground->SetDirectory(0);
00338    }
00339    delete[] serrf;
00340    delete[] berrf;
00341    return 1;
00342 }
00343 
00344 TConfidenceLevel *TLimit::ComputeLimit(TH1* s, TH1* b, TH1* d,
00345                                        Int_t nmc, bool stat,
00346                                        TRandom * generator)
00347 {
00348    // Compute limit.
00349 
00350    TLimitDataSource* lds = new TLimitDataSource(s,b,d);
00351    TConfidenceLevel* out = ComputeLimit(lds,nmc,stat,generator);
00352    delete lds;
00353    return out;
00354 }
00355 
00356 TConfidenceLevel *TLimit::ComputeLimit(TH1* s, TH1* b, TH1* d,
00357                                        TVectorD* se, TVectorD* be, TObjArray* l,
00358                                        Int_t nmc, bool stat,
00359                                        TRandom * generator)
00360 {
00361    // Compute limit.
00362 
00363    TLimitDataSource* lds = new TLimitDataSource(s,b,d,se,be,l);
00364    TConfidenceLevel* out = ComputeLimit(lds,nmc,stat,generator);
00365    delete lds;
00366    return out;
00367 }
00368 
00369 TConfidenceLevel *TLimit::ComputeLimit(Double_t s, Double_t b, Int_t d,
00370                                        Int_t nmc,
00371                                        bool stat,
00372                                        TRandom * generator)
00373 {
00374    // Compute limit.
00375 
00376    TH1D* sh = new TH1D("__sh","__sh",1,0,2);
00377    sh->Fill(1,s);
00378    TH1D* bh = new TH1D("__bh","__bh",1,0,2);
00379    bh->Fill(1,b);
00380    TH1D* dh = new TH1D("__dh","__dh",1,0,2);
00381    dh->Fill(1,d);
00382    TLimitDataSource* lds = new TLimitDataSource(sh,bh,dh);
00383    TConfidenceLevel* out = ComputeLimit(lds,nmc,stat,generator);
00384    delete lds;
00385    delete sh;
00386    delete bh;
00387    delete dh;
00388    return out;
00389 }
00390 
00391 TConfidenceLevel *TLimit::ComputeLimit(Double_t s, Double_t b, Int_t d,
00392                                        TVectorD* se, TVectorD* be, TObjArray* l,
00393                                        Int_t nmc,
00394                                        bool stat,
00395                                        TRandom * generator)
00396 {
00397    // Compute limit.
00398 
00399    TH1D* sh = new TH1D("__sh","__sh",1,0,2);
00400    sh->Fill(1,s);
00401    TH1D* bh = new TH1D("__bh","__bh",1,0,2);
00402    bh->Fill(1,b);
00403    TH1D* dh = new TH1D("__dh","__dh",1,0,2);
00404    dh->Fill(1,d);
00405    TLimitDataSource* lds = new TLimitDataSource(sh,bh,dh,se,be,l);
00406    TConfidenceLevel* out = ComputeLimit(lds,nmc,stat,generator);
00407    delete lds;
00408    delete sh;
00409    delete bh;
00410    delete dh;
00411    return out;
00412 }
00413 
00414 Double_t TLimit::LogLikelihood(Double_t s, Double_t b, Double_t b2, Double_t d)
00415 {
00416    // Compute LogLikelihood (static function)
00417 
00418    return d*TMath::Log((s+b)/b2);
00419 }