00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026 #include "TMVA/ModulekNN.h"
00027
00028
00029 #include <assert.h>
00030 #include <iomanip>
00031 #include <iostream>
00032 #include <sstream>
00033 #include <algorithm>
00034
00035 #include "TMath.h"
00036
00037
00038 #include "TMVA/MsgLogger.h"
00039
00040
00041 TMVA::kNN::Event::Event()
00042 :fVar(),
00043 fWeight(-1.0),
00044 fType(-1)
00045 {
00046
00047 }
00048
00049
00050 TMVA::kNN::Event::Event(const VarVec &var, const Double_t weight, const Short_t type)
00051 :fVar(var),
00052 fWeight(weight),
00053 fType(type)
00054 {
00055
00056 }
00057
00058
00059 TMVA::kNN::Event::Event(const VarVec &var, const Double_t weight, const Short_t type, const VarVec &tvec)
00060 :fVar(var),
00061 fTgt(tvec),
00062 fWeight(weight),
00063 fType(type)
00064 {
00065
00066 }
00067
00068
00069 TMVA::kNN::Event::~Event()
00070 {
00071
00072 }
00073
00074
00075 TMVA::kNN::VarType TMVA::kNN::Event::GetDist(const Event &other) const
00076 {
00077
00078 const UInt_t nvar = GetNVar();
00079
00080 if (nvar != other.GetNVar()) {
00081 std::cerr << "Distance: two events have different dimensions" << std::endl;
00082 return -1.0;
00083 }
00084
00085 VarType sum = 0.0;
00086 for (UInt_t ivar = 0; ivar < nvar; ++ivar) {
00087 sum += GetDist(other.GetVar(ivar), ivar);
00088 }
00089
00090 return sum;
00091 }
00092
00093
00094 void TMVA::kNN::Event::SetTargets(const VarVec &tvec)
00095 {
00096 fTgt = tvec;
00097 }
00098
00099
00100 const TMVA::kNN::VarVec& TMVA::kNN::Event::GetTargets() const
00101 {
00102 return fTgt;
00103 }
00104
00105
00106 const TMVA::kNN::VarVec& TMVA::kNN::Event::GetVars() const
00107 {
00108 return fVar;
00109 }
00110
00111
00112 void TMVA::kNN::Event::Print() const
00113 {
00114
00115 Print(std::cout);
00116 }
00117
00118
00119 void TMVA::kNN::Event::Print(std::ostream& os) const
00120 {
00121
00122 Int_t dp = os.precision();
00123 os << "Event: ";
00124 for (UInt_t ivar = 0; ivar != GetNVar(); ++ivar) {
00125 if (ivar == 0) {
00126 os << "(";
00127 }
00128 else {
00129 os << ", ";
00130 }
00131
00132 os << std::setfill(' ') << std::setw(5) << std::setprecision(3) << GetVar(ivar);
00133 }
00134
00135 if (GetNVar() > 0) {
00136 os << ")";
00137 }
00138 else {
00139 os << " no variables";
00140 }
00141 os << std::setprecision(dp);
00142 }
00143
00144
00145 std::ostream& TMVA::kNN::operator<<(std::ostream& os, const TMVA::kNN::Event& event)
00146 {
00147
00148 event.Print(os);
00149 return os;
00150 }
00151
00152
00153
00154
00155
00156 TRandom3 TMVA::kNN::ModulekNN::fgRndm(1);
00157
00158
00159 TMVA::kNN::ModulekNN::ModulekNN()
00160 :fDimn(0),
00161 fTree(0),
00162 fLogger( new MsgLogger("ModulekNN") )
00163 {
00164
00165 }
00166
00167
00168 TMVA::kNN::ModulekNN::~ModulekNN()
00169 {
00170
00171 if (fTree) {
00172 delete fTree; fTree = 0;
00173 }
00174 delete fLogger;
00175 }
00176
00177
00178 void TMVA::kNN::ModulekNN::Clear()
00179 {
00180
00181 fDimn = 0;
00182
00183 if (fTree) {
00184 delete fTree;
00185 fTree = 0;
00186 }
00187
00188 fVarScale.clear();
00189 fCount.clear();
00190 fEvent.clear();
00191 fVar.clear();
00192 }
00193
00194
00195 void TMVA::kNN::ModulekNN::Add(const Event &event)
00196 {
00197
00198 if (fTree) {
00199 Log() << kFATAL << "<Add> Cannot add event: tree is already built" << Endl;
00200 return;
00201 }
00202
00203 if (fDimn < 1) {
00204 fDimn = event.GetNVar();
00205 }
00206 else if (fDimn != event.GetNVar()) {
00207 Log() << kFATAL << "ModulekNN::Add() - number of dimension does not match previous events" << Endl;
00208 return;
00209 }
00210
00211 fEvent.push_back(event);
00212
00213 for (UInt_t ivar = 0; ivar < fDimn; ++ivar) {
00214 fVar[ivar].push_back(event.GetVar(ivar));
00215 }
00216
00217 std::map<Short_t, UInt_t>::iterator cit = fCount.find(event.GetType());
00218 if (cit == fCount.end()) {
00219 fCount[event.GetType()] = 1;
00220 }
00221 else {
00222 ++(cit->second);
00223 }
00224 }
00225
00226
00227 Bool_t TMVA::kNN::ModulekNN::Fill(const UShort_t odepth, const UInt_t ifrac, const std::string &option)
00228 {
00229
00230 if (fTree) {
00231 Log() << kFATAL << "ModulekNN::Fill - tree has already been created" << Endl;
00232 return kFALSE;
00233 }
00234
00235
00236
00237 UInt_t min = 0;
00238 if (option.find("trim") != std::string::npos) {
00239 for (std::map<Short_t, UInt_t>::const_iterator it = fCount.begin(); it != fCount.end(); ++it) {
00240 if (min == 0 || min > it->second) {
00241 min = it->second;
00242 }
00243 }
00244
00245 Log() << kINFO << "<Fill> Will trim all event types to " << min << " events" << Endl;
00246
00247 fCount.clear();
00248 fVar.clear();
00249
00250 EventVec evec;
00251
00252 for (EventVec::const_iterator event = fEvent.begin(); event != fEvent.end(); ++event) {
00253 std::map<Short_t, UInt_t>::iterator cit = fCount.find(event->GetType());
00254 if (cit == fCount.end()) {
00255 fCount[event->GetType()] = 1;
00256 }
00257 else if (cit->second < min) {
00258 ++(cit->second);
00259 }
00260 else {
00261 continue;
00262 }
00263
00264 for (UInt_t d = 0; d < fDimn; ++d) {
00265 fVar[d].push_back(event->GetVar(d));
00266 }
00267
00268 evec.push_back(*event);
00269 }
00270
00271 Log() << kINFO << "<Fill> Erased " << fEvent.size() - evec.size() << " events" << Endl;
00272
00273 fEvent = evec;
00274 }
00275
00276
00277 fCount.clear();
00278
00279
00280 for (VarMap::iterator it = fVar.begin(); it != fVar.end(); ++it) {
00281 std::sort((it->second).begin(), (it->second).end());
00282 }
00283
00284 if (option.find("metric") != std::string::npos && ifrac > 0) {
00285 ComputeMetric(ifrac);
00286
00287
00288
00289 for (VarMap::iterator it = fVar.begin(); it != fVar.end(); ++it) {
00290 std::sort((it->second).begin(), (it->second).end());
00291 }
00292 }
00293
00294
00295
00296
00297
00298 fTree = Optimize(odepth);
00299
00300 if (!fTree) {
00301 Log() << kFATAL << "ModulekNN::Fill() - failed to create tree" << Endl;
00302 return kFALSE;
00303 }
00304
00305 for (EventVec::const_iterator event = fEvent.begin(); event != fEvent.end(); ++event) {
00306 fTree->Add(*event, 0);
00307
00308 std::map<Short_t, UInt_t>::iterator cit = fCount.find(event->GetType());
00309 if (cit == fCount.end()) {
00310 fCount[event->GetType()] = 1;
00311 }
00312 else {
00313 ++(cit->second);
00314 }
00315 }
00316
00317 for (std::map<Short_t, UInt_t>::const_iterator it = fCount.begin(); it != fCount.end(); ++it) {
00318 Log() << kINFO << "<Fill> Class " << it->first << " has " << std::setw(8)
00319 << it->second << " events" << Endl;
00320 }
00321
00322 return kTRUE;
00323 }
00324
00325
00326 Bool_t TMVA::kNN::ModulekNN::Find(Event event, const UInt_t nfind, const std::string &option) const
00327 {
00328
00329
00330
00331
00332
00333 if (!fTree) {
00334 Log() << kFATAL << "ModulekNN::Find() - tree has not been filled" << Endl;
00335 return kFALSE;
00336 }
00337 if (fDimn != event.GetNVar()) {
00338 Log() << kFATAL << "ModulekNN::Find() - number of dimension does not match training events" << Endl;
00339 return kFALSE;
00340 }
00341 if (nfind < 1) {
00342 Log() << kFATAL << "ModulekNN::Find() - requested 0 nearest neighbors" << Endl;
00343 return kFALSE;
00344 }
00345
00346
00347
00348 if (!fVarScale.empty()) {
00349 event = Scale(event);
00350 }
00351
00352
00353 fkNNEvent = event;
00354 fkNNList.clear();
00355
00356 if(option.find("weight") != std::string::npos)
00357 {
00358
00359
00360
00361 kNN::Find<kNN::Event>(fkNNList, fTree, event, Double_t(nfind), 0.0);
00362 }
00363 else
00364 {
00365
00366
00367 kNN::Find<kNN::Event>(fkNNList, fTree, event, nfind);
00368 }
00369
00370 return kTRUE;
00371 }
00372
00373
00374 Bool_t TMVA::kNN::ModulekNN::Find(const UInt_t nfind, const std::string &option) const
00375 {
00376
00377 if (fCount.empty() || !fTree) {
00378 return kFALSE;
00379 }
00380
00381 static std::map<Short_t, UInt_t>::const_iterator cit = fCount.end();
00382
00383 if (cit == fCount.end()) {
00384 cit = fCount.begin();
00385 }
00386
00387 const Short_t etype = (cit++)->first;
00388
00389 if (option == "flat") {
00390 VarVec dvec;
00391 for (UInt_t d = 0; d < fDimn; ++d) {
00392 VarMap::const_iterator vit = fVar.find(d);
00393 if (vit == fVar.end()) {
00394 return kFALSE;
00395 }
00396
00397 const std::vector<Double_t> &vvec = vit->second;
00398
00399 if (vvec.empty()) {
00400 return kFALSE;
00401 }
00402
00403
00404 const VarType min = vvec.front();
00405 const VarType max = vvec.back();
00406 const VarType width = max - min;
00407
00408 if (width < 0.0 || width > 0.0) {
00409 dvec.push_back(min + width*fgRndm.Rndm());
00410 }
00411 else {
00412 return kFALSE;
00413 }
00414 }
00415
00416 const Event event(dvec, 1.0, etype);
00417
00418 Find(event, nfind);
00419
00420 return kTRUE;
00421 }
00422
00423 return kFALSE;
00424 }
00425
00426
00427 TMVA::kNN::Node<TMVA::kNN::Event>* TMVA::kNN::ModulekNN::Optimize(const UInt_t odepth)
00428 {
00429
00430
00431
00432
00433 if (fVar.empty() || fDimn != fVar.size()) {
00434 Log() << kWARNING << "<Optimize> Cannot build a tree" << Endl;
00435 return 0;
00436 }
00437
00438 const UInt_t size = (fVar.begin()->second).size();
00439 if (size < 1) {
00440 Log() << kWARNING << "<Optimize> Cannot build a tree without events" << Endl;
00441 return 0;
00442 }
00443
00444 VarMap::const_iterator it = fVar.begin();
00445 for (; it != fVar.end(); ++it) {
00446 if ((it->second).size() != size) {
00447 Log() << kWARNING << "<Optimize> # of variables doesn't match between dimensions" << Endl;
00448 return 0;
00449 }
00450 }
00451
00452 if (double(fDimn*size) < TMath::Power(2.0, double(odepth))) {
00453 Log() << kWARNING << "<Optimize> Optimization depth exceeds number of events" << Endl;
00454 return 0;
00455 }
00456
00457 Log() << kINFO << "Optimizing tree for " << fDimn << " variables with " << size << " values" << Endl;
00458
00459 std::vector<Node<Event> *> pvec, cvec;
00460
00461 it = fVar.find(0);
00462 if (it == fVar.end() || (it->second).size() < 2) {
00463 Log() << kWARNING << "<Optimize> Missing 0 variable" << Endl;
00464 return 0;
00465 }
00466
00467 const Event pevent(VarVec(fDimn, (it->second)[size/2]), -1.0, -1);
00468
00469 Node<Event> *tree = new Node<Event>(0, pevent, 0);
00470
00471 pvec.push_back(tree);
00472
00473 for (UInt_t depth = 1; depth < odepth; ++depth) {
00474 const UInt_t mod = depth % fDimn;
00475
00476 VarMap::const_iterator vit = fVar.find(mod);
00477 if (vit == fVar.end()) {
00478 Log() << kFATAL << "Missing " << mod << " variable" << Endl;
00479 return 0;
00480 }
00481 const std::vector<Double_t> &dvec = vit->second;
00482
00483 if (dvec.size() < 2) {
00484 Log() << kFATAL << "Missing " << mod << " variable" << Endl;
00485 return 0;
00486 }
00487
00488 UInt_t ichild = 1;
00489 for (std::vector<Node<Event> *>::iterator pit = pvec.begin(); pit != pvec.end(); ++pit) {
00490 Node<Event> *parent = *pit;
00491
00492 const VarType lmedian = dvec[size*ichild/(2*pvec.size() + 1)];
00493 ++ichild;
00494
00495 const VarType rmedian = dvec[size*ichild/(2*pvec.size() + 1)];
00496 ++ichild;
00497
00498 const Event levent(VarVec(fDimn, lmedian), -1.0, -1);
00499 const Event revent(VarVec(fDimn, rmedian), -1.0, -1);
00500
00501 Node<Event> *lchild = new Node<Event>(parent, levent, mod);
00502 Node<Event> *rchild = new Node<Event>(parent, revent, mod);
00503
00504 parent->SetNodeL(lchild);
00505 parent->SetNodeR(rchild);
00506
00507 cvec.push_back(lchild);
00508 cvec.push_back(rchild);
00509 }
00510
00511 pvec = cvec;
00512 cvec.clear();
00513 }
00514
00515 return tree;
00516 }
00517
00518
00519 void TMVA::kNN::ModulekNN::ComputeMetric(const UInt_t ifrac)
00520 {
00521
00522
00523
00524
00525
00526 if (ifrac == 0) {
00527 return;
00528 }
00529 if (ifrac > 100) {
00530 Log() << kFATAL << "ModulekNN::ComputeMetric - fraction can not exceed 100%" << Endl;
00531 return;
00532 }
00533 if (!fVarScale.empty()) {
00534 Log() << kFATAL << "ModulekNN::ComputeMetric - metric is already computed" << Endl;
00535 return;
00536 }
00537 if (fEvent.size() < 100) {
00538 Log() << kFATAL << "ModulekNN::ComputeMetric - number of events is too small" << Endl;
00539 return;
00540 }
00541
00542 const UInt_t lfrac = (100 - ifrac)/2;
00543 const UInt_t rfrac = 100 - (100 - ifrac)/2;
00544
00545 Log() << kINFO << "Computing scale factor for 1d distributions: "
00546 << "(ifrac, bottom, top) = (" << ifrac << "%, " << lfrac << "%, " << rfrac << "%)" << Endl;
00547
00548 fVarScale.clear();
00549
00550 for (VarMap::const_iterator vit = fVar.begin(); vit != fVar.end(); ++vit) {
00551 const std::vector<Double_t> &dvec = vit->second;
00552
00553 std::vector<Double_t>::const_iterator beg_it = dvec.end();
00554 std::vector<Double_t>::const_iterator end_it = dvec.end();
00555
00556 Int_t dist = 0;
00557 for (std::vector<Double_t>::const_iterator dit = dvec.begin(); dit != dvec.end(); ++dit, ++dist) {
00558
00559 if ((100*dist)/dvec.size() == lfrac && beg_it == dvec.end()) {
00560 beg_it = dit;
00561 }
00562
00563 if ((100*dist)/dvec.size() == rfrac && end_it == dvec.end()) {
00564 end_it = dit;
00565 }
00566 }
00567
00568 if (beg_it == dvec.end() || end_it == dvec.end()) {
00569 beg_it = dvec.begin();
00570 end_it = dvec.end();
00571
00572 assert(beg_it != end_it && "Empty vector");
00573
00574 --end_it;
00575 }
00576
00577 const Double_t lpos = *beg_it;
00578 const Double_t rpos = *end_it;
00579
00580 if (!(lpos < rpos)) {
00581 Log() << kFATAL << "ModulekNN::ComputeMetric() - min value is greater than max value" << Endl;
00582 continue;
00583 }
00584
00585
00586
00587
00588
00589
00590
00591
00592 fVarScale[vit->first] = rpos - lpos;
00593 }
00594
00595 fVar.clear();
00596
00597 for (UInt_t ievent = 0; ievent < fEvent.size(); ++ievent) {
00598 fEvent[ievent] = Scale(fEvent[ievent]);
00599
00600 for (UInt_t ivar = 0; ivar < fDimn; ++ivar) {
00601 fVar[ivar].push_back(fEvent[ievent].GetVar(ivar));
00602 }
00603 }
00604 }
00605
00606
00607 const TMVA::kNN::Event TMVA::kNN::ModulekNN::Scale(const Event &event) const
00608 {
00609
00610
00611
00612 if (fVarScale.empty()) {
00613 return event;
00614 }
00615
00616 if (event.GetNVar() != fVarScale.size()) {
00617 Log() << kFATAL << "ModulekNN::Scale() - mismatched metric and event size" << Endl;
00618 return event;
00619 }
00620
00621 VarVec vvec(event.GetNVar(), 0.0);
00622
00623 for (UInt_t ivar = 0; ivar < event.GetNVar(); ++ivar) {
00624 std::map<int, Double_t>::const_iterator fit = fVarScale.find(ivar);
00625 if (fit == fVarScale.end()) {
00626 Log() << kFATAL << "ModulekNN::Scale() - failed to find scale for " << ivar << Endl;
00627 continue;
00628 }
00629
00630 if (fit->second > 0.0) {
00631 vvec[ivar] = event.GetVar(ivar)/fit->second;
00632 }
00633 else {
00634 Log() << kFATAL << "Variable " << ivar << " has zero width" << Endl;
00635 }
00636 }
00637
00638 return Event(vvec, event.GetWeight(), event.GetType(), event.GetTargets());
00639 }
00640
00641
00642 void TMVA::kNN::ModulekNN::Print() const
00643 {
00644
00645 Print(std::cout);
00646 }
00647
00648
00649 void TMVA::kNN::ModulekNN::Print(ostream &os) const
00650 {
00651
00652 os << "----------------------------------------------------------------------"<< std::endl;
00653 os << "Printing knn result" << std::endl;
00654 os << fkNNEvent << std::endl;
00655
00656 UInt_t count = 0;
00657
00658 std::map<Short_t, Double_t> min, max;
00659
00660 os << "Printing " << fkNNList.size() << " nearest neighbors" << std::endl;
00661 for (List::const_iterator it = fkNNList.begin(); it != fkNNList.end(); ++it) {
00662 os << ++count << ": " << it->second << ": " << it->first->GetEvent() << std::endl;
00663
00664 const Event &event = it->first->GetEvent();
00665 for (UShort_t ivar = 0; ivar < event.GetNVar(); ++ivar) {
00666 if (min.find(ivar) == min.end()) {
00667 min[ivar] = event.GetVar(ivar);
00668 }
00669 else if (min[ivar] > event.GetVar(ivar)) {
00670 min[ivar] = event.GetVar(ivar);
00671 }
00672
00673 if (max.find(ivar) == max.end()) {
00674 max[ivar] = event.GetVar(ivar);
00675 }
00676 else if (max[ivar] < event.GetVar(ivar)) {
00677 max[ivar] = event.GetVar(ivar);
00678 }
00679 }
00680 }
00681
00682 if (min.size() == max.size()) {
00683 for (std::map<Short_t, Double_t>::const_iterator mit = min.begin(); mit != min.end(); ++mit) {
00684 const Short_t i = mit->first;
00685 Log() << kINFO << "(var, min, max) = (" << i << "," << min[i] << ", " << max[i] << ")" << Endl;
00686 }
00687 }
00688
00689 os << "----------------------------------------------------------------------" << std::endl;
00690 }