TPRegexp.cxx

Go to the documentation of this file.
00001 // @(#)root/base:$Id: TPRegexp.cxx 37096 2010-11-30 12:21:36Z rdm $
00002 // Author: Eddy Offermann   24/06/05
00003 
00004 /*************************************************************************
00005  * Copyright (C) 1995-2005, Rene Brun and Fons Rademakers.               *
00006  * All rights reserved.                                                  *
00007  *                                                                       *
00008  * For the licensing terms see $ROOTSYS/LICENSE.                         *
00009  * For the list of contributors see $ROOTSYS/README/CREDITS.             *
00010  *************************************************************************/
00011 
00012 //////////////////////////////////////////////////////////////////////////
00013 //                                                                      //
00014 // TPRegexp                                                             //
00015 //                                                                      //
00016 // C++ Wrapper for the "Perl Compatible Regular Expressions" library    //
00017 //  The PCRE lib can be found at:                                       //
00018 //              http://www.pcre.org/                                    //
00019 //                                                                      //
00020 // Extensive documentation about Regular expressions in Perl can be     //
00021 // found at :                                                           //
00022 //              http://perldoc.perl.org/perlre.html                     //
00023 //                                                                      //
00024 //////////////////////////////////////////////////////////////////////////
00025 
00026 #include "Riostream.h"
00027 #include "TPRegexp.h"
00028 #include "TObjArray.h"
00029 #include "TObjString.h"
00030 #include "TError.h"
00031 
00032 #include <pcre.h>
00033 
00034 #include <vector>
00035 
00036 struct PCREPriv_t {
00037    pcre       *fPCRE;
00038    pcre_extra *fPCREExtra;
00039 
00040    PCREPriv_t() { fPCRE = 0; fPCREExtra = 0; }
00041 };
00042 
00043 
00044 ClassImp(TPRegexp)
00045 
00046 //______________________________________________________________________________
00047 TPRegexp::TPRegexp()
00048 {
00049    // Default ctor.
00050 
00051    fPriv     = new PCREPriv_t;
00052    fPCREOpts = 0;
00053 }
00054 
00055 //______________________________________________________________________________
00056 TPRegexp::TPRegexp(const TString &pat)
00057 {
00058    // Create and initialize with pat.
00059 
00060    fPattern  = pat;
00061    fPriv     = new PCREPriv_t;
00062    fPCREOpts = 0;
00063 }
00064 
00065 //______________________________________________________________________________
00066 TPRegexp::TPRegexp(const TPRegexp &p)
00067 {
00068    // Copy ctor.
00069 
00070    fPattern  = p.fPattern;
00071    fPriv     = new PCREPriv_t;
00072    fPCREOpts = p.fPCREOpts;
00073 }
00074 
00075 //______________________________________________________________________________
00076 TPRegexp::~TPRegexp()
00077 {
00078    // Cleanup.
00079 
00080    if (fPriv->fPCRE)
00081       pcre_free(fPriv->fPCRE);
00082    if (fPriv->fPCREExtra)
00083       pcre_free(fPriv->fPCREExtra);
00084    delete fPriv;
00085 }
00086 
00087 //______________________________________________________________________________
00088 TPRegexp &TPRegexp::operator=(const TPRegexp &p)
00089 {
00090    // Assignement operator.
00091 
00092    if (this != &p) {
00093       fPattern = p.fPattern;
00094       if (fPriv->fPCRE)
00095          pcre_free(fPriv->fPCRE);
00096       fPriv->fPCRE = 0;
00097       if (fPriv->fPCREExtra)
00098          pcre_free(fPriv->fPCREExtra);
00099       fPriv->fPCREExtra = 0;
00100       fPCREOpts  = p.fPCREOpts;
00101    }
00102    return *this;
00103 }
00104 
00105 //______________________________________________________________________________
00106 UInt_t TPRegexp::ParseMods(const TString &modStr) const
00107 {
00108    // Translate Perl modifier flags into pcre flags.
00109    // The supported modStr characters are: g, i, m, o, s, x, and the
00110    // special d for debug. The meaning of the letters is:
00111    // - m
00112    //   Treat string as multiple lines. That is, change "^" and "$" from
00113    //   matching the start or end of the string to matching the start or
00114    //   end of any line anywhere within the string.
00115    // - s
00116    //   Treat string as single line. That is, change "." to match any
00117    //   character whatsoever, even a newline, which normally it would not match.
00118    //   Used together, as /ms, they let the "." match any character whatsoever,
00119    //   while still allowing "^" and "$" to match, respectively, just after and
00120    //   just before newlines within the string.
00121    // - i
00122    //   Do case-insensitive pattern matching.
00123    // - x
00124    //   Extend your pattern's legibility by permitting whitespace and comments.
00125    // - p
00126    //   Preserve the string matched such that ${^PREMATCH}, ${^MATCH},
00127    //   and ${^POSTMATCH} are available for use after matching.
00128    // - g and c
00129    //   Global matching, and keep the Current position after failed matching.
00130    //   Unlike i, m, s and x, these two flags affect the way the regex is used
00131    //   rather than the regex itself. See Using regular expressions in Perl in
00132    //   perlretut for further explanation of the g and c modifiers.
00133    // For more detail see: http://perldoc.perl.org/perlre.html#Modifiers.
00134 
00135    UInt_t opts = 0;
00136 
00137    if (modStr.Length() <= 0)
00138       return fPCREOpts;
00139 
00140    //translate perl flags into pcre flags
00141    const char *m = modStr;
00142    while (*m) {
00143       switch (*m) {
00144          case 'g':
00145             opts |= kPCRE_GLOBAL;
00146             break;
00147          case 'i':
00148             opts |= PCRE_CASELESS;
00149             break;
00150          case 'm':
00151             opts |= PCRE_MULTILINE;
00152             break;
00153          case 'o':
00154             opts |= kPCRE_OPTIMIZE;
00155             break;
00156          case 's':
00157             opts |= PCRE_DOTALL;
00158             break;
00159          case 'x':
00160             opts |= PCRE_EXTENDED;
00161             break;
00162          case 'd': // special flag to enable debug printing (not Perl compat.)
00163             opts |= kPCRE_DEBUG_MSGS;
00164             break;
00165          default:
00166             Error("ParseMods", "illegal pattern modifier: %c", *m);
00167             opts = 0;
00168       }
00169       ++m;
00170    }
00171    return opts;
00172 }
00173 
00174 //______________________________________________________________________________
00175 TString TPRegexp::GetModifiers() const
00176 {
00177    // Return PCRE modifier options as string.
00178    // For meaning of mods see ParseMods().
00179 
00180    TString ret;
00181 
00182    if (fPCREOpts & kPCRE_GLOBAL)     ret += 'g';
00183    if (fPCREOpts & PCRE_CASELESS)    ret += 'i';
00184    if (fPCREOpts & PCRE_MULTILINE)   ret += 'm';
00185    if (fPCREOpts & PCRE_DOTALL)      ret += 's';
00186    if (fPCREOpts & PCRE_EXTENDED)    ret += 'x';
00187    if (fPCREOpts & kPCRE_OPTIMIZE)   ret += 'o';
00188    if (fPCREOpts & kPCRE_DEBUG_MSGS) ret += 'd';
00189 
00190    return ret;
00191 }
00192 
00193 //______________________________________________________________________________
00194 void TPRegexp::Compile()
00195 {
00196    // Compile the fPattern.
00197 
00198    if (fPriv->fPCRE)
00199       pcre_free(fPriv->fPCRE);
00200 
00201    if (fPCREOpts & kPCRE_DEBUG_MSGS)
00202       Info("Compile", "PREGEX compiling %s", fPattern.Data());
00203 
00204    const char *errstr;
00205    Int_t patIndex;
00206    fPriv->fPCRE = pcre_compile(fPattern.Data(), fPCREOpts & kPCRE_INTMASK,
00207                                &errstr, &patIndex, 0);
00208 
00209    if (!fPriv->fPCRE) {
00210       Error("Compile", "compilation of TPRegexp(%s) failed at: %d because %s",
00211             fPattern.Data(), patIndex, errstr);
00212    }
00213 
00214    if (fPriv->fPCREExtra || (fPCREOpts & kPCRE_OPTIMIZE))
00215       Optimize();
00216 }
00217 
00218 //______________________________________________________________________________
00219 void TPRegexp::Optimize()
00220 {
00221    // Send the pattern through the optimizer.
00222 
00223    if (fPriv->fPCREExtra)
00224       pcre_free(fPriv->fPCREExtra);
00225 
00226    if (fPCREOpts & kPCRE_DEBUG_MSGS)
00227       Info("Optimize", "PREGEX studying %s", fPattern.Data());
00228 
00229    const char *errstr;
00230    // pcre_study allows less options - see pcre_internal.h PUBLIC_STUDY_OPTIONS.
00231    fPriv->fPCREExtra = pcre_study(fPriv->fPCRE, 0, &errstr);
00232 
00233    if (!fPriv->fPCREExtra && errstr) {
00234       Error("Optimize", "Optimization of TPRegexp(%s) failed: %s",
00235             fPattern.Data(), errstr);
00236    }
00237 }
00238 
00239 //______________________________________________________________________________
00240 Int_t TPRegexp::ReplaceSubs(const TString &s, TString &final,
00241                             const TString &replacePattern,
00242                             Int_t *offVec, Int_t nrMatch) const
00243 {
00244    // Returns the number of expanded '$' constructs.
00245 
00246    Int_t nrSubs = 0;
00247    const char *p = replacePattern;
00248 
00249    Int_t state = 0;
00250    Int_t subnum = 0;
00251    while (state != -1) {
00252       switch (state) {
00253          case 0:
00254             if (!*p) {
00255                state = -1;
00256                break;
00257             }
00258             if (*p == '$') {
00259                state = 1;
00260                subnum = 0;
00261                if (p[1] == '&') {
00262                   p++;
00263                   if (isdigit(p[1]))
00264                      p++;
00265                } else if (!isdigit(p[1])) {
00266                   Error("ReplaceSubs", "badly formed replacement pattern: %s",
00267                         replacePattern.Data());
00268                }
00269             } else
00270                final += *p;
00271             break;
00272          case 1:
00273             if (isdigit(*p)) {
00274                subnum *= 10;
00275                subnum += (*p)-'0';
00276             } else {
00277                if (fPCREOpts & kPCRE_DEBUG_MSGS)
00278                   Info("ReplaceSubs", "PREGEX appending substr #%d", subnum);
00279                if (subnum < 0 || subnum > nrMatch-1) {
00280                   Error("ReplaceSubs","bad string number: %d",subnum);
00281                } else {
00282                   const TString subStr = s(offVec[2*subnum],offVec[2*subnum+1]-offVec[2*subnum]);
00283                   final += subStr;
00284                   nrSubs++;
00285                }
00286                state = 0;
00287                continue;  // send char to start state
00288             }
00289       }
00290       p++;
00291    }
00292    return nrSubs;
00293 }
00294 
00295 //______________________________________________________________________________
00296 Int_t TPRegexp::MatchInternal(const TString &s, Int_t start,
00297                               Int_t nMaxMatch, TArrayI *pos)
00298 {
00299    // Perform the actual matching - protected method.
00300 
00301    Int_t *offVec = new Int_t[3*nMaxMatch];
00302    // pcre_exec allows less options - see pcre_internal.h PUBLIC_EXEC_OPTIONS.
00303    Int_t nrMatch = pcre_exec(fPriv->fPCRE, fPriv->fPCREExtra, s.Data(),
00304                              s.Length(), start, 0,
00305                              offVec, 3*nMaxMatch);
00306 
00307    if (nrMatch == PCRE_ERROR_NOMATCH)
00308       nrMatch = 0;
00309    else if (nrMatch <= 0) {
00310       Error("Match","pcre_exec error = %d", nrMatch);
00311       delete [] offVec;
00312       return 0;
00313    }
00314 
00315    if (pos)
00316       pos->Set(2*nrMatch, offVec);
00317    delete [] offVec;
00318 
00319    return nrMatch;
00320 }
00321 
00322 //______________________________________________________________________________
00323 Int_t TPRegexp::Match(const TString &s, const TString &mods, Int_t start,
00324                       Int_t nMaxMatch, TArrayI *pos)
00325 {
00326    // The number of matches is returned, this equals the full match +
00327    // sub-pattern matches.
00328    // nMaxMatch is the maximum allowed number of matches.
00329    // pos contains the string indices of the matches. Its usage is
00330    // shown in the routine MatchS.
00331    // For meaning of mods see ParseMods().
00332 
00333    UInt_t opts = ParseMods(mods);
00334 
00335    if (!fPriv->fPCRE || opts != fPCREOpts) {
00336       fPCREOpts = opts;
00337       Compile();
00338    }
00339 
00340    return MatchInternal(s, start, nMaxMatch, pos);
00341 }
00342 
00343 
00344 //______________________________________________________________________________
00345 TObjArray *TPRegexp::MatchS(const TString &s, const TString &mods,
00346                             Int_t start, Int_t nMaxMatch)
00347 {
00348    // Returns a TObjArray of matched substrings as TObjString's.
00349    // The TObjArray is owner of the objects. The first entry is the full
00350    // matched pattern, followed by the subpatterns.
00351    // If a pattern was not matched, it will return an empty substring:
00352    //
00353    // TObjArray *subStrL = TPRegexp("(a|(z))(bc)").MatchS("abc");
00354    // for (Int_t i = 0; i < subStrL->GetLast()+1; i++) {
00355    //    const TString subStr = ((TObjString *)subStrL->At(i))->GetString();
00356    //    cout << "\"" << subStr << "\" ";
00357    // }
00358    // cout << subStr << endl;
00359    //
00360    // produces:  "abc" "a" "" "bc"
00361    // For meaning of mods see ParseMods().
00362 
00363    TArrayI pos;
00364    Int_t nrMatch = Match(s, mods, start, nMaxMatch, &pos);
00365 
00366    TObjArray *subStrL = new TObjArray();
00367    subStrL->SetOwner();
00368 
00369    for (Int_t i = 0; i < nrMatch; i++) {
00370       Int_t startp = pos[2*i];
00371       Int_t stopp  = pos[2*i+1];
00372       if (startp >= 0 && stopp >= 0) {
00373          const TString subStr = s(pos[2*i], pos[2*i+1]-pos[2*i]);
00374          subStrL->Add(new TObjString(subStr));
00375       } else
00376          subStrL->Add(new TObjString());
00377    }
00378 
00379    return subStrL;
00380 }
00381 
00382 //______________________________________________________________________________
00383 Int_t TPRegexp::SubstituteInternal(TString &s, const TString &replacePattern,
00384                                    Int_t start, Int_t nMaxMatch,
00385                                    Bool_t doDollarSubst)
00386 {
00387    // Perform pattern substitution with optional back-ref replacement
00388    // - protected method.
00389 
00390    Int_t *offVec = new Int_t[3*nMaxMatch];
00391 
00392    TString final;
00393    Int_t nrSubs = 0;
00394    Int_t offset = start;
00395    Int_t last = 0;
00396 
00397    while (kTRUE) {
00398 
00399       // find next matching subs
00400       // pcre_exec allows less options - see pcre_internal.h PUBLIC_EXEC_OPTIONS.
00401       Int_t nrMatch = pcre_exec(fPriv->fPCRE, fPriv->fPCREExtra, s.Data(),
00402                                 s.Length(), offset, 0,
00403                                 offVec, 3*nMaxMatch);
00404 
00405       if (nrMatch == PCRE_ERROR_NOMATCH) {
00406          nrMatch = 0;
00407          break;
00408       } else if (nrMatch <= 0) {
00409          Error("Substitute", "pcre_exec error = %d", nrMatch);
00410          break;
00411       }
00412 
00413       // append anything previously unmatched, but not substituted
00414       if (last <= offVec[0]) {
00415          final += s(last,offVec[0]-last);
00416          last = offVec[1];
00417       }
00418 
00419       // replace stuff in s
00420       if (doDollarSubst) {
00421          ReplaceSubs(s, final, replacePattern, offVec, nrMatch);
00422       } else {
00423          final += replacePattern;
00424       }
00425       ++nrSubs;
00426 
00427       // if global gotta check match at every pos
00428       if (!(fPCREOpts & kPCRE_GLOBAL))
00429          break;
00430 
00431       if (offVec[0] != offVec[1])
00432          offset = offVec[1];
00433       else {
00434          // matched empty string
00435          if (offVec[1] == s.Length())
00436          break;
00437          offset = offVec[1]+1;
00438       }
00439    }
00440 
00441    delete [] offVec;
00442 
00443    final += s(last,s.Length()-last);
00444    s = final;
00445 
00446    return nrSubs;
00447 }
00448 
00449 //______________________________________________________________________________
00450 Int_t TPRegexp::Substitute(TString &s, const TString &replacePattern,
00451                            const TString &mods, Int_t start, Int_t nMaxMatch)
00452 {
00453    // Substitute replaces the string s by a new string in which matching
00454    // patterns are replaced by the replacePattern string. The number of
00455    // substitutions are returned.
00456    //
00457    // TString s("aap noot mies");
00458    // const Int_t nrSub = TPRegexp("(\\w*) noot (\\w*)").Substitute(s,"$2 noot $1");
00459    // cout << nrSub << " \"" << s << "\"" <<endl;
00460    //
00461    // produces: 2 "mies noot aap"
00462    // For meaning of mods see ParseMods().
00463 
00464    UInt_t opts = ParseMods(mods);
00465 
00466    if (!fPriv->fPCRE || opts != fPCREOpts) {
00467       fPCREOpts = opts;
00468       Compile();
00469    }
00470 
00471    return SubstituteInternal(s, replacePattern, start, nMaxMatch, kTRUE);
00472 }
00473 
00474 
00475 //////////////////////////////////////////////////////////////////////////
00476 //                                                                      //
00477 // TString member functions, put here so the linker will include        //
00478 // them only if regular expressions are used.                           //
00479 //                                                                      //
00480 //////////////////////////////////////////////////////////////////////////
00481 
00482 //______________________________________________________________________________
00483 Ssiz_t TString::Index(TPRegexp& r, Ssiz_t start) const
00484 {
00485    // Find the first occurance of the regexp in string and return the position.
00486    // Start is the offset at which the search should start.
00487 
00488    TArrayI pos;
00489    Int_t nrMatch = r.Match(*this,"",start,10,&pos);
00490    if (nrMatch > 0)
00491       return pos[0];
00492    else
00493       return -1;
00494 }
00495 
00496 //______________________________________________________________________________
00497 Ssiz_t TString::Index(TPRegexp& r, Ssiz_t* extent, Ssiz_t start) const
00498 {
00499    // Find the first occurance of the regexp in string and return the position.
00500    // Extent is length of the matched string and start is the offset at which
00501    // the matching should start.
00502 
00503    TArrayI pos;
00504    const Int_t nrMatch = r.Match(*this,"",start,10,&pos);
00505    if (nrMatch > 0) {
00506       *extent = pos[1]-pos[0];
00507       return pos[0];
00508    } else {
00509       *extent = 0;
00510       return -1;
00511    }
00512 }
00513 
00514 //______________________________________________________________________________
00515 TSubString TString::operator()(TPRegexp& r, Ssiz_t start) const
00516 {
00517    // Return the substring found by applying the regexp starting at start.
00518 
00519    Ssiz_t len;
00520    Ssiz_t begin = Index(r, &len, start);
00521    return TSubString(*this, begin, len);
00522 }
00523 
00524 //______________________________________________________________________________
00525 TSubString TString::operator()(TPRegexp& r) const
00526 {
00527    // Return the substring found by applying the regexp.
00528 
00529    return (*this)(r, 0);
00530 }
00531 
00532 
00533 //////////////////////////////////////////////////////////////////////////
00534 // TPMERegexp
00535 //////////////////////////////////////////////////////////////////////////
00536 
00537 //______________________________________________________________________________
00538 //
00539 // Wrapper for PCRE library (Perl Compatible Regular Expressions).
00540 // Based on PME - PCRE Made Easy by Zachary Hansen.
00541 //
00542 // Supports main Perl operations using regular expressions (Match,
00543 // Substitute and Split). To retrieve the results one can simply use
00544 // operator[] returning a TString.
00545 //
00546 // See $ROOTSYS/tutorials/regexp_pme.C for examples.
00547 
00548 ClassImp(TPMERegexp);
00549 
00550 //______________________________________________________________________________
00551 TPMERegexp::TPMERegexp() :
00552    TPRegexp(),
00553    fNMaxMatches(10),
00554    fNMatches(0),
00555    fAddressOfLastString(0),
00556    fLastGlobalPosition(0)
00557 {
00558    // Default constructor. This regexp will match an empty string.
00559 
00560    Compile();
00561 }
00562 
00563 //______________________________________________________________________________
00564 TPMERegexp::TPMERegexp(const TString& s, const TString& opts, Int_t nMatchMax) :
00565    TPRegexp(s),
00566    fNMaxMatches(nMatchMax),
00567    fNMatches(0),
00568    fAddressOfLastString(0),
00569    fLastGlobalPosition(0)
00570 {
00571    // Constructor:
00572    //  s    - string to compile into regular expression
00573    //  opts - perl-style character flags to be set on TPME object
00574 
00575    fPCREOpts = ParseMods(opts);
00576    Compile();
00577 }
00578 
00579 //______________________________________________________________________________
00580 TPMERegexp::TPMERegexp(const TString& s, UInt_t opts, Int_t nMatchMax) :
00581    TPRegexp(s),
00582    fNMaxMatches(nMatchMax),
00583    fNMatches(0),
00584    fAddressOfLastString(0),
00585    fLastGlobalPosition(0)
00586 {
00587    // Constructor:
00588    //  s    - string to copmile into regular expression
00589    //  opts - PCRE-style option flags to be set on TPME object
00590 
00591    fPCREOpts = opts;
00592    Compile();
00593 }
00594 
00595 //______________________________________________________________________________
00596 TPMERegexp::TPMERegexp(const TPMERegexp& r) :
00597    TPRegexp(r),
00598    fNMaxMatches(r.fNMaxMatches),
00599    fNMatches(0),
00600    fAddressOfLastString(0),
00601    fLastGlobalPosition(0)
00602 {
00603    // Copy constructor.
00604    // Only PCRE specifics are copied, not last-match or global-matech
00605    // information.
00606 
00607    Compile();
00608 }
00609 
00610 //______________________________________________________________________________
00611 void TPMERegexp::Reset(const TString& s, const TString& opts, Int_t nMatchMax)
00612 {
00613    // Reset the patteren and options.
00614    // If 'nMatchMax' other than -1 (the default) is passed, it is also set.
00615 
00616    Reset(s, ParseMods(opts), nMatchMax);
00617 }
00618 
00619 //______________________________________________________________________________
00620 void TPMERegexp::Reset(const TString& s, UInt_t opts, Int_t nMatchMax)
00621 {
00622    // Reset the patteren and options.
00623    // If 'nMatchMax' other than -1 (the default) is passed, it is also set.
00624 
00625    fPattern = s;
00626    fPCREOpts = opts;
00627    Compile();
00628 
00629    if (nMatchMax != -1)
00630       fNMatches = nMatchMax;
00631    fNMatches = 0;
00632    fLastGlobalPosition = 0;
00633 }
00634 
00635 //______________________________________________________________________________
00636 void TPMERegexp::AssignGlobalState(const TPMERegexp& re)
00637 {
00638    // Copy global-match state from 're; so that this regexp can continue
00639    // parsing the string from where 're' left off.
00640    //
00641    // Alternatively, GetGlobalPosition() get be used to retrieve the
00642    // last match position so that it can passed to Match().
00643    //
00644    // Ideally, as it is done in PERL, the last match position would be
00645    // stored in the TString itself.
00646 
00647    fLastStringMatched  = re.fLastStringMatched;
00648    fLastGlobalPosition = re.fLastGlobalPosition;
00649 }
00650 
00651 //______________________________________________________________________________
00652 void TPMERegexp::ResetGlobalState()
00653 {
00654    // Reset state of global match.
00655    // This happens automatically when a new string is passed for matching.
00656    // But be carefull, as the address of last TString object is used
00657    // to make this decision.
00658 
00659    fLastGlobalPosition = 0;
00660 }
00661 
00662 //______________________________________________________________________________
00663 Int_t TPMERegexp::Match(const TString& s, UInt_t start)
00664 {
00665    // Runs a match on s against the regex 'this' was created with.
00666    //
00667    // Args:
00668    //  s        - string to match against
00669    //  start    - offset at which to start matching
00670    // Returns:  - number of matches found
00671 
00672    // If we got a new string, reset the global position counter.
00673    if (fAddressOfLastString != (void*) &s) {
00674       fLastGlobalPosition = 0;
00675    }
00676 
00677    if (fPCREOpts & kPCRE_GLOBAL) {
00678       start += fLastGlobalPosition;
00679    }
00680 
00681    //fprintf(stderr, "string: '%s' length: %d offset: %d\n", s.Data(), s.length(), offset);
00682    fNMatches = MatchInternal(s, start, fNMaxMatches, &fMarkers);
00683 
00684    //fprintf(stderr, "MatchInternal_exec result = %d\n", fNMatches);
00685 
00686    fLastStringMatched   = s;
00687    fAddressOfLastString = (void*) &s;
00688 
00689    if (fPCREOpts & kPCRE_GLOBAL) {
00690       if (fNMatches == PCRE_ERROR_NOMATCH) {
00691          // fprintf(stderr, "TPME RESETTING: reset for no match\n");
00692          fLastGlobalPosition = 0; // reset the position for next match (perl does this)
00693       } else if (fNMatches > 0) {
00694          // fprintf(stderr, "TPME RESETTING: setting to %d\n", marks[0].second);
00695          fLastGlobalPosition = fMarkers[1]; // set to the end of the match
00696       } else {
00697          // fprintf(stderr, "TPME RESETTING: reset for no unknown\n");
00698          fLastGlobalPosition = 0;
00699       }
00700    }
00701 
00702    return fNMatches;
00703 }
00704 
00705 //______________________________________________________________________________
00706 Int_t TPMERegexp::Split(const TString& s, Int_t maxfields)
00707 {
00708    // Splits into at most maxfields. If maxfields is unspecified or
00709    // 0, trailing empty matches are discarded. If maxfields is
00710    // positive, no more than maxfields fields will be returned and
00711    // trailing empty matches are preserved. If maxfields is empty,
00712    // all fields (including trailing empty ones) are returned. This
00713    // *should* be the same as the perl behaviour.
00714    //
00715    // If pattern produces sub-matches, these are also stored in
00716    // the result.
00717    //
00718    // A pattern matching the null string will split the value of EXPR
00719    // into separate characters at each point it matches that way.
00720    //
00721    // Args:
00722    //  s         - string to split
00723    //  maxfields - maximum number of fields to be split out.  0 means
00724    //              split all fields, but discard any trailing empty bits.
00725    //              Negative means split all fields and keep trailing empty bits.
00726    //              Positive means keep up to N fields including any empty fields
00727    //              less than N. Anything remaining is in the last field.
00728    // Returns:   - number of fields found
00729 
00730    typedef std::pair<int, int>   Marker_t;
00731    typedef std::vector<Marker_t> MarkerVec_t;
00732 
00733    // stores the marks for the split
00734    MarkerVec_t oMarks;
00735 
00736    // this is a list of current trailing empty matches if maxfields is
00737    //   unspecified or 0.  If there is stuff in it and a non-empty match
00738    //   is found, then everything in here is pushed into oMarks and then
00739    //   the new match is pushed on.  If the end of the string is reached
00740    //   and there are empty matches in here, they are discarded.
00741    MarkerVec_t oCurrentTrailingEmpties;
00742 
00743    Int_t nOffset = 0;
00744    Int_t nMatchesFound = 0;
00745 
00746    // while we are still finding matches and maxfields is 0 or negative
00747    //   (meaning we get all matches), or we haven't gotten to the number
00748    //   of specified matches
00749    Int_t matchRes;
00750    while ((matchRes = Match(s, nOffset)) &&
00751           ((maxfields < 1) || nMatchesFound < maxfields)) {
00752       ++nMatchesFound;
00753 
00754       if (fMarkers[1] - fMarkers[0] == 0) {
00755          oMarks.push_back(Marker_t(nOffset, nOffset + 1));
00756          ++nOffset;
00757          if (nOffset >= s.Length())
00758             break;
00759          else
00760             continue;
00761       }
00762 
00763       // match can be empty
00764       if (nOffset != fMarkers[0]) {
00765          if (!oCurrentTrailingEmpties.empty()) {
00766             oMarks.insert(oMarks.end(),
00767                           oCurrentTrailingEmpties.begin(),
00768                           oCurrentTrailingEmpties.end());
00769             oCurrentTrailingEmpties.clear();
00770          }
00771          oMarks.push_back(Marker_t(nOffset, fMarkers[0]));
00772       } else {
00773          // empty match
00774          if (maxfields == 0) {
00775             // store for possible later inclusion
00776             oCurrentTrailingEmpties.push_back(Marker_t(nOffset, nOffset));
00777          } else {
00778             oMarks.push_back(Marker_t(nOffset, nOffset));
00779          }
00780       }
00781 
00782       nOffset = fMarkers[1];
00783 
00784       if (matchRes > 1) {
00785          for (Int_t i = 1; i < matchRes; ++i)
00786             oMarks.push_back(Marker_t(fMarkers[2*i], fMarkers[2*i + 1]));
00787       }
00788    }
00789 
00790 
00791    // if there were no matches found, push the whole thing on
00792    if (nMatchesFound == 0) {
00793       oMarks.push_back(Marker_t(0, s.Length()));
00794    }
00795    // if we ran out of matches, then append the rest of the string
00796    //   onto the end of the last split field
00797    else if (maxfields > 0 && nMatchesFound >= maxfields) {
00798       oMarks[oMarks.size() - 1].second = s.Length();
00799    }
00800    // else we have to add another entry for the end of the string
00801    else {
00802       Bool_t last_empty = (nOffset == s.Length());
00803       if (!last_empty || maxfields < 0) {
00804          if (!oCurrentTrailingEmpties.empty()) {
00805             oMarks.insert(oMarks.end(),
00806                           oCurrentTrailingEmpties.begin(),
00807                           oCurrentTrailingEmpties.end());
00808          }
00809          oMarks.push_back(Marker_t(nOffset, s.Length()));
00810       }
00811    }
00812 
00813    fNMatches = oMarks.size();
00814    fMarkers.Set(2*fNMatches);
00815    for (Int_t i = 0; i < fNMatches; ++i) {
00816       fMarkers[2*i]     = oMarks[i].first;
00817       fMarkers[2*i + 1] = oMarks[i].second;
00818    }
00819 
00820    // fprintf(stderr, "match returning %d\n", fNMatches);
00821    return fNMatches;
00822 }
00823 
00824 //______________________________________________________________________________
00825 Int_t TPMERegexp::Substitute(TString& s, const TString& r, Bool_t doDollarSubst)
00826 {
00827    // Substitute matching part of s with r, dollar back-ref
00828    // substitution is performed if doDollarSubst is true (default).
00829    // Returns the number of substitutions made.
00830    //
00831    // After the substitution, another pass is made over the resulting
00832    // string and the following special tokens are interpreted:
00833    // \l - lowercase next char,
00834    // \u - uppercase next char,
00835    // \L - lowercase till \E,
00836    // \U - uppercase till \E, and
00837    // \E - end case modification.
00838 
00839    Int_t cnt = SubstituteInternal(s, r, 0, fNMaxMatches, doDollarSubst);
00840 
00841    TString ret;
00842    Int_t   state = 0;
00843    Ssiz_t  pos = 0, len = s.Length();
00844    const Char_t *data = s.Data();
00845    while (pos < len) {
00846       Char_t c = data[pos];
00847       if (c == '\\') {
00848          c = data[pos+1]; // Rely on string-data being null-terminated.
00849          switch (c) {
00850             case  0 : ret += '\\'; break;
00851             case 'l': state = 1;   break;
00852             case 'u': state = 2;   break;
00853             case 'L': state = 3;   break;
00854             case 'U': state = 4;   break;
00855             case 'E': state = 0;   break;
00856             default : ret += '\\'; ret += c; break;
00857          }
00858          pos += 2;
00859       } else {
00860          switch (state) {
00861             case 0:  ret += c; break;
00862             case 1:  ret += (Char_t) tolower(c); state = 0; break;
00863             case 2:  ret += (Char_t) toupper(c); state = 0; break;
00864             case 3:  ret += (Char_t) tolower(c); break;
00865             case 4:  ret += (Char_t) toupper(c); break;
00866             default: Error("TPMERegexp::Substitute", "invalid state.");
00867          }
00868          ++pos;
00869       }
00870    }
00871 
00872    s = ret;
00873 
00874    return cnt;
00875 }
00876 
00877 //______________________________________________________________________________
00878 TString TPMERegexp::operator[](int index)
00879 {
00880    // Returns the sub-string from the internal fMarkers vector.
00881    // Requires having run match or split first.
00882 
00883    if (index >= fNMatches)
00884       return "";
00885 
00886    Int_t begin = fMarkers[2*index];
00887    Int_t end   = fMarkers[2*index + 1];
00888    return fLastStringMatched(begin, end-begin);
00889 }
00890 
00891 //______________________________________________________________________________
00892 void TPMERegexp::Print(Option_t* option)
00893 {
00894    // Print the regular expression and modifier options.
00895    // If 'option' contains "all", prints also last string match and
00896    // match results.
00897 
00898    TString opt = option;
00899    opt.ToLower();
00900 
00901    Printf("Regexp='%s', Opts='%s'", fPattern.Data(), GetModifiers().Data());
00902    if (opt.Contains("all")) {
00903       Printf("  last string='%s'", fLastStringMatched.Data());
00904       Printf("  number of matches = %d", fNMatches);
00905       for (Int_t i=0; i<fNMatches; ++i)
00906          Printf("  %d - %s", i, operator[](i).Data());
00907    }
00908 }
00909 
00910 
00911 //////////////////////////////////////////////////////////////////////////
00912 //                                                                      //
00913 // TStringToken                                                         //
00914 //                                                                      //
00915 //////////////////////////////////////////////////////////////////////////
00916 
00917 //______________________________________________________________________________
00918 //
00919 // Provides iteration through tokens of a given string:
00920 //
00921 // - fFullStr     stores the string to be split. It is never modified.
00922 // - fSplitRe     is the perl-re that is used to separate the tokens.
00923 // - fReturnVoid  if true, empty strings will be returned.
00924 //
00925 // Current token is stored in the TString base-class.
00926 // During construction no match is done, use NextToken() to get the first
00927 // and all subsequent tokens.
00928 //
00929 
00930 ClassImp(TStringToken)
00931 
00932 //______________________________________________________________________________
00933 TStringToken::TStringToken(const TString& fullStr, const TString& splitRe, Bool_t retVoid) :
00934    fFullStr    (fullStr),
00935    fSplitRe    (splitRe),
00936    fReturnVoid (retVoid),
00937    fPos        (0)
00938 {
00939    // Constructor.
00940 }
00941 
00942 //______________________________________________________________________________
00943 Bool_t TStringToken::NextToken()
00944 {
00945    // Get the next token, it is stored in this TString.
00946    // Returns true if new token is available, false otherwise.
00947 
00948    TArrayI x;
00949    while (fPos < fFullStr.Length()) {
00950       if (fSplitRe.Match(fFullStr, "", fPos, 2, &x)) {
00951          TString::operator=(fFullStr(fPos, x[0] - fPos));
00952          fPos = x[1];
00953       } else {
00954          TString::operator=(fFullStr(fPos, fFullStr.Length() - fPos));
00955          fPos = fFullStr.Length() + 1;
00956       }
00957       if (Length() || fReturnVoid)
00958          return kTRUE;
00959    }
00960 
00961    // Special case: void-strings are requested and the full-string
00962    // ends with the separator. Thus we return another empty string.
00963    if (fPos == fFullStr.Length() && fReturnVoid) {
00964       TString::operator=("");
00965       fPos = fFullStr.Length() + 1;
00966       return kTRUE;
00967    }
00968 
00969    return kFALSE;
00970 }

Generated on Tue Jul 5 14:11:21 2011 for ROOT_528-00b_version by  doxygen 1.5.1