TRegexp.cxx

Go to the documentation of this file.
00001 // @(#)root/base:$Id: TRegexp.cxx 35844 2010-09-28 15:10:29Z rdm $
00002 // Author: Fons Rademakers   04/08/95
00003 
00004 /*************************************************************************
00005  * Copyright (C) 1995-2000, Rene Brun and Fons Rademakers.               *
00006  * All rights reserved.                                                  *
00007  *                                                                       *
00008  * For the licensing terms see $ROOTSYS/LICENSE.                         *
00009  * For the list of contributors see $ROOTSYS/README/CREDITS.             *
00010  *************************************************************************/
00011 
00012 //////////////////////////////////////////////////////////////////////////
00013 //                                                                      //
00014 // TRegexp                                                              //
00015 //                                                                      //
00016 // Regular expression class.                                            //
00017 //                                                                      //
00018 //   '^'             // start-of-line anchor                            //
00019 //   '$'             // end-of-line anchor                              //
00020 //   '.'             // matches any character                           //
00021 //   '['             // start a character class                         //
00022 //   ']'             // end a character class                           //
00023 //   '^'             // negates character class if 1st character        //
00024 //   '*'             // Kleene closure (matches 0 or more)              //
00025 //   '+'             // Positive closure (1 or more)                    //
00026 //   '?'             // Optional closure (0 or 1)                       //
00027 //                                                                      //
00028 //   Standard classes like [:alnum:], [:alpha:], etc. are not supported,//
00029 //   only [a-zA-Z], [^ntf] and so on.                                   //
00030 //                                                                      //
00031 //////////////////////////////////////////////////////////////////////////
00032 
00033 #include "TRegexp.h"
00034 #include "TString.h"
00035 #include "TError.h"
00036 
00037 const unsigned TRegexp::fgMaxpat = 2048;   // element count of each compiled-pattern buffer and of MakeWildcard()'s scratch buffer
00038 
00039 
00040 ClassImp(TRegexp)   // ROOT ClassImp macro invocation for TRegexp (macro is not defined in this file)
00041 
00042 //______________________________________________________________________________
00043 TRegexp::TRegexp(const char *re, Bool_t wildcard)
00044 {
00045    // Create a regular expression from the input string. If wildcard is true
00046    // then the input string contains a wildcard expression (see MakeWildcard()).
00047 
00048    if (wildcard)
00049       GenPattern(MakeWildcard(re));
00050    else
00051       GenPattern(re);
00052 }
00053 
00054 //______________________________________________________________________________
00055 TRegexp::TRegexp(const TString& re)
00056 {
00057    // Create a regular expression from a TString.
00058 
00059    GenPattern(re.Data());
00060 }
00061 
00062 //______________________________________________________________________________
00063 TRegexp::TRegexp(const TRegexp& r)
00064 {
00065    // Copy ctor.
00066 
00067    CopyPattern(r);
00068 }
00069 
00070 //______________________________________________________________________________
00071 TRegexp::~TRegexp()
00072 {
00073    // Destructor.
00074    delete [] fPattern;
00075 }
00076 
00077 //______________________________________________________________________________
00078 TRegexp& TRegexp::operator=(const TRegexp& r)
00079 {
00080    // Assignment operator.
00081 
00082    if (this != &r) {
00083       delete [] fPattern;
00084       CopyPattern(r);
00085    }
00086    return *this;
00087 }
00088 
00089 //______________________________________________________________________________
00090 TRegexp& TRegexp::operator=(const char *str)
00091 {
00092    // Assignment operator taking a char* and assigning it to a regexp.
00093 
00094    delete [] fPattern;
00095    GenPattern(str);
00096    return *this;
00097 }
00098 
00099 //______________________________________________________________________________
00100 TRegexp& TRegexp::operator=(const TString &str)
00101 {
00102    // Assignment operator taking a TString.
00103 
00104    delete [] fPattern;
00105    GenPattern(str.Data());
00106    return *this;
00107 }
00108 
00109 //______________________________________________________________________________
00110 void TRegexp::GenPattern(const char *str)
00111 {
00112    // Generate the regular expression pattern.
00113 
00114    fPattern = new Pattern_t[fgMaxpat];
00115    int error = ::Makepat(str, fPattern, fgMaxpat);
00116    fStat = (error < 3) ? (EStatVal) error : kToolong;
00117 }
00118 
00119 //______________________________________________________________________________
00120 void TRegexp::CopyPattern(const TRegexp& r)
00121 {
00122    // Copy the regular expression pattern.
00123 
00124    fPattern = new Pattern_t[fgMaxpat];
00125    memcpy(fPattern, r.fPattern, fgMaxpat * sizeof(Pattern_t));
00126    fStat = r.fStat;
00127 }
00128 
00129 //______________________________________________________________________________
00130 const char *TRegexp::MakeWildcard(const char *re)
00131 {
00132    // This routine transforms a wildcarding regular expression into
00133    // a general regular expression used for pattern matching.
00134    // When using wildcards the regular expression is assumed to be
00135    // preceded by a "^" (BOL) and terminated by a "$" (EOL). Also, all
00136    // "*"'s and "?"'s (closures) are assumed to be preceded by a "." (i.e. any
00137    // character, except "/"'s) and all .'s are escaped (so *.ps is different
00138    // from *.eps). The special treatment of "/" allows the easy matching of
00139    // pathnames, e.g. "*.root" will match "aap.root", but not "pipo/aap.root".
00140 
00141    static char buf[fgMaxpat];
00142    char *s = buf;
00143    if (!re) return "";
00144    int len = strlen(re);
00145    int slen = 0;
00146 
00147    if (!len) return "";
00148 
00149    for (int i = 0; i < len; i++) {
00150       if ((unsigned)slen > fgMaxpat - 10) {
00151          Error("MakeWildcard", "regexp too large");
00152          break;
00153       }
00154       if (i == 0 && re[i] != '^') {
00155          *s++ = '^';
00156          slen++;
00157       }
00158       if (re[i] == '*') {
00159 #ifndef R__WIN32
00160          //const char *wc = "[a-zA-Z0-9-+_\\.,: []<>]";
00161          const char *wc = "[^/]";
00162 #else
00163          //const char *wc = "[a-zA-Z0-9-+_., []<>]";
00164          const char *wc = "[^\\/:]";
00165 #endif
00166          strcpy(s, wc);
00167          s += strlen(wc);
00168          slen += strlen(wc);
00169       }
00170       if (re[i] == '.') {
00171          *s++ = '\\';
00172          slen++;
00173       }
00174       if (re[i] == '?') {
00175 #ifndef R__WIN32
00176          //const char *wc = "[a-zA-Z0-9-+_\\.,: []<>]";
00177          const char *wc = "[^/]";
00178 #else
00179          //const char *wc = "[a-zA-Z0-9-+_., []<>]";
00180          const char *wc = "[^\\/:]";
00181 #endif
00182          strcpy(s, wc);
00183          s += strlen(wc);
00184          slen += strlen(wc);
00185       } else {
00186          *s++ = re[i];
00187          slen++;
00188       }
00189       if (i == len-1 && re[i] != '$') {
00190          *s++ = '$';
00191          slen++;
00192       }
00193    }
00194    *s = '\0';
00195    return buf;
00196 }
00197 
00198 //______________________________________________________________________________
00199 Ssiz_t TRegexp::Index(const TString& string, Ssiz_t* len, Ssiz_t i) const
00200 {
00201    // Find the first occurance of the regexp in string and return the position.
00202    // Len is length of the matched string and i is the offset at which the
00203    // matching should start.
00204 
00205    if (fStat != kOK)
00206       Error("TRegexp::Index", "Bad Regular Expression");
00207 
00208    const char* startp;
00209    const char* s = string.Data();
00210    Ssiz_t slen = string.Length();
00211    if (slen < i) return kNPOS;
00212    const char* endp = ::Matchs(s+i, slen-i, fPattern, &startp);
00213    if (endp) {
00214       *len = endp - startp;
00215       return startp - s;
00216    } else {
00217       *len = 0;
00218       return kNPOS;
00219    }
00220 }
00221 
00222 //______________________________________________________________________________
00223 TRegexp::EStatVal TRegexp::Status()
00224 {
00225    // Check status of regexp.
00226 
00227    EStatVal temp = fStat;
00228    fStat = kOK;
00229    return temp;
00230 }
00231 
00232 //////////////////////////////////////////////////////////////////////////
00233 //                                                                      //
00234 // TString member functions, put here so the linker will include        //
00235 // them only if regular expressions are used.                           //
00236 //                                                                      //
00237 //////////////////////////////////////////////////////////////////////////
00238 
00239 //______________________________________________________________________________
00240 Ssiz_t TString::Index(const TRegexp& r, Ssiz_t start) const
00241 {
00242    // Find the first occurance of the regexp in string and return the position.
00243    // Start is the offset at which the search should start.
00244 
00245    Ssiz_t len;
00246    return r.Index(*this, &len, start); // len not used
00247 }
00248 
00249 //______________________________________________________________________________
00250 Ssiz_t TString::Index(const TRegexp& r, Ssiz_t* extent, Ssiz_t start) const
00251 {
00252    // Find the first occurance of the regexp in string and return the position.
00253    // Extent is length of the matched string and start is the offset at which
00254    // the matching should start.
00255 
00256    return r.Index(*this, extent, start);
00257 }
00258 
00259 //______________________________________________________________________________
00260 TSubString TString::operator()(const TRegexp& r, Ssiz_t start) const
00261 {
00262    // Return the substring found by applying the regexp starting at start.
00263 
00264    Ssiz_t len;
00265    Ssiz_t begin = Index(r, &len, start);
00266    return TSubString(*this, begin, len);
00267 }
00268 
00269 //______________________________________________________________________________
00270 TSubString TString::operator()(const TRegexp& r) const
00271 {
00272    // Return the substring found by applying the regexp.
00273 
00274    return (*this)(r,0);
00275 }
00276 
00277 //__________________________________________________________________________________
00278 Bool_t TString::Tokenize(TString &tok, Ssiz_t &from, const char *delim) const
00279 {
00280    // Search for tokens delimited by regular expression 'delim' (default " ")
00281    // in this string; search starts at 'from' and the token is returned in 'tok'.
00282    // Returns in 'from' the next position after the delimiter.
00283    // Returns kTRUE if a token is found, kFALSE if not or if some inconsistency
00284    // occured.
00285    // This method allows to loop over tokens in this way:
00286    //
00287    //    TString myl = "tok1 tok2|tok3";
00288    //    TString tok;
00289    //    Ssiz_t from = 0;
00290    //    while (myl.Tokenize(tok, from, "[ |]")) {
00291    //       // Analyse tok
00292    //       ...
00293    //    }
00294    //
00295    // more convenient of the other Tokenize method when saving the tokens is not
00296    // needed.
00297 
00298    Bool_t found = kFALSE;
00299 
00300    // Reset the token
00301    tok = "";
00302 
00303    // Make sure inputs make sense
00304    Int_t len = Length();
00305    if (len <= 0 || from > (len - 1) || from < 0)
00306       return found;
00307 
00308    TRegexp rg(delim);
00309 
00310    while (tok.IsNull()) {
00311       // Find delimiter
00312       Int_t ext = 0;
00313       Int_t pos = Index(rg, &ext, from);
00314 
00315       // Assign to token
00316       if (pos == kNPOS || pos > from) {
00317          Ssiz_t last = (pos != kNPOS) ? (pos - 1) : len;
00318          tok = (*this)(from, last-from+1);
00319       }
00320       found = kTRUE;
00321 
00322       // Update start-of-search index
00323       from = pos + ext;
00324       if (pos == kNPOS) {
00325          from = pos;
00326          if (tok.IsNull()) {
00327             // Empty, last token
00328             found = kFALSE;
00329             break;
00330          }
00331       }
00332    }
00333    // Make sure that 'from' has a meaningful value
00334    from = (from < len) ? from : len;
00335 
00336    // Done
00337    return found;
00338 }

Generated on Tue Jul 5 14:11:23 2011 for ROOT_528-00b_version by  doxygen 1.5.1