Match.cxx

Go to the documentation of this file.
00001 // @(#)root/base:$Id: Match.cxx 20877 2007-11-19 11:17:07Z rdm $
00002 // Author: Fons Rademakers   04/08/95
00003 
00004 /*************************************************************************
00005  * Copyright (C) 1995-2000, Rene Brun and Fons Rademakers.               *
00006  * All rights reserved.                                                  *
00007  *                                                                       *
00008  * For the licensing terms see $ROOTSYS/LICENSE.                         *
00009  * For the list of contributors see $ROOTSYS/README/CREDITS.             *
00010  *************************************************************************/
00011 
00012 //////////////////////////////////////////////////////////////////////////
00013 //                                                                      //
00014 // Author:    Allen I. Holub                                            //
00015 //                                                                      //
00016 // (c) C Gazette. May be used freely as long as author and publication  //
00017 // are acknowledged.                                                    //
00018 //                                                                      //
00019 //////////////////////////////////////////////////////////////////////////
00020 
00021 #include <stdio.h>
00022 #include <ctype.h>
00023 #include <string.h>
00024 
00025 
00026 #include "Match.h"
00027 
00028 
00029 // Metacharacters in the input:
00030 #define BOL     '^'             // start-of-line anchor
00031 #define EOL     '$'             // end-of-line anchor
00032 #define ANY     '.'             // matches any character
00033 #define CCL     '['             // start a character class
00034 #define CCLEND  ']'             // end a character class
00035 #define NCCL    '^'             // negates character class if 1st character
00036 #define CLOSURE '*'             // Kleene closure (matches 0 or more)
00037 #define PCLOSE  '+'             // Positive closure (1 or more)
00038 #define OPT     '?'             // Optional closure (0 or 1)
00039 
00040 enum Eaction {                  // These are put in the pattern string
00041                                 // to represent metacharacters.
00042    kBOL    = (0x8000 | '^'),
00043    kEOL    = (0x8000 | '$'),
00044    kANY    = (0x8000 | '.'),
00045    kCCL    = (0x8000 | '['),
00046    kOPT    = (0x8000 | '?'),
00047    kCLOSE  = (0x8000 | '*'),
00048    kPCLOSE = (0x8000 | '+'),
00049    kEND    = (0x8000 | 0)      // end of pattern
00050 
00051 };
00052 
00053 #if 1
00054 #define ISHEXDIGIT(x) isxdigit((unsigned char)x)
00055 #else
00056 #define ISHEXDIGIT(x) (isdigit(x)                       \
00057                             || ('a'<=(x) && (x)<='f')   \
00058                             || ('A'<=(x) && (x)<='F')   )
00059 #endif
00060 
00061 #define ISOCTDIGIT(x) ('0'<=(x) && (x)<='7')
00062 
00063 // ----------------------------------------------------------------------
00064 #define MAPSIZE 16              // need this many Pattern_t elements for
00065                                 // character class bit map
00066 
00067 //______________________________________________________________________________
00068 inline void ADVANCE(const Pattern_t*& pat)
00069 {
00070    // Advance a pointer into the pattern template to the next pattern element,
00071    // this is a+1 for all pattern elements but kCCL, where you have to skip
00072    // past both the kCCL character and the bitmap that follows that character.
00073 
00074    if (*pat++ == (Pattern_t) kCCL)
00075       pat += MAPSIZE;
00076 }
00077 
00078 //
00079 // Bitmap functions. Set bit b in the map and
00080 // test bit b to see if it was set previously.
00081 //
00082 
00083 //______________________________________________________________________________
00084 #ifdef SETBIT // from Rtypes.h
00085 #undef SETBIT
00086 #endif
00087 
00088 static void SETBIT(unsigned char b, Pattern_t* map)
00089 {
00090    map[b >> 4] |= 1 << (b & 0x0f);
00091 }
00092 
00093 //______________________________________________________________________________
00094 static int TSTBIT(unsigned char b, const Pattern_t* map)
00095 {
00096    return map[b >> 4] & (1 << (b & 0x0f));
00097 }
00098 
00099 // ----------------------------------------------------------------------
00100 
00101 #define E_NONE       0          // Possible return values from pat_err
00102 #define E_ILLEGAL    1          // Set in Makepat() to indicate prob-
00103 #define E_NOMEM      2          // lems that came up while making the
00104 #define E_PAT        3          // pattern template.
00105 
00106 // ----------------------------------------------------------------------
00107 
00108 static const char*  doccl(Pattern_t*, const char*);
00109 static int          hex2bin(int);
00110 static int          oct2bin(int);
00111 static int          omatch(const char**, size_t*, const Pattern_t*, const char*);
00112 static const char*  patcmp(const char*, size_t, const Pattern_t*, const char*);
00113 static int          esc(const char**);
00114 
00115 // ----------------------------------------------------------------------
00116 
00117 //______________________________________________________________________________
00118 int Makepat(const char*     exp,        // Regular expression
00119             Pattern_t*      pat,        // Assembled compiled pattern
00120             int             maxpat)     // Length of pat
00121 {
00122    // Make a pattern template from the string pointed to by exp. Stop when
00123    // '\0' is found in exp.  The pattern template is assembled
00124    // in pat whose length is given by maxpat.
00125    //
00126    // Return:
00127    // E_ILLEGAL       Illegal input pattern.
00128    // E_NOMEM         out of memory.
00129    // E_PAT           pattern too long.
00130 
00131    Pattern_t* cur;           // pointer to current pattern element
00132    Pattern_t* prev;          // pointer to previous pattern element
00133    int        theError = E_ILLEGAL;
00134 
00135    if (!*exp)
00136       goto exit;
00137 
00138    if (*exp == CLOSURE || *exp == PCLOSE || *exp == OPT)
00139       goto exit;
00140 
00141    theError = E_NOMEM;
00142    if (!pat) goto exit;          // Check for bad pat
00143 
00144    prev = cur = pat;
00145    theError = E_PAT;
00146 
00147    while (*exp) {
00148 
00149       if (cur >= &pat[maxpat - 1]) goto exit;
00150 
00151       switch (*exp) {
00152       case ANY:
00153          *cur = (Pattern_t)kANY;
00154          prev = cur++;
00155          ++exp;
00156          break;
00157 
00158       case BOL:
00159          *cur = (cur == pat) ? (Pattern_t)kBOL : (unsigned char)*exp;
00160          prev = cur++;
00161          ++exp;
00162          break;
00163 
00164       case EOL:
00165          *cur = (!exp[1]) ? (Pattern_t)kEOL : (unsigned char)*exp;
00166          prev = cur++;
00167          ++exp;
00168          break;
00169 
00170       case CCL:
00171          if (((cur - pat) + MAPSIZE) >= maxpat)
00172             goto exit;              // not enough room for bit map
00173          prev = cur;
00174          *cur++ = (Pattern_t)kCCL;
00175          exp = doccl(cur, exp);
00176          if (*exp != CCLEND) goto exit;
00177          ++exp;
00178          cur += MAPSIZE;
00179          break;
00180 
00181       case OPT:
00182       case CLOSURE:
00183       case PCLOSE:
00184          switch (*prev) {
00185          case kBOL:
00186          case kEOL:
00187          case kOPT:
00188          case kPCLOSE:
00189          case kCLOSE:
00190             goto exit;
00191          }
00192 
00193          memmove( prev+1, prev, (cur-prev)*sizeof(*cur));
00194          *prev = (Pattern_t) (*exp == OPT) ?    kOPT :
00195                              (*exp == PCLOSE) ? kPCLOSE :
00196                                                 kCLOSE;
00197          ++cur;
00198          ++exp;
00199          break;
00200 
00201       default:
00202          prev = cur;
00203          *cur++ = esc(&exp);
00204          break;
00205       }
00206    }
00207 
00208    *cur = (Pattern_t)kEND;
00209    theError = E_NONE;
00210 
00211 exit:
00212    return theError;
00213 }
00214 
00215 //______________________________________________________________________________
00216 const char *Matchs(const char*       str,
00217                    size_t            slen,
00218                    const Pattern_t*  pat,
00219                    const char**      startpat)
00220 {
00221    // Match a string with a pattern.
00222 
00223    if (!pat) return 0;
00224    const char* endp = 0;
00225    if (*pat == (Pattern_t)kBOL) {
00226       // the rest has to match directly
00227       endp = patcmp(str, slen, pat+1, str);
00228    } else {
00229       // scoot along the string until it matches, or no more string
00230       const char* start = str;
00231       while ((endp = patcmp(str, slen, pat, start)) == 0 && slen != 0)
00232          ++str, --slen;
00233    }
00234    *startpat = str;
00235    return endp;
00236 }
00237 
00238 //______________________________________________________________________________
00239 static const char *doccl(Pattern_t*  map, const char* src)
00240 {
00241    // Set bits in the map corresponding to characters specified in the src
00242    // character class.
00243 
00244    int negative;
00245 
00246    ++src;                        // skip past the [
00247    negative = (*src == NCCL);
00248    if (negative)                 // check for negative ccl
00249       ++src;
00250    memset(map, 0, MAPSIZE*sizeof(*map)); // bitmap initially empty
00251 
00252    while (*src && *src != CCLEND) {
00253       int isCCL = (*src == CCL);  // support [] in pattern, don't put single [ before closing ]
00254       unsigned char first = esc(&src);
00255       SETBIT(first, map);
00256       if (isCCL && *src && *src == CCLEND) {
00257          first = esc(&src);
00258          SETBIT(first, map);
00259       }
00260       if (*src == '-' && src[1] && src[1] != CCLEND) {
00261          ++src;                    // skip to end-of-sequence char
00262          unsigned char last = esc(&src);
00263          if (first <= last)  while (first < last ) SETBIT(++first, map);
00264          else                while (last  < first) SETBIT(last++,  map);
00265       }
00266    }
00267 
00268    if (negative) {
00269       for (int i = MAPSIZE; --i >= 0;)
00270          *map++ ^= ~0;             // invert all bits
00271    }
00272 
00273    return src;
00274 }
00275 
00276 //______________________________________________________________________________
00277 static const char *patcmp(const char*      str,
00278                           size_t           slen,
00279                           const Pattern_t* pat,
00280                           const char*      start)
00281 {
00282    // Like strcmp, but compares str against pat. Each element of str is
00283    // compared with the template until either a mis-match is found or the end
00284    // of the template is reached. In the former case a 0 is returned; in the
00285    // latter, a pointer into str (pointing after the last character in the
00286    // matched pattern) is returned. start points at the first character in
00287    // the string, which might not be the same thing as str if the search
00288    // started in the middle of the string.
00289 
00290    if (!pat)                     // make sure pattern is valid
00291       return 0;
00292 
00293    while (*pat != (Pattern_t)kEND) {
00294       if (*pat == (Pattern_t)kOPT) {
00295 
00296          // Zero or one matches. It doesn't matter if omatch fails---it will
00297          // advance str past the character on success, though. Always advance
00298          // the pattern past both the kOPT and the operand.
00299          omatch(&str, &slen, ++pat, start);
00300          ADVANCE(pat);
00301 
00302       } else if (*pat != (Pattern_t)kCLOSE &&
00303                  *pat != (Pattern_t)kPCLOSE)    {
00304 
00305          // Do a simple match. Note that omatch() fails if there's still
00306          // something in pat but we're at end of string.
00307 
00308          if (!omatch(&str, &slen, pat, start))
00309             return 0;
00310          ADVANCE(pat);
00311 
00312       } else {                    // Process a Kleene or positive closure
00313 
00314          if (*pat++ == (Pattern_t)kPCLOSE)    // one match required
00315             if (!omatch(&str, &slen, pat, start))
00316                return 0;
00317 
00318          // Match as many as possible, zero is okay
00319 
00320          const char* bocl = str;           // beginning of closure string.
00321          while (slen && omatch(&str, &slen, pat, start))
00322             ;
00323          ADVANCE(pat);  // skip over the closure
00324          if (*pat == (Pattern_t)kEND)
00325             break;
00326 
00327          // 'str' now points to the character that made made us fail. Try to
00328          // process the rest of the string. If the character following the
00329          // closure could have been in the closure (as in the pattern "[a-z]*t")
00330          // the final 't' will be sucked up in the while loop. So, if the match
00331          // fails, back up a notch and try to match the rest of the string
00332          // again, repeating this process until we get back to the
00333          // beginning of the closure. The recursion goes as many levels
00334          // deep as there are closures in the pattern.
00335 
00336          const char* end;
00337          while ((end = patcmp(str, slen, pat, start)) == 0) {
00338             ++slen, --str;
00339             if (str < bocl) break;
00340          }
00341          return end;
00342 
00343       }  // closure
00344    }  // while (*pat != kEND)
00345 
00346    //
00347    // omatch() advances str to point at the next character to be matched. So
00348    // str points at the character following the last character matched when
00349    // you reach the end of the template. The exceptions are templates
00350    // containing only a BOLN or EOLN token. In these cases omatch doesn't
00351    // advance.
00352    //
00353 
00354    return str;
00355 }
00356 
00357 //______________________________________________________________________________
00358 static int omatch(const char**      strp,
00359                   size_t*           slenp,
00360                   const Pattern_t*  pat,
00361                   const char*       start)
00362 {
00363    // Match one pattern element, pointed at by pat, against the character at
00364    // **strp. Return 0 on a failure, 1 on success. *strp is advanced to skip
00365    // over the matched character on a successful match. Closure is handled one
00366    // level up by patcmp().
00367    //
00368    // "start" points at the character at the left edge of the line. This might
00369    // not be the same thing as *strp if the search is starting in the middle
00370    // of the string. An end-of- line anchor matches end of string only.
00371 
00372    switch (*pat) {
00373    // Match beginning of line
00374    case kBOL:   return (*strp == start);
00375 
00376    // Match end of line
00377    case kEOL:   return (*slenp == 0);
00378 
00379    // Notice: cases above don't advance.
00380    // Match any except newline
00381    case kANY: if (**strp == '\n') return 0;
00382       break;
00383 
00384    // Set match
00385    case kCCL: if (*slenp == 0 || !TSTBIT(**strp, pat + 1)) return 0;
00386       break;
00387 
00388    // Literal match
00389    default:    if (*slenp == 0 || (unsigned char) **strp != *pat)  return 0;
00390       break;
00391    }
00392 
00393    if (*slenp) {
00394       ++*strp;
00395       --*slenp;
00396    }
00397    return 2;
00398 }
00399 
00400 //______________________________________________________________________________
00401 static int hex2bin(int c)
00402 {
00403    // Convert the hex digit represented by 'c' to an int. 'c'
00404    // must be one of: 0123456789abcdefABCDEF
00405 
00406    return (isdigit(c) ? (c)-'0': ((toupper((unsigned char)c))-'A')+10)  & 0xf;
00407 }
00408 
00409 //______________________________________________________________________________
00410 static int oct2bin(int c)
00411 {
00412    // Convert the hex digit represented by 'c' to an int. 'c'
00413    // must be a digit in the range '0'-'7'.
00414 
00415    return ( ((c)-'0')  &  0x7 );
00416 }
00417 
00418 //______________________________________________________________________________
00419 static int esc(const char** s)
00420 {
00421    // Map escape sequences into their equivalent symbols. Return
00422    // the equivalent ASCII character. *s is advanced past the
00423    // escape sequence. If no escape sequence is present, the
00424    // current character is returned and the string is advanced by
00425    // one. The following are recognized:
00426    //
00427    //  \b     backspace
00428    //  \f     formfeed
00429    //  \n     newline
00430    //  \r     carriage return
00431    //  \s     space
00432    //  \t     tab
00433    //  \e     ASCII ESC character ('\033')
00434    //  \DDD   number formed of 1-3 octal digits
00435    //  \xDD   number formed of 1-2 hex digits
00436    //  \^C    C = any letter. Control code
00437 
00438    int rval;
00439 
00440    if (**s != '\\')
00441       rval = *((*s)++);
00442    else {
00443       ++(*s);                                 // Skip the backslash (\)
00444       switch (toupper((unsigned char)**s)) {
00445       case '\0':  rval = '\\';             break;
00446       case 'B':   rval = '\b';             break;
00447       case 'F':   rval = '\f';             break;
00448       case 'N':   rval = '\n';             break;
00449       case 'R':   rval = '\r';             break;
00450       case 'S':   rval = ' ';              break;
00451       case 'T':   rval = '\t';             break;
00452       case 'E':   rval = '\033';           break;
00453 
00454       case '^':
00455          rval = *++(*s) ;
00456          rval = toupper((unsigned char)rval) - '@';
00457          break;
00458 
00459       case 'X':
00460          rval = 0;
00461          ++(*s);
00462          if (ISHEXDIGIT(**s)) {
00463             rval  = hex2bin((unsigned char) *(*s)++);
00464          }
00465          if (ISHEXDIGIT(**s)) {
00466             rval <<= 4;
00467             rval  |= hex2bin((unsigned char) *(*s)++);
00468          }
00469          --(*s);
00470          break;
00471 
00472       default:
00473          if (!ISOCTDIGIT(**s))
00474             rval = **s;
00475          else {
00476             rval = oct2bin(*(*s)++);
00477             if (ISOCTDIGIT(**s)) {
00478                rval <<= 3;
00479                rval  |= oct2bin(*(*s)++);
00480             }
00481             if (ISOCTDIGIT(**s)) {
00482                rval <<= 3;
00483                rval  |= oct2bin(*(*s)++);
00484             }
00485             --(*s);
00486          }
00487          break;
00488       }
00489    ++(*s);
00490    }
00491    return (unsigned char)rval;
00492 }

Generated on Tue Jul 5 14:11:18 2011 for ROOT_528-00b_version by  doxygen 1.5.1