pcrecpp.cc

Go to the documentation of this file.
00001 // Copyright (c) 2005, Google Inc.
00002 // All rights reserved.
00003 //
00004 // Redistribution and use in source and binary forms, with or without
00005 // modification, are permitted provided that the following conditions are
00006 // met:
00007 //
00008 //     * Redistributions of source code must retain the above copyright
00009 // notice, this list of conditions and the following disclaimer.
00010 //     * Redistributions in binary form must reproduce the above
00011 // copyright notice, this list of conditions and the following disclaimer
00012 // in the documentation and/or other materials provided with the
00013 // distribution.
00014 //     * Neither the name of Google Inc. nor the names of its
00015 // contributors may be used to endorse or promote products derived from
00016 // this software without specific prior written permission.
00017 //
00018 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00019 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00020 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
00021 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
00022 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00023 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
00024 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
00025 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
00026 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00027 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
00028 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00029 //
00030 // Author: Sanjay Ghemawat
00031 
00032 #ifdef HAVE_CONFIG_H
00033 #include "config.h"
00034 #endif
00035 
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include <limits.h>      /* for SHRT_MIN, USHRT_MAX, etc */
#include <string.h>      /* for memcpy */
#include <assert.h>
#include <errno.h>
#include <string>
#include <algorithm>
00044 
00045 #include "pcrecpp_internal.h"
00046 #include "pcre.h"
00047 #include "pcrecpp.h"
00048 #include "pcre_stringpiece.h"
00049 
00050 
00051 namespace pcrecpp {
00052 
// Upper bound on the number of capture arguments a single call can fill in.
static const int kMaxArgs = 16;
// Size of the offsets vector used with kMaxArgs captures: one slot group
// for the overall match plus one per capture, three ints each
// (results + PCRE workspace).
static const int kVecSize = 3 * (kMaxArgs + 1);
00056 
00057 // Special object that stands-in for no argument
00058 Arg RE::no_arg((void*)NULL);
00059 
00060 // This is for ABI compatibility with old versions of pcre (pre-7.6),
00061 // which defined a global no_arg variable instead of putting it in the
00062 // RE class.  This works on GCC >= 3, at least.  It definitely works
00063 // for ELF, but may not for other object formats (Mach-O, for
00064 // instance, does not support aliases.)  We could probably have a more
00065 // inclusive test if we ever needed it.  (Note that not only the
00066 // __attribute__ syntax, but also __USER_LABEL_PREFIX__, are
00067 // gnu-specific.)
00068 #if defined(__GNUC__) && __GNUC__ >= 3 && defined(__ELF__)
00069 # define ULP_AS_STRING(x)            ULP_AS_STRING_INTERNAL(x)
00070 # define ULP_AS_STRING_INTERNAL(x)   #x
00071 # define USER_LABEL_PREFIX_STR       ULP_AS_STRING(__USER_LABEL_PREFIX__)
00072 extern Arg no_arg
00073   __attribute__((alias(USER_LABEL_PREFIX_STR "_ZN7pcrecpp2RE6no_argE")));
00074 #endif
00075 
00076 // If a regular expression has no error, its error_ field points here
00077 static const string empty_string;
00078 
00079 // If the user doesn't ask for any options, we just use this one
00080 static RE_Options default_options;
00081 
00082 void RE::Init(const string& pat, const RE_Options* options) {
00083   pattern_ = pat;
00084   if (options == NULL) {
00085     options_ = default_options;
00086   } else {
00087     options_ = *options;
00088   }
00089   error_ = &empty_string;
00090   re_full_ = NULL;
00091   re_partial_ = NULL;
00092 
00093   re_partial_ = Compile(UNANCHORED);
00094   if (re_partial_ != NULL) {
00095     re_full_ = Compile(ANCHOR_BOTH);
00096   }
00097 }
00098 
00099 void RE::Cleanup() {
00100   if (re_full_ != NULL)         (*pcre_free)(re_full_);
00101   if (re_partial_ != NULL)      (*pcre_free)(re_partial_);
00102   if (error_ != &empty_string)  delete error_;
00103 }
00104 
00105 
00106 RE::~RE() {
00107   Cleanup();
00108 }
00109 
00110 
00111 pcre* RE::Compile(Anchor anchor) {
00112   // First, convert RE_Options into pcre options
00113   int pcre_options = 0;
00114   pcre_options = options_.all_options();
00115 
00116   // Special treatment for anchoring.  This is needed because at
00117   // runtime pcre only provides an option for anchoring at the
00118   // beginning of a string (unless you use offset).
00119   //
00120   // There are three types of anchoring we want:
00121   //    UNANCHORED      Compile the original pattern, and use
00122   //                    a pcre unanchored match.
00123   //    ANCHOR_START    Compile the original pattern, and use
00124   //                    a pcre anchored match.
00125   //    ANCHOR_BOTH     Tack a "\z" to the end of the original pattern
00126   //                    and use a pcre anchored match.
00127 
00128   const char* compile_error;
00129   int eoffset;
00130   pcre* re;
00131   if (anchor != ANCHOR_BOTH) {
00132     re = pcre_compile(pattern_.c_str(), pcre_options,
00133                       &compile_error, &eoffset, NULL);
00134   } else {
00135     // Tack a '\z' at the end of RE.  Parenthesize it first so that
00136     // the '\z' applies to all top-level alternatives in the regexp.
00137     string wrapped = "(?:";  // A non-counting grouping operator
00138     wrapped += pattern_;
00139     wrapped += ")\\z";
00140     re = pcre_compile(wrapped.c_str(), pcre_options,
00141                       &compile_error, &eoffset, NULL);
00142   }
00143   if (re == NULL) {
00144     if (error_ == &empty_string) error_ = new string(compile_error);
00145   }
00146   return re;
00147 }
00148 
00149 /***** Matching interfaces *****/
00150 
00151 bool RE::FullMatch(const StringPiece& text,
00152                    const Arg& ptr1,
00153                    const Arg& ptr2,
00154                    const Arg& ptr3,
00155                    const Arg& ptr4,
00156                    const Arg& ptr5,
00157                    const Arg& ptr6,
00158                    const Arg& ptr7,
00159                    const Arg& ptr8,
00160                    const Arg& ptr9,
00161                    const Arg& ptr10,
00162                    const Arg& ptr11,
00163                    const Arg& ptr12,
00164                    const Arg& ptr13,
00165                    const Arg& ptr14,
00166                    const Arg& ptr15,
00167                    const Arg& ptr16) const {
00168   const Arg* args[kMaxArgs];
00169   int n = 0;
00170   if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
00171   if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
00172   if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
00173   if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
00174   if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
00175   if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
00176   if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
00177   if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
00178   if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
00179   if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
00180   if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
00181   if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
00182   if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
00183   if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
00184   if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
00185   if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
00186  done:
00187 
00188   int consumed;
00189   int vec[kVecSize];
00190   return DoMatchImpl(text, ANCHOR_BOTH, &consumed, args, n, vec, kVecSize);
00191 }
00192 
00193 bool RE::PartialMatch(const StringPiece& text,
00194                       const Arg& ptr1,
00195                       const Arg& ptr2,
00196                       const Arg& ptr3,
00197                       const Arg& ptr4,
00198                       const Arg& ptr5,
00199                       const Arg& ptr6,
00200                       const Arg& ptr7,
00201                       const Arg& ptr8,
00202                       const Arg& ptr9,
00203                       const Arg& ptr10,
00204                       const Arg& ptr11,
00205                       const Arg& ptr12,
00206                       const Arg& ptr13,
00207                       const Arg& ptr14,
00208                       const Arg& ptr15,
00209                       const Arg& ptr16) const {
00210   const Arg* args[kMaxArgs];
00211   int n = 0;
00212   if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
00213   if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
00214   if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
00215   if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
00216   if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
00217   if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
00218   if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
00219   if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
00220   if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
00221   if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
00222   if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
00223   if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
00224   if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
00225   if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
00226   if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
00227   if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
00228  done:
00229 
00230   int consumed;
00231   int vec[kVecSize];
00232   return DoMatchImpl(text, UNANCHORED, &consumed, args, n, vec, kVecSize);
00233 }
00234 
00235 bool RE::Consume(StringPiece* input,
00236                  const Arg& ptr1,
00237                  const Arg& ptr2,
00238                  const Arg& ptr3,
00239                  const Arg& ptr4,
00240                  const Arg& ptr5,
00241                  const Arg& ptr6,
00242                  const Arg& ptr7,
00243                  const Arg& ptr8,
00244                  const Arg& ptr9,
00245                  const Arg& ptr10,
00246                  const Arg& ptr11,
00247                  const Arg& ptr12,
00248                  const Arg& ptr13,
00249                  const Arg& ptr14,
00250                  const Arg& ptr15,
00251                  const Arg& ptr16) const {
00252   const Arg* args[kMaxArgs];
00253   int n = 0;
00254   if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
00255   if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
00256   if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
00257   if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
00258   if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
00259   if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
00260   if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
00261   if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
00262   if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
00263   if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
00264   if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
00265   if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
00266   if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
00267   if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
00268   if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
00269   if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
00270  done:
00271 
00272   int consumed;
00273   int vec[kVecSize];
00274   if (DoMatchImpl(*input, ANCHOR_START, &consumed,
00275                   args, n, vec, kVecSize)) {
00276     input->remove_prefix(consumed);
00277     return true;
00278   } else {
00279     return false;
00280   }
00281 }
00282 
00283 bool RE::FindAndConsume(StringPiece* input,
00284                         const Arg& ptr1,
00285                         const Arg& ptr2,
00286                         const Arg& ptr3,
00287                         const Arg& ptr4,
00288                         const Arg& ptr5,
00289                         const Arg& ptr6,
00290                         const Arg& ptr7,
00291                         const Arg& ptr8,
00292                         const Arg& ptr9,
00293                         const Arg& ptr10,
00294                         const Arg& ptr11,
00295                         const Arg& ptr12,
00296                         const Arg& ptr13,
00297                         const Arg& ptr14,
00298                         const Arg& ptr15,
00299                         const Arg& ptr16) const {
00300   const Arg* args[kMaxArgs];
00301   int n = 0;
00302   if (&ptr1  == &no_arg) goto done; args[n++] = &ptr1;
00303   if (&ptr2  == &no_arg) goto done; args[n++] = &ptr2;
00304   if (&ptr3  == &no_arg) goto done; args[n++] = &ptr3;
00305   if (&ptr4  == &no_arg) goto done; args[n++] = &ptr4;
00306   if (&ptr5  == &no_arg) goto done; args[n++] = &ptr5;
00307   if (&ptr6  == &no_arg) goto done; args[n++] = &ptr6;
00308   if (&ptr7  == &no_arg) goto done; args[n++] = &ptr7;
00309   if (&ptr8  == &no_arg) goto done; args[n++] = &ptr8;
00310   if (&ptr9  == &no_arg) goto done; args[n++] = &ptr9;
00311   if (&ptr10 == &no_arg) goto done; args[n++] = &ptr10;
00312   if (&ptr11 == &no_arg) goto done; args[n++] = &ptr11;
00313   if (&ptr12 == &no_arg) goto done; args[n++] = &ptr12;
00314   if (&ptr13 == &no_arg) goto done; args[n++] = &ptr13;
00315   if (&ptr14 == &no_arg) goto done; args[n++] = &ptr14;
00316   if (&ptr15 == &no_arg) goto done; args[n++] = &ptr15;
00317   if (&ptr16 == &no_arg) goto done; args[n++] = &ptr16;
00318  done:
00319 
00320   int consumed;
00321   int vec[kVecSize];
00322   if (DoMatchImpl(*input, UNANCHORED, &consumed,
00323                   args, n, vec, kVecSize)) {
00324     input->remove_prefix(consumed);
00325     return true;
00326   } else {
00327     return false;
00328   }
00329 }
00330 
00331 bool RE::Replace(const StringPiece& rewrite,
00332                  string *str) const {
00333   int vec[kVecSize];
00334   int matches = TryMatch(*str, 0, UNANCHORED, vec, kVecSize);
00335   if (matches == 0)
00336     return false;
00337 
00338   string s;
00339   if (!Rewrite(&s, rewrite, *str, vec, matches))
00340     return false;
00341 
00342   assert(vec[0] >= 0);
00343   assert(vec[1] >= 0);
00344   str->replace(vec[0], vec[1] - vec[0], s);
00345   return true;
00346 }
00347 
00348 // Returns PCRE_NEWLINE_CRLF, PCRE_NEWLINE_CR, or PCRE_NEWLINE_LF.
00349 // Note that PCRE_NEWLINE_CRLF is defined to be P_N_CR | P_N_LF.
00350 // Modified by PH to add PCRE_NEWLINE_ANY and PCRE_NEWLINE_ANYCRLF.
00351 
00352 static int NewlineMode(int pcre_options) {
00353   // TODO: if we can make it threadsafe, cache this var
00354   int newline_mode = 0;
00355   /* if (newline_mode) return newline_mode; */  // do this once it's cached
00356   if (pcre_options & (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
00357                       PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF)) {
00358     newline_mode = (pcre_options &
00359                     (PCRE_NEWLINE_CRLF|PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|
00360                      PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF));
00361   } else {
00362     int newline;
00363     pcre_config(PCRE_CONFIG_NEWLINE, &newline);
00364     if (newline == 10)
00365       newline_mode = PCRE_NEWLINE_LF;
00366     else if (newline == 13)
00367       newline_mode = PCRE_NEWLINE_CR;
00368     else if (newline == 3338)
00369       newline_mode = PCRE_NEWLINE_CRLF;
00370     else if (newline == -1)
00371       newline_mode = PCRE_NEWLINE_ANY;
00372     else if (newline == -2)
00373       newline_mode = PCRE_NEWLINE_ANYCRLF;
00374     else
00375       assert(NULL == "Unexpected return value from pcre_config(NEWLINE)");
00376   }
00377   return newline_mode;
00378 }
00379 
00380 int RE::GlobalReplace(const StringPiece& rewrite,
00381                       string *str) const {
00382   int count = 0;
00383   int vec[kVecSize];
00384   string out;
00385   int start = 0;
00386   int lastend = -1;
00387 
00388   while (start <= static_cast<int>(str->length())) {
00389     int matches = TryMatch(*str, start, UNANCHORED, vec, kVecSize);
00390     if (matches <= 0)
00391       break;
00392     int matchstart = vec[0], matchend = vec[1];
00393     assert(matchstart >= start);
00394     assert(matchend >= matchstart);
00395     if (matchstart == matchend && matchstart == lastend) {
00396       // advance one character if we matched an empty string at the same
00397       // place as the last match occurred
00398       matchend = start + 1;
00399       // If the current char is CR and we're in CRLF mode, skip LF too.
00400       // Note it's better to call pcre_fullinfo() than to examine
00401       // all_options(), since options_ could have changed bewteen
00402       // compile-time and now, but this is simpler and safe enough.
00403       // Modified by PH to add ANY and ANYCRLF.
00404       if (start+1 < static_cast<int>(str->length()) &&
00405           (*str)[start] == '\r' && (*str)[start+1] == '\n' &&
00406           (NewlineMode(options_.all_options()) == PCRE_NEWLINE_CRLF ||
00407            NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANY ||
00408            NewlineMode(options_.all_options()) == PCRE_NEWLINE_ANYCRLF)
00409           ) {
00410         matchend++;
00411       }
00412       // We also need to advance more than one char if we're in utf8 mode.
00413 #ifdef SUPPORT_UTF8
00414       if (options_.utf8()) {
00415         while (matchend < static_cast<int>(str->length()) &&
00416                ((*str)[matchend] & 0xc0) == 0x80)
00417           matchend++;
00418       }
00419 #endif
00420       if (matchend <= static_cast<int>(str->length()))
00421         out.append(*str, start, matchend - start);
00422       start = matchend;
00423     } else {
00424       out.append(*str, start, matchstart - start);
00425       Rewrite(&out, rewrite, *str, vec, matches);
00426       start = matchend;
00427       lastend = matchend;
00428       count++;
00429     }
00430   }
00431 
00432   if (count == 0)
00433     return 0;
00434 
00435   if (start < static_cast<int>(str->length()))
00436     out.append(*str, start, str->length() - start);
00437   swap(out, *str);
00438   return count;
00439 }
00440 
00441 bool RE::Extract(const StringPiece& rewrite,
00442                  const StringPiece& text,
00443                  string *out) const {
00444   int vec[kVecSize];
00445   int matches = TryMatch(text, 0, UNANCHORED, vec, kVecSize);
00446   if (matches == 0)
00447     return false;
00448   out->erase();
00449   return Rewrite(out, rewrite, text, vec, matches);
00450 }
00451 
00452 /*static*/ string RE::QuoteMeta(const StringPiece& unquoted) {
00453   string result;
00454 
00455   // Escape any ascii character not in [A-Za-z_0-9].
00456   //
00457   // Note that it's legal to escape a character even if it has no
00458   // special meaning in a regular expression -- so this function does
00459   // that.  (This also makes it identical to the perl function of the
00460   // same name; see `perldoc -f quotemeta`.)  The one exception is
00461   // escaping NUL: rather than doing backslash + NUL, like perl does,
00462   // we do '\0', because pcre itself doesn't take embedded NUL chars.
00463   for (int ii = 0; ii < unquoted.size(); ++ii) {
00464     // Note that using 'isalnum' here raises the benchmark time from
00465     // 32ns to 58ns:
00466     if (unquoted[ii] == '\0') {
00467       result += "\\0";
00468     } else if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
00469                (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
00470                (unquoted[ii] < '0' || unquoted[ii] > '9') &&
00471                unquoted[ii] != '_' &&
00472                // If this is the part of a UTF8 or Latin1 character, we need
00473                // to copy this byte without escaping.  Experimentally this is
00474                // what works correctly with the regexp library.
00475                !(unquoted[ii] & 128)) {
00476       result += '\\';
00477       result += unquoted[ii];
00478     } else {
00479       result += unquoted[ii];
00480     }
00481   }
00482 
00483   return result;
00484 }
00485 
00486 /***** Actual matching and rewriting code *****/
00487 
00488 int RE::TryMatch(const StringPiece& text,
00489                  int startpos,
00490                  Anchor anchor,
00491                  int *vec,
00492                  int vecsize) const {
00493   pcre* re = (anchor == ANCHOR_BOTH) ? re_full_ : re_partial_;
00494   if (re == NULL) {
00495     //fprintf(stderr, "Matching against invalid re: %s\n", error_->c_str());
00496     return 0;
00497   }
00498 
00499   pcre_extra extra = { 0, 0, 0, 0, 0, 0 };
00500   if (options_.match_limit() > 0) {
00501     extra.flags |= PCRE_EXTRA_MATCH_LIMIT;
00502     extra.match_limit = options_.match_limit();
00503   }
00504   if (options_.match_limit_recursion() > 0) {
00505     extra.flags |= PCRE_EXTRA_MATCH_LIMIT_RECURSION;
00506     extra.match_limit_recursion = options_.match_limit_recursion();
00507   }
00508   int rc = pcre_exec(re,              // The regular expression object
00509                      &extra,
00510                      (text.data() == NULL) ? "" : text.data(),
00511                      text.size(),
00512                      startpos,
00513                      (anchor == UNANCHORED) ? 0 : PCRE_ANCHORED,
00514                      vec,
00515                      vecsize);
00516 
00517   // Handle errors
00518   if (rc == PCRE_ERROR_NOMATCH) {
00519     return 0;
00520   } else if (rc < 0) {
00521     //fprintf(stderr, "Unexpected return code: %d when matching '%s'\n",
00522     //        re, pattern_.c_str());
00523     return 0;
00524   } else if (rc == 0) {
00525     // pcre_exec() returns 0 as a special case when the number of
00526     // capturing subpatterns exceeds the size of the vector.
00527     // When this happens, there is a match and the output vector
00528     // is filled, but we miss out on the positions of the extra subpatterns.
00529     rc = vecsize / 2;
00530   }
00531 
00532   return rc;
00533 }
00534 
00535 bool RE::DoMatchImpl(const StringPiece& text,
00536                      Anchor anchor,
00537                      int* consumed,
00538                      const Arg* const* args,
00539                      int n,
00540                      int* vec,
00541                      int vecsize) const {
00542   assert((1 + n) * 3 <= vecsize);  // results + PCRE workspace
00543   int matches = TryMatch(text, 0, anchor, vec, vecsize);
00544   assert(matches >= 0);  // TryMatch never returns negatives
00545   if (matches == 0)
00546     return false;
00547 
00548   *consumed = vec[1];
00549 
00550   if (n == 0 || args == NULL) {
00551     // We are not interested in results
00552     return true;
00553   }
00554 
00555   if (NumberOfCapturingGroups() < n) {
00556     // RE has fewer capturing groups than number of arg pointers passed in
00557     return false;
00558   }
00559 
00560   // If we got here, we must have matched the whole pattern.
00561   // We do not need (can not do) any more checks on the value of 'matches' here
00562   // -- see the comment for TryMatch.
00563   for (int i = 0; i < n; i++) {
00564     const int start = vec[2*(i+1)];
00565     const int limit = vec[2*(i+1)+1];
00566     if (!args[i]->Parse(text.data() + start, limit-start)) {
00567       // TODO: Should we indicate what the error was?
00568       return false;
00569     }
00570   }
00571 
00572   return true;
00573 }
00574 
00575 bool RE::DoMatch(const StringPiece& text,
00576                  Anchor anchor,
00577                  int* consumed,
00578                  const Arg* const args[],
00579                  int n) const {
00580   assert(n >= 0);
00581   size_t const vecsize = (1 + n) * 3;  // results + PCRE workspace
00582                                        // (as for kVecSize)
00583   int space[21];   // use stack allocation for small vecsize (common case)
00584   int* vec = vecsize <= 21 ? space : new int[vecsize];
00585   bool retval = DoMatchImpl(text, anchor, consumed, args, n, vec, vecsize);
00586   if (vec != space) delete [] vec;
00587   return retval;
00588 }
00589 
00590 bool RE::Rewrite(string *out, const StringPiece &rewrite,
00591                  const StringPiece &text, int *vec, int veclen) const {
00592   for (const char *s = rewrite.data(), *end = s + rewrite.size();
00593        s < end; s++) {
00594     int c = *s;
00595     if (c == '\\') {
00596       c = *++s;
00597       if (isdigit(c)) {
00598         int n = (c - '0');
00599         if (n >= veclen) {
00600           //fprintf(stderr, requested group %d in regexp %.*s\n",
00601           //        n, rewrite.size(), rewrite.data());
00602           return false;
00603         }
00604         int start = vec[2 * n];
00605         if (start >= 0)
00606           out->append(text.data() + start, vec[2 * n + 1] - start);
00607       } else if (c == '\\') {
00608         *out += '\\';
00609       } else {
00610         //fprintf(stderr, "invalid rewrite pattern: %.*s\n",
00611         //        rewrite.size(), rewrite.data());
00612         return false;
00613       }
00614     } else {
00615       *out += c;
00616     }
00617   }
00618   return true;
00619 }
00620 
00621 // Return the number of capturing subpatterns, or -1 if the
00622 // regexp wasn't valid on construction.
00623 int RE::NumberOfCapturingGroups() const {
00624   if (re_partial_ == NULL) return -1;
00625 
00626   int result;
00627   int pcre_retval = pcre_fullinfo(re_partial_,  // The regular expression object
00628                                   NULL,         // We did not study the pattern
00629                                   PCRE_INFO_CAPTURECOUNT,
00630                                   &result);
00631   assert(pcre_retval == 0);
00632   return result;
00633 }
00634 
00635 /***** Parsers for various types *****/
00636 
00637 bool Arg::parse_null(const char* str, int n, void* dest) {
00638   // We fail if somebody asked us to store into a non-NULL void* pointer
00639   return (dest == NULL);
00640 }
00641 
00642 bool Arg::parse_string(const char* str, int n, void* dest) {
00643   if (dest == NULL) return true;
00644   reinterpret_cast<string*>(dest)->assign(str, n);
00645   return true;
00646 }
00647 
00648 bool Arg::parse_stringpiece(const char* str, int n, void* dest) {
00649   if (dest == NULL) return true;
00650   reinterpret_cast<StringPiece*>(dest)->set(str, n);
00651   return true;
00652 }
00653 
00654 bool Arg::parse_char(const char* str, int n, void* dest) {
00655   if (n != 1) return false;
00656   if (dest == NULL) return true;
00657   *(reinterpret_cast<char*>(dest)) = str[0];
00658   return true;
00659 }
00660 
00661 bool Arg::parse_uchar(const char* str, int n, void* dest) {
00662   if (n != 1) return false;
00663   if (dest == NULL) return true;
00664   *(reinterpret_cast<unsigned char*>(dest)) = str[0];
00665   return true;
00666 }
00667 
// Largest number spec that we are willing to parse
static const int kMaxNumberLength = 32;

// REQUIRES "buf" must have length at least kMaxNumberLength+1
// REQUIRES "n > 0"
// Produces a pointer from which the strtoxxx() routines will see exactly
// the "n" characters at "str" and nothing after them.
// Returns one of:
//      a. "buf", holding a NUL-terminated copy of the n characters
//         (whenever n <= kMaxNumberLength)
//      b. "str", when the text is too long to copy but the character
//         following it cannot be mistaken for a further digit
//      c. "" if the input was invalid and has no hope of being parsed
static const char* TerminateNumber(char* buf, const char* str, int n) {
  if (n < 0) return "";   // Defensive: a negative length is never valid

  // <ctype.h> functions require a value representable as unsigned char.
  if ((n > 0) && isspace(static_cast<unsigned char>(*str))) {
    // We are less forgiving than the strtoxxx() routines and do not
    // allow leading spaces.
    return "";
  }

  if (n <= kMaxNumberLength) {
    // Always parse from a private, terminated copy.  This avoids looking
    // at str[n], which lies outside the captured text, and stops the
    // strtoxxx() routines from consuming characters after the capture
    // (e.g. a captured "0" that is followed by "x1F" in the subject).
    memcpy(buf, str, n);
    buf[n] = '\0';
    return buf;
  }

  // Too long to copy.  See if the character right after the input text
  // may potentially look like a digit; if so the text cannot be parsed
  // in place either.
  const unsigned char next = static_cast<unsigned char>(str[n]);
  if (isdigit(next) ||
      ((next >= 'a') && (next <= 'f')) ||
      ((next >= 'A') && (next <= 'F'))) {
    return "";  // Input too big to be a valid number
  }

  // We can parse right out of the supplied string, so return it.
  return str;
}
00699 
00700 bool Arg::parse_long_radix(const char* str,
00701                            int n,
00702                            void* dest,
00703                            int radix) {
00704   if (n == 0) return false;
00705   char buf[kMaxNumberLength+1];
00706   str = TerminateNumber(buf, str, n);
00707   char* end;
00708   errno = 0;
00709   long r = strtol(str, &end, radix);
00710   if (end != str + n) return false;   // Leftover junk
00711   if (errno) return false;
00712   if (dest == NULL) return true;
00713   *(reinterpret_cast<long*>(dest)) = r;
00714   return true;
00715 }
00716 
00717 bool Arg::parse_ulong_radix(const char* str,
00718                             int n,
00719                             void* dest,
00720                             int radix) {
00721   if (n == 0) return false;
00722   char buf[kMaxNumberLength+1];
00723   str = TerminateNumber(buf, str, n);
00724   if (str[0] == '-') return false;    // strtoul() on a negative number?!
00725   char* end;
00726   errno = 0;
00727   unsigned long r = strtoul(str, &end, radix);
00728   if (end != str + n) return false;   // Leftover junk
00729   if (errno) return false;
00730   if (dest == NULL) return true;
00731   *(reinterpret_cast<unsigned long*>(dest)) = r;
00732   return true;
00733 }
00734 
00735 bool Arg::parse_short_radix(const char* str,
00736                             int n,
00737                             void* dest,
00738                             int radix) {
00739   long r;
00740   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
00741   if (r < SHRT_MIN || r > SHRT_MAX) return false;       // Out of range
00742   if (dest == NULL) return true;
00743   *(reinterpret_cast<short*>(dest)) = static_cast<short>(r);
00744   return true;
00745 }
00746 
00747 bool Arg::parse_ushort_radix(const char* str,
00748                              int n,
00749                              void* dest,
00750                              int radix) {
00751   unsigned long r;
00752   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
00753   if (r > USHRT_MAX) return false;                      // Out of range
00754   if (dest == NULL) return true;
00755   *(reinterpret_cast<unsigned short*>(dest)) = static_cast<unsigned short>(r);
00756   return true;
00757 }
00758 
00759 bool Arg::parse_int_radix(const char* str,
00760                           int n,
00761                           void* dest,
00762                           int radix) {
00763   long r;
00764   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
00765   if (r < INT_MIN || r > INT_MAX) return false;         // Out of range
00766   if (dest == NULL) return true;
00767   *(reinterpret_cast<int*>(dest)) = r;
00768   return true;
00769 }
00770 
00771 bool Arg::parse_uint_radix(const char* str,
00772                            int n,
00773                            void* dest,
00774                            int radix) {
00775   unsigned long r;
00776   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
00777   if (r > UINT_MAX) return false;                       // Out of range
00778   if (dest == NULL) return true;
00779   *(reinterpret_cast<unsigned int*>(dest)) = r;
00780   return true;
00781 }
00782 
00783 bool Arg::parse_longlong_radix(const char* str,
00784                                int n,
00785                                void* dest,
00786                                int radix) {
00787 #ifndef HAVE_LONG_LONG
00788   return false;
00789 #else
00790   if (n == 0) return false;
00791   char buf[kMaxNumberLength+1];
00792   str = TerminateNumber(buf, str, n);
00793   char* end;
00794   errno = 0;
00795 #if defined HAVE_STRTOQ
00796   long long r = strtoq(str, &end, radix);
00797 #elif defined HAVE_STRTOLL
00798   long long r = strtoll(str, &end, radix);
00799 #elif defined HAVE__STRTOI64
00800   long long r = _strtoi64(str, &end, radix);
00801 #else
00802 #error parse_longlong_radix: cannot convert input to a long-long
00803 #endif
00804   if (end != str + n) return false;   // Leftover junk
00805   if (errno) return false;
00806   if (dest == NULL) return true;
00807   *(reinterpret_cast<long long*>(dest)) = r;
00808   return true;
00809 #endif   /* HAVE_LONG_LONG */
00810 }
00811 
00812 bool Arg::parse_ulonglong_radix(const char* str,
00813                                 int n,
00814                                 void* dest,
00815                                 int radix) {
00816 #ifndef HAVE_UNSIGNED_LONG_LONG
00817   return false;
00818 #else
00819   if (n == 0) return false;
00820   char buf[kMaxNumberLength+1];
00821   str = TerminateNumber(buf, str, n);
00822   if (str[0] == '-') return false;    // strtoull() on a negative number?!
00823   char* end;
00824   errno = 0;
00825 #if defined HAVE_STRTOQ
00826   unsigned long long r = strtouq(str, &end, radix);
00827 #elif defined HAVE_STRTOLL
00828   unsigned long long r = strtoull(str, &end, radix);
00829 #elif defined HAVE__STRTOI64
00830   unsigned long long r = _strtoui64(str, &end, radix);
00831 #else
00832 #error parse_ulonglong_radix: cannot convert input to a long-long
00833 #endif
00834   if (end != str + n) return false;   // Leftover junk
00835   if (errno) return false;
00836   if (dest == NULL) return true;
00837   *(reinterpret_cast<unsigned long long*>(dest)) = r;
00838   return true;
00839 #endif   /* HAVE_UNSIGNED_LONG_LONG */
00840 }
00841 
00842 bool Arg::parse_double(const char* str, int n, void* dest) {
00843   if (n == 0) return false;
00844   static const int kMaxLength = 200;
00845   char buf[kMaxLength];
00846   if (n >= kMaxLength) return false;
00847   memcpy(buf, str, n);
00848   buf[n] = '\0';
00849   errno = 0;
00850   char* end;
00851   double r = strtod(buf, &end);
00852   if (end != buf + n) return false;   // Leftover junk
00853   if (errno) return false;
00854   if (dest == NULL) return true;
00855   *(reinterpret_cast<double*>(dest)) = r;
00856   return true;
00857 }
00858 
00859 bool Arg::parse_float(const char* str, int n, void* dest) {
00860   double r;
00861   if (!parse_double(str, n, &r)) return false;
00862   if (dest == NULL) return true;
00863   *(reinterpret_cast<float*>(dest)) = static_cast<float>(r);
00864   return true;
00865 }
00866 
00867 
00868 #define DEFINE_INTEGER_PARSERS(name)                                    \
00869   bool Arg::parse_##name(const char* str, int n, void* dest) {          \
00870     return parse_##name##_radix(str, n, dest, 10);                      \
00871   }                                                                     \
00872   bool Arg::parse_##name##_hex(const char* str, int n, void* dest) {    \
00873     return parse_##name##_radix(str, n, dest, 16);                      \
00874   }                                                                     \
00875   bool Arg::parse_##name##_octal(const char* str, int n, void* dest) {  \
00876     return parse_##name##_radix(str, n, dest, 8);                       \
00877   }                                                                     \
00878   bool Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
00879     return parse_##name##_radix(str, n, dest, 0);                       \
00880   }
00881 
00882 DEFINE_INTEGER_PARSERS(short)      /*                                   */
00883 DEFINE_INTEGER_PARSERS(ushort)     /*                                   */
00884 DEFINE_INTEGER_PARSERS(int)        /* Don't use semicolons after these  */
00885 DEFINE_INTEGER_PARSERS(uint)       /* statements because they can cause */
00886 DEFINE_INTEGER_PARSERS(long)       /* compiler warnings if the checking */
00887 DEFINE_INTEGER_PARSERS(ulong)      /* level is turned up high enough.   */
00888 DEFINE_INTEGER_PARSERS(longlong)   /*                                   */
00889 DEFINE_INTEGER_PARSERS(ulonglong)  /*                                   */
00890 
00891 #undef DEFINE_INTEGER_PARSERS
00892 
00893 }   // namespace pcrecpp

Generated on Tue Jul 5 14:11:58 2011 for ROOT_528-00b_version by  doxygen 1.5.1