pcreposix.c

Go to the documentation of this file.
00001 /*************************************************
00002 *      Perl-Compatible Regular Expressions       *
00003 *************************************************/
00004 
00005 /* PCRE is a library of functions to support regular expressions whose syntax
00006 and semantics are as close as possible to those of the Perl 5 language.
00007 
00008                        Written by Philip Hazel
00009            Copyright (c) 1997-2008 University of Cambridge
00010 
00011 -----------------------------------------------------------------------------
00012 Redistribution and use in source and binary forms, with or without
00013 modification, are permitted provided that the following conditions are met:
00014 
00015     * Redistributions of source code must retain the above copyright notice,
00016       this list of conditions and the following disclaimer.
00017 
00018     * Redistributions in binary form must reproduce the above copyright
00019       notice, this list of conditions and the following disclaimer in the
00020       documentation and/or other materials provided with the distribution.
00021 
00022     * Neither the name of the University of Cambridge nor the names of its
00023       contributors may be used to endorse or promote products derived from
00024       this software without specific prior written permission.
00025 
00026 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00027 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00028 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00029 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
00030 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00031 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00032 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00033 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00034 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00035 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00036 POSSIBILITY OF SUCH DAMAGE.
00037 -----------------------------------------------------------------------------
00038 */
00039 
00040 
00041 /* This module is a wrapper that provides a POSIX API to the underlying PCRE
00042 functions. */
00043 
00044 
00045 #ifdef HAVE_CONFIG_H
00046 #include "config.h"
00047 #endif
00048 
00049 
00050 /* Ensure that the PCREPOSIX_EXP_xxx macros are set appropriately for
00051 compiling these functions. This must come before including pcreposix.h, where
00052 they are set for an application (using these functions) if they have not
00053 previously been set. */
00054 
00055 #if defined(_WIN32) && !defined(PCRE_STATIC)
00056 #  define PCREPOSIX_EXP_DECL extern __declspec(dllexport)
00057 #  define PCREPOSIX_EXP_DEFN __declspec(dllexport)
00058 #endif
00059 
00060 #include "pcre.h"
00061 #include "pcre_internal.h"
00062 #include "pcreposix.h"
00063 
00064 
00065 /* Table to translate PCRE compile time error codes into POSIX error codes. */
00066 
00067 static const int eint[] = {
00068   0,           /* no error */
00069   REG_EESCAPE, /* \ at end of pattern */
00070   REG_EESCAPE, /* \c at end of pattern */
00071   REG_EESCAPE, /* unrecognized character follows \ */
00072   REG_BADBR,   /* numbers out of order in {} quantifier */
00073   REG_BADBR,   /* number too big in {} quantifier */
00074   REG_EBRACK,  /* missing terminating ] for character class */
00075   REG_ECTYPE,  /* invalid escape sequence in character class */
00076   REG_ERANGE,  /* range out of order in character class */
00077   REG_BADRPT,  /* nothing to repeat */
00078   REG_BADRPT,  /* operand of unlimited repeat could match the empty string */
00079   REG_ASSERT,  /* internal error: unexpected repeat */
00080   REG_BADPAT,  /* unrecognized character after (? */
00081   REG_BADPAT,  /* POSIX named classes are supported only within a class */
00082   REG_EPAREN,  /* missing ) */
00083   REG_ESUBREG, /* reference to non-existent subpattern */
00084   REG_INVARG,  /* erroffset passed as NULL */
00085   REG_INVARG,  /* unknown option bit(s) set */
00086   REG_EPAREN,  /* missing ) after comment */
00087   REG_ESIZE,   /* parentheses nested too deeply */
00088   REG_ESIZE,   /* regular expression too large */
00089   REG_ESPACE,  /* failed to get memory */
00090   REG_EPAREN,  /* unmatched brackets */
00091   REG_ASSERT,  /* internal error: code overflow */
00092   REG_BADPAT,  /* unrecognized character after (?< */
00093   REG_BADPAT,  /* lookbehind assertion is not fixed length */
00094   REG_BADPAT,  /* malformed number or name after (?( */
00095   REG_BADPAT,  /* conditional group contains more than two branches */
00096   REG_BADPAT,  /* assertion expected after (?( */
00097   REG_BADPAT,  /* (?R or (?[+-]digits must be followed by ) */
00098   REG_ECTYPE,  /* unknown POSIX class name */
00099   REG_BADPAT,  /* POSIX collating elements are not supported */
00100   REG_INVARG,  /* this version of PCRE is not compiled with PCRE_UTF8 support */
00101   REG_BADPAT,  /* spare error */
00102   REG_BADPAT,  /* character value in \x{...} sequence is too large */
00103   REG_BADPAT,  /* invalid condition (?(0) */
00104   REG_BADPAT,  /* \C not allowed in lookbehind assertion */
00105   REG_EESCAPE, /* PCRE does not support \L, \l, \N, \U, or \u */
00106   REG_BADPAT,  /* number after (?C is > 255 */
00107   REG_BADPAT,  /* closing ) for (?C expected */
00108   REG_BADPAT,  /* recursive call could loop indefinitely */
00109   REG_BADPAT,  /* unrecognized character after (?P */
00110   REG_BADPAT,  /* syntax error in subpattern name (missing terminator) */
00111   REG_BADPAT,  /* two named subpatterns have the same name */
00112   REG_BADPAT,  /* invalid UTF-8 string */
00113   REG_BADPAT,  /* support for \P, \p, and \X has not been compiled */
00114   REG_BADPAT,  /* malformed \P or \p sequence */
00115   REG_BADPAT,  /* unknown property name after \P or \p */
00116   REG_BADPAT,  /* subpattern name is too long (maximum 32 characters) */
00117   REG_BADPAT,  /* too many named subpatterns (maximum 10,000) */
00118   REG_BADPAT,  /* repeated subpattern is too long */
00119   REG_BADPAT,  /* octal value is greater than \377 (not in UTF-8 mode) */
00120   REG_BADPAT,  /* internal error: overran compiling workspace */
00121   REG_BADPAT,  /* internal error: previously-checked referenced subpattern not found */
00122   REG_BADPAT,  /* DEFINE group contains more than one branch */
00123   REG_BADPAT,  /* repeating a DEFINE group is not allowed */
00124   REG_INVARG,  /* inconsistent NEWLINE options */
00125   REG_BADPAT,  /* \g is not followed followed by an (optionally braced) non-zero number */
00126   REG_BADPAT,  /* (?+ or (?- must be followed by a non-zero number */
00127   REG_BADPAT,  /* number is too big */
00128   REG_BADPAT,  /* subpattern name expected */
00129   REG_BADPAT,  /* digit expected after (?+ */
00130   REG_BADPAT   /* ] is an invalid data character in JavaScript compatibility mode */
00131 };
00132 
/* Message texts for the POSIX error codes. regerror() indexes this table
directly with the error code, so the entries must stay in this order; the
array position of each entry is shown on the left, followed by the code that
it describes. */

static const char *const pstring[] = {
  /*  0  (dummy)      */  "",
  /*  1  REG_ASSERT   */  "internal error",
  /*  2  REG_BADBR    */  "invalid repeat counts in {}",
  /*  3  REG_BADPAT   */  "pattern error",
  /*  4  REG_BADRPT   */  "? * + invalid",
  /*  5  REG_EBRACE   */  "unbalanced {}",
  /*  6  REG_EBRACK   */  "unbalanced []",
  /*  7  REG_ECOLLATE */  "collation error - not relevant",
  /*  8  REG_ECTYPE   */  "bad class",
  /*  9  REG_EESCAPE  */  "bad escape sequence",
  /* 10  REG_EMPTY    */  "empty expression",
  /* 11  REG_EPAREN   */  "unbalanced ()",
  /* 12  REG_ERANGE   */  "bad range inside []",
  /* 13  REG_ESIZE    */  "expression too big",
  /* 14  REG_ESPACE   */  "failed to get memory",
  /* 15  REG_ESUBREG  */  "bad back reference",
  /* 16  REG_INVARG   */  "bad argument",
  /* 17  REG_NOMATCH  */  "match failed"
};
00155 
00156 
00157 
00158 
00159 /*************************************************
00160 *          Translate error code to string        *
00161 *************************************************/
00162 
00163 PCREPOSIX_EXP_DEFN size_t PCRE_CALL_CONVENTION
00164 regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
00165 {
00166 const char *message, *addmessage;
00167 size_t length, addlength;
00168 
00169 message = (errcode >= (int)(sizeof(pstring)/sizeof(char *)))?
00170   "unknown error code" : pstring[errcode];
00171 length = strlen(message) + 1;
00172 
00173 addmessage = " at offset ";
00174 addlength = (preg != NULL && (int)preg->re_erroffset != -1)?
00175   strlen(addmessage) + 6 : 0;
00176 
00177 if (errbuf_size > 0)
00178   {
00179   if (addlength > 0 && errbuf_size >= length + addlength)
00180     sprintf(errbuf, "%s%s%-6d", message, addmessage, (int)preg->re_erroffset);
00181   else
00182     {
00183     strncpy(errbuf, message, errbuf_size - 1);
00184     errbuf[errbuf_size-1] = 0;
00185     }
00186   }
00187 
00188 return length + addlength;
00189 }
00190 
00191 
00192 
00193 
00194 /*************************************************
00195 *           Free store held by a regex           *
00196 *************************************************/
00197 
00198 PCREPOSIX_EXP_DEFN void PCRE_CALL_CONVENTION
00199 regfree(regex_t *preg)
00200 {
00201 (pcre_free)(preg->re_pcre);
00202 }
00203 
00204 
00205 
00206 
00207 /*************************************************
00208 *            Compile a regular expression        *
00209 *************************************************/
00210 
00211 /*
00212 Arguments:
00213   preg        points to a structure for recording the compiled expression
00214   pattern     the pattern to compile
00215   cflags      compilation flags
00216 
00217 Returns:      0 on success
00218               various non-zero codes on failure
00219 */
00220 
00221 PCREPOSIX_EXP_DEFN int PCRE_CALL_CONVENTION
00222 regcomp(regex_t *preg, const char *pattern, int cflags)
00223 {
00224 const char *errorptr;
00225 int erroffset;
00226 int errorcode;
00227 int options = 0;
00228 
00229 if ((cflags & REG_ICASE) != 0)   options |= PCRE_CASELESS;
00230 if ((cflags & REG_NEWLINE) != 0) options |= PCRE_MULTILINE;
00231 if ((cflags & REG_DOTALL) != 0)  options |= PCRE_DOTALL;
00232 if ((cflags & REG_NOSUB) != 0)   options |= PCRE_NO_AUTO_CAPTURE;
00233 if ((cflags & REG_UTF8) != 0)    options |= PCRE_UTF8;
00234 
00235 preg->re_pcre = pcre_compile2(pattern, options, &errorcode, &errorptr,
00236   &erroffset, NULL);
00237 preg->re_erroffset = erroffset;
00238 
00239 if (preg->re_pcre == NULL) return eint[errorcode];
00240 
00241 preg->re_nsub = pcre_info((const pcre *)preg->re_pcre, NULL, NULL);
00242 return 0;
00243 }
00244 
00245 
00246 
00247 
00248 /*************************************************
00249 *              Match a regular expression        *
00250 *************************************************/
00251 
00252 /* Unfortunately, PCRE requires 3 ints of working space for each captured
00253 substring, so we have to get and release working store instead of just using
00254 the POSIX structures as was done in earlier releases when PCRE needed only 2
00255 ints. However, if the number of possible capturing brackets is small, use a
00256 block of store on the stack, to reduce the use of malloc/free. The threshold is
00257 in a macro that can be changed at configure time.
00258 
00259 If REG_NOSUB was specified at compile time, the PCRE_NO_AUTO_CAPTURE flag will
00260 be set. When this is the case, the nmatch and pmatch arguments are ignored, and
00261 the only result is yes/no/error. */
00262 
00263 PCREPOSIX_EXP_DEFN int PCRE_CALL_CONVENTION
00264 regexec(const regex_t *preg, const char *string, size_t nmatch,
00265   regmatch_t pmatch[], int eflags)
00266 {
00267 int rc, so, eo;
00268 int options = 0;
00269 int *ovector = NULL;
00270 int small_ovector[POSIX_MALLOC_THRESHOLD * 3];
00271 BOOL allocated_ovector = FALSE;
00272 BOOL nosub =
00273   (((const pcre *)preg->re_pcre)->options & PCRE_NO_AUTO_CAPTURE) != 0;
00274 
00275 if ((eflags & REG_NOTBOL) != 0) options |= PCRE_NOTBOL;
00276 if ((eflags & REG_NOTEOL) != 0) options |= PCRE_NOTEOL;
00277 
00278 ((regex_t *)preg)->re_erroffset = (size_t)(-1);  /* Only has meaning after compile */
00279 
00280 /* When no string data is being returned, ensure that nmatch is zero.
00281 Otherwise, ensure the vector for holding the return data is large enough. */
00282 
00283 if (nosub) nmatch = 0;
00284 
00285 else if (nmatch > 0)
00286   {
00287   if (nmatch <= POSIX_MALLOC_THRESHOLD)
00288     {
00289     ovector = &(small_ovector[0]);
00290     }
00291   else
00292     {
00293     if (nmatch > INT_MAX/(sizeof(int) * 3)) return REG_ESPACE;
00294     ovector = (int *)malloc(sizeof(int) * nmatch * 3);
00295     if (ovector == NULL) return REG_ESPACE;
00296     allocated_ovector = TRUE;
00297     }
00298   }
00299 
00300 /* REG_STARTEND is a BSD extension, to allow for non-NUL-terminated strings.
00301 The man page from OS X says "REG_STARTEND affects only the location of the
00302 string, not how it is matched". That is why the "so" value is used to bump the
00303 start location rather than being passed as a PCRE "starting offset". */
00304 
00305 if ((eflags & REG_STARTEND) != 0)
00306   {
00307   so = pmatch[0].rm_so;
00308   eo = pmatch[0].rm_eo;
00309   }
00310 else
00311   {
00312   so = 0;
00313   eo = strlen(string);
00314   }
00315 
00316 rc = pcre_exec((const pcre *)preg->re_pcre, NULL, string + so, (eo - so),
00317   0, options, ovector, nmatch * 3);
00318 
00319 if (rc == 0) rc = nmatch;    /* All captured slots were filled in */
00320 
00321 if (rc >= 0)
00322   {
00323   size_t i;
00324   if (!nosub)
00325     {
00326     for (i = 0; i < (size_t)rc; i++)
00327       {
00328       pmatch[i].rm_so = ovector[i*2];
00329       pmatch[i].rm_eo = ovector[i*2+1];
00330       }
00331     if (allocated_ovector) free(ovector);
00332     for (; i < nmatch; i++) pmatch[i].rm_so = pmatch[i].rm_eo = -1;
00333     }
00334   return 0;
00335   }
00336 
00337 else
00338   {
00339   if (allocated_ovector) free(ovector);
00340   switch(rc)
00341     {
00342     case PCRE_ERROR_NOMATCH: return REG_NOMATCH;
00343     case PCRE_ERROR_NULL: return REG_INVARG;
00344     case PCRE_ERROR_BADOPTION: return REG_INVARG;
00345     case PCRE_ERROR_BADMAGIC: return REG_INVARG;
00346     case PCRE_ERROR_UNKNOWN_NODE: return REG_ASSERT;
00347     case PCRE_ERROR_NOMEMORY: return REG_ESPACE;
00348     case PCRE_ERROR_MATCHLIMIT: return REG_ESPACE;
00349     case PCRE_ERROR_BADUTF8: return REG_INVARG;
00350     case PCRE_ERROR_BADUTF8_OFFSET: return REG_INVARG;
00351     default: return REG_ASSERT;
00352     }
00353   }
00354 }
00355 
00356 /* End of pcreposix.c */

Generated on Tue Jul 5 14:11:58 2011 for ROOT_528-00b_version by  doxygen 1.5.1