00001 /************************************************* 00002 * Perl-Compatible Regular Expressions * 00003 *************************************************/ 00004 00005 /* PCRE is a library of functions to support regular expressions whose syntax 00006 and semantics are as close as possible to those of the Perl 5 language. 00007 00008 Written by Philip Hazel 00009 Copyright (c) 1997-2008 University of Cambridge 00010 00011 ----------------------------------------------------------------------------- 00012 Redistribution and use in source and binary forms, with or without 00013 modification, are permitted provided that the following conditions are met: 00014 00015 * Redistributions of source code must retain the above copyright notice, 00016 this list of conditions and the following disclaimer. 00017 00018 * Redistributions in binary form must reproduce the above copyright 00019 notice, this list of conditions and the following disclaimer in the 00020 documentation and/or other materials provided with the distribution. 00021 00022 * Neither the name of the University of Cambridge nor the names of its 00023 contributors may be used to endorse or promote products derived from 00024 this software without specific prior written permission. 00025 00026 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 00027 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00028 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 00029 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 00030 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 00031 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 00032 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 00033 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 00034 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 00035 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00036 POSSIBILITY OF SUCH DAMAGE. 00037 ----------------------------------------------------------------------------- 00038 */ 00039 00040 00041 /* This module is a wrapper that provides a POSIX API to the underlying PCRE 00042 functions. */ 00043 00044 00045 #ifdef HAVE_CONFIG_H 00046 #include "config.h" 00047 #endif 00048 00049 00050 /* Ensure that the PCREPOSIX_EXP_xxx macros are set appropriately for 00051 compiling these functions. This must come before including pcreposix.h, where 00052 they are set for an application (using these functions) if they have not 00053 previously been set. */ 00054 00055 #if defined(_WIN32) && !defined(PCRE_STATIC) 00056 # define PCREPOSIX_EXP_DECL extern __declspec(dllexport) 00057 # define PCREPOSIX_EXP_DEFN __declspec(dllexport) 00058 #endif 00059 00060 #include "pcre.h" 00061 #include "pcre_internal.h" 00062 #include "pcreposix.h" 00063 00064 00065 /* Table to translate PCRE compile time error codes into POSIX error codes. */ 00066 00067 static const int eint[] = { 00068 0, /* no error */ 00069 REG_EESCAPE, /* \ at end of pattern */ 00070 REG_EESCAPE, /* \c at end of pattern */ 00071 REG_EESCAPE, /* unrecognized character follows \ */ 00072 REG_BADBR, /* numbers out of order in {} quantifier */ 00073 REG_BADBR, /* number too big in {} quantifier */ 00074 REG_EBRACK, /* missing terminating ] for character class */ 00075 REG_ECTYPE, /* invalid escape sequence in character class */ 00076 REG_ERANGE, /* range out of order in character class */ 00077 REG_BADRPT, /* nothing to repeat */ 00078 REG_BADRPT, /* operand of unlimited repeat could match the empty string */ 00079 REG_ASSERT, /* internal error: unexpected repeat */ 00080 REG_BADPAT, /* unrecognized character after (? */ 00081 REG_BADPAT, /* POSIX named classes are supported only within a class */ 00082 REG_EPAREN, /* missing ) */ 00083 REG_ESUBREG, /* reference to non-existent subpattern */ 00084 REG_INVARG, /* erroffset passed as NULL */ 00085 REG_INVARG, /* unknown option bit(s) set */ 00086 REG_EPAREN, /* missing ) after comment */ 00087 REG_ESIZE, /* parentheses nested too deeply */ 00088 REG_ESIZE, /* regular expression too large */ 00089 REG_ESPACE, /* failed to get memory */ 00090 REG_EPAREN, /* unmatched brackets */ 00091 REG_ASSERT, /* internal error: code overflow */ 00092 REG_BADPAT, /* unrecognized character after (?< */ 00093 REG_BADPAT, /* lookbehind assertion is not fixed length */ 00094 REG_BADPAT, /* malformed number or name after (?( */ 00095 REG_BADPAT, /* conditional group contains more than two branches */ 00096 REG_BADPAT, /* assertion expected after (?( */ 00097 REG_BADPAT, /* (?R or (?[+-]digits must be followed by ) */ 00098 REG_ECTYPE, /* unknown POSIX class name */ 00099 REG_BADPAT, /* POSIX collating elements are not supported */ 00100 REG_INVARG, /* this version of PCRE is not compiled with PCRE_UTF8 support */ 00101 REG_BADPAT, /* spare error */ 00102 REG_BADPAT, /* character value in \x{...} sequence is too large */ 00103 REG_BADPAT, /* invalid condition (?(0) */ 00104 REG_BADPAT, /* \C not allowed in lookbehind assertion */ 00105 REG_EESCAPE, /* PCRE does not support \L, \l, \N, \U, or \u */ 00106 REG_BADPAT, /* number after (?C is > 255 */ 00107 REG_BADPAT, /* closing ) for (?C expected */ 00108 REG_BADPAT, /* recursive call could loop indefinitely */ 00109 REG_BADPAT, /* unrecognized character after (?P */ 00110 REG_BADPAT, /* syntax error in subpattern name (missing terminator) */ 00111 REG_BADPAT, /* two named subpatterns have the same name */ 00112 REG_BADPAT, /* invalid UTF-8 string */ 00113 REG_BADPAT, /* support for \P, \p, and \X has not been compiled */ 00114 REG_BADPAT, /* malformed \P or \p sequence */ 00115 REG_BADPAT, /* unknown property name after \P or \p */ 00116 REG_BADPAT, /* subpattern name is too long (maximum 32 characters) */ 00117 REG_BADPAT, /* too many named subpatterns (maximum 10,000) */ 00118 REG_BADPAT, /* repeated subpattern is too long */ 00119 REG_BADPAT, /* octal value is greater than \377 (not in UTF-8 mode) */ 00120 REG_BADPAT, /* internal error: overran compiling workspace */ 00121 REG_BADPAT, /* internal error: previously-checked referenced subpattern not found */ 00122 REG_BADPAT, /* DEFINE group contains more than one branch */ 00123 REG_BADPAT, /* repeating a DEFINE group is not allowed */ 00124 REG_INVARG, /* inconsistent NEWLINE options */ 00125 REG_BADPAT, /* \g is not followed followed by an (optionally braced) non-zero number */ 00126 REG_BADPAT, /* (?+ or (?- must be followed by a non-zero number */ 00127 REG_BADPAT, /* number is too big */ 00128 REG_BADPAT, /* subpattern name expected */ 00129 REG_BADPAT, /* digit expected after (?+ */ 00130 REG_BADPAT /* ] is an invalid data character in JavaScript compatibility mode */ 00131 }; 00132 00133 /* Table of texts corresponding to POSIX error codes */ 00134 00135 static const char *const pstring[] = { 00136 "", /* Dummy for value 0 */ 00137 "internal error", /* REG_ASSERT */ 00138 "invalid repeat counts in {}", /* BADBR */ 00139 "pattern error", /* BADPAT */ 00140 "? * + invalid", /* BADRPT */ 00141 "unbalanced {}", /* EBRACE */ 00142 "unbalanced []", /* EBRACK */ 00143 "collation error - not relevant", /* ECOLLATE */ 00144 "bad class", /* ECTYPE */ 00145 "bad escape sequence", /* EESCAPE */ 00146 "empty expression", /* EMPTY */ 00147 "unbalanced ()", /* EPAREN */ 00148 "bad range inside []", /* ERANGE */ 00149 "expression too big", /* ESIZE */ 00150 "failed to get memory", /* ESPACE */ 00151 "bad back reference", /* ESUBREG */ 00152 "bad argument", /* INVARG */ 00153 "match failed" /* NOMATCH */ 00154 }; 00155 00156 00157 00158 00159 /************************************************* 00160 * Translate error code to string * 00161 *************************************************/ 00162 00163 PCREPOSIX_EXP_DEFN size_t PCRE_CALL_CONVENTION 00164 regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size) 00165 { 00166 const char *message, *addmessage; 00167 size_t length, addlength; 00168 00169 message = (errcode >= (int)(sizeof(pstring)/sizeof(char *)))? 00170 "unknown error code" : pstring[errcode]; 00171 length = strlen(message) + 1; 00172 00173 addmessage = " at offset "; 00174 addlength = (preg != NULL && (int)preg->re_erroffset != -1)? 00175 strlen(addmessage) + 6 : 0; 00176 00177 if (errbuf_size > 0) 00178 { 00179 if (addlength > 0 && errbuf_size >= length + addlength) 00180 sprintf(errbuf, "%s%s%-6d", message, addmessage, (int)preg->re_erroffset); 00181 else 00182 { 00183 strncpy(errbuf, message, errbuf_size - 1); 00184 errbuf[errbuf_size-1] = 0; 00185 } 00186 } 00187 00188 return length + addlength; 00189 } 00190 00191 00192 00193 00194 /************************************************* 00195 * Free store held by a regex * 00196 *************************************************/ 00197 00198 PCREPOSIX_EXP_DEFN void PCRE_CALL_CONVENTION 00199 regfree(regex_t *preg) 00200 { 00201 (pcre_free)(preg->re_pcre); 00202 } 00203 00204 00205 00206 00207 /************************************************* 00208 * Compile a regular expression * 00209 *************************************************/ 00210 00211 /* 00212 Arguments: 00213 preg points to a structure for recording the compiled expression 00214 pattern the pattern to compile 00215 cflags compilation flags 00216 00217 Returns: 0 on success 00218 various non-zero codes on failure 00219 */ 00220 00221 PCREPOSIX_EXP_DEFN int PCRE_CALL_CONVENTION 00222 regcomp(regex_t *preg, const char *pattern, int cflags) 00223 { 00224 const char *errorptr; 00225 int erroffset; 00226 int errorcode; 00227 int options = 0; 00228 00229 if ((cflags & REG_ICASE) != 0) options |= PCRE_CASELESS; 00230 if ((cflags & REG_NEWLINE) != 0) options |= PCRE_MULTILINE; 00231 if ((cflags & REG_DOTALL) != 0) options |= PCRE_DOTALL; 00232 if ((cflags & REG_NOSUB) != 0) options |= PCRE_NO_AUTO_CAPTURE; 00233 if ((cflags & REG_UTF8) != 0) options |= PCRE_UTF8; 00234 00235 preg->re_pcre = pcre_compile2(pattern, options, &errorcode, &errorptr, 00236 &erroffset, NULL); 00237 preg->re_erroffset = erroffset; 00238 00239 if (preg->re_pcre == NULL) return eint[errorcode]; 00240 00241 preg->re_nsub = pcre_info((const pcre *)preg->re_pcre, NULL, NULL); 00242 return 0; 00243 } 00244 00245 00246 00247 00248 /************************************************* 00249 * Match a regular expression * 00250 *************************************************/ 00251 00252 /* Unfortunately, PCRE requires 3 ints of working space for each captured 00253 substring, so we have to get and release working store instead of just using 00254 the POSIX structures as was done in earlier releases when PCRE needed only 2 00255 ints. However, if the number of possible capturing brackets is small, use a 00256 block of store on the stack, to reduce the use of malloc/free. The threshold is 00257 in a macro that can be changed at configure time. 00258 00259 If REG_NOSUB was specified at compile time, the PCRE_NO_AUTO_CAPTURE flag will 00260 be set. When this is the case, the nmatch and pmatch arguments are ignored, and 00261 the only result is yes/no/error. */ 00262 00263 PCREPOSIX_EXP_DEFN int PCRE_CALL_CONVENTION 00264 regexec(const regex_t *preg, const char *string, size_t nmatch, 00265 regmatch_t pmatch[], int eflags) 00266 { 00267 int rc, so, eo; 00268 int options = 0; 00269 int *ovector = NULL; 00270 int small_ovector[POSIX_MALLOC_THRESHOLD * 3]; 00271 BOOL allocated_ovector = FALSE; 00272 BOOL nosub = 00273 (((const pcre *)preg->re_pcre)->options & PCRE_NO_AUTO_CAPTURE) != 0; 00274 00275 if ((eflags & REG_NOTBOL) != 0) options |= PCRE_NOTBOL; 00276 if ((eflags & REG_NOTEOL) != 0) options |= PCRE_NOTEOL; 00277 00278 ((regex_t *)preg)->re_erroffset = (size_t)(-1); /* Only has meaning after compile */ 00279 00280 /* When no string data is being returned, ensure that nmatch is zero. 00281 Otherwise, ensure the vector for holding the return data is large enough. */ 00282 00283 if (nosub) nmatch = 0; 00284 00285 else if (nmatch > 0) 00286 { 00287 if (nmatch <= POSIX_MALLOC_THRESHOLD) 00288 { 00289 ovector = &(small_ovector[0]); 00290 } 00291 else 00292 { 00293 if (nmatch > INT_MAX/(sizeof(int) * 3)) return REG_ESPACE; 00294 ovector = (int *)malloc(sizeof(int) * nmatch * 3); 00295 if (ovector == NULL) return REG_ESPACE; 00296 allocated_ovector = TRUE; 00297 } 00298 } 00299 00300 /* REG_STARTEND is a BSD extension, to allow for non-NUL-terminated strings. 00301 The man page from OS X says "REG_STARTEND affects only the location of the 00302 string, not how it is matched". That is why the "so" value is used to bump the 00303 start location rather than being passed as a PCRE "starting offset". */ 00304 00305 if ((eflags & REG_STARTEND) != 0) 00306 { 00307 so = pmatch[0].rm_so; 00308 eo = pmatch[0].rm_eo; 00309 } 00310 else 00311 { 00312 so = 0; 00313 eo = strlen(string); 00314 } 00315 00316 rc = pcre_exec((const pcre *)preg->re_pcre, NULL, string + so, (eo - so), 00317 0, options, ovector, nmatch * 3); 00318 00319 if (rc == 0) rc = nmatch; /* All captured slots were filled in */ 00320 00321 if (rc >= 0) 00322 { 00323 size_t i; 00324 if (!nosub) 00325 { 00326 for (i = 0; i < (size_t)rc; i++) 00327 { 00328 pmatch[i].rm_so = ovector[i*2]; 00329 pmatch[i].rm_eo = ovector[i*2+1]; 00330 } 00331 if (allocated_ovector) free(ovector); 00332 for (; i < nmatch; i++) pmatch[i].rm_so = pmatch[i].rm_eo = -1; 00333 } 00334 return 0; 00335 } 00336 00337 else 00338 { 00339 if (allocated_ovector) free(ovector); 00340 switch(rc) 00341 { 00342 case PCRE_ERROR_NOMATCH: return REG_NOMATCH; 00343 case PCRE_ERROR_NULL: return REG_INVARG; 00344 case PCRE_ERROR_BADOPTION: return REG_INVARG; 00345 case PCRE_ERROR_BADMAGIC: return REG_INVARG; 00346 case PCRE_ERROR_UNKNOWN_NODE: return REG_ASSERT; 00347 case PCRE_ERROR_NOMEMORY: return REG_ESPACE; 00348 case PCRE_ERROR_MATCHLIMIT: return REG_ESPACE; 00349 case PCRE_ERROR_BADUTF8: return REG_INVARG; 00350 case PCRE_ERROR_BADUTF8_OFFSET: return REG_INVARG; 00351 default: return REG_ASSERT; 00352 } 00353 } 00354 } 00355 00356 /* End of pcreposix.c */