pcre_study.c

Go to the documentation of this file.
00001 /*************************************************
00002 *      Perl-Compatible Regular Expressions       *
00003 *************************************************/
00004 
00005 /* PCRE is a library of functions to support regular expressions whose syntax
00006 and semantics are as close as possible to those of the Perl 5 language.
00007 
00008                        Written by Philip Hazel
00009            Copyright (c) 1997-2008 University of Cambridge
00010 
00011 -----------------------------------------------------------------------------
00012 Redistribution and use in source and binary forms, with or without
00013 modification, are permitted provided that the following conditions are met:
00014 
00015     * Redistributions of source code must retain the above copyright notice,
00016       this list of conditions and the following disclaimer.
00017 
00018     * Redistributions in binary form must reproduce the above copyright
00019       notice, this list of conditions and the following disclaimer in the
00020       documentation and/or other materials provided with the distribution.
00021 
00022     * Neither the name of the University of Cambridge nor the names of its
00023       contributors may be used to endorse or promote products derived from
00024       this software without specific prior written permission.
00025 
00026 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00027 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00028 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00029 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
00030 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00031 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00032 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00033 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00034 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00035 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00036 POSSIBILITY OF SUCH DAMAGE.
00037 -----------------------------------------------------------------------------
00038 */
00039 
00040 
00041 /* This module contains the external function pcre_study(), along with local
00042 supporting functions. */
00043 
00044 
00045 #ifdef HAVE_CONFIG_H
00046 #include "config.h"
00047 #endif
00048 
00049 #include "pcre_internal.h"
00050 
00051 
00052 /* Returns from set_start_bits() */
00053 
00054 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
00055 
00056 
00057 /*************************************************
00058 *      Set a bit and maybe its alternate case    *
00059 *************************************************/
00060 
00061 /* Given a character, set its bit in the table, and also the bit for the other
00062 version of a letter if we are caseless.
00063 
00064 Arguments:
00065   start_bits    points to the bit map
00066   c             is the character
00067   caseless      the caseless flag
00068   cd            the block with char table pointers
00069 
00070 Returns:        nothing
00071 */
00072 
00073 static void
00074 set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
00075 {
00076 start_bits[c/8] |= (1 << (c&7));
00077 if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
00078   start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
00079 }
00080 
00081 
00082 
00083 /*************************************************
00084 *          Create bitmap of starting bytes       *
00085 *************************************************/
00086 
00087 /* This function scans a compiled unanchored expression recursively and
00088 attempts to build a bitmap of the set of possible starting bytes. As time goes
00089 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
00090 useful for parenthesized groups in patterns such as (a*)b where the group
00091 provides some optional starting bytes but scanning must continue at the outer
00092 level to find at least one mandatory byte. At the outermost level, this
00093 function fails unless the result is SSB_DONE.
00094 
00095 Arguments:
00096   code         points to an expression
00097   start_bits   points to a 32-byte table, initialized to 0
00098   caseless     the current state of the caseless flag
00099   utf8         TRUE if in UTF-8 mode
00100   cd           the block with char table pointers
00101 
00102 Returns:       SSB_FAIL     => Failed to find any starting bytes
00103                SSB_DONE     => Found mandatory starting bytes
00104                SSB_CONTINUE => Found optional starting bytes
00105 */
00106 
00107 static int
00108 set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
00109   BOOL utf8, compile_data *cd)
00110 {
00111 register int c;
00112 int yield = SSB_DONE;
00113 
00114 #if 0
00115 /* ========================================================================= */
00116 /* The following comment and code was inserted in January 1999. In May 2006,
00117 when it was observed to cause compiler warnings about unused values, I took it
00118 out again. If anybody is still using OS/2, they will have to put it back
00119 manually. */
00120 
00121 /* This next statement and the later reference to dummy are here in order to
00122 trick the optimizer of the IBM C compiler for OS/2 into generating correct
00123 code. Apparently IBM isn't going to fix the problem, and we would rather not
00124 disable optimization (in this module it actually makes a big difference, and
00125 the pcre module can use all the optimization it can get). */
00126 
00127 volatile int dummy;
00128 /* ========================================================================= */
00129 #endif
00130 
00131 do
00132   {
00133   const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
00134   BOOL try_next = TRUE;
00135 
00136   while (try_next)    /* Loop for items in this branch */
00137     {
00138     int rc;
00139     switch(*tcode)
00140       {
00141       /* Fail if we reach something we don't understand */
00142 
00143       default:
00144       return SSB_FAIL;
00145 
00146       /* If we hit a bracket or a positive lookahead assertion, recurse to set
00147       bits from within the subpattern. If it can't find anything, we have to
00148       give up. If it finds some mandatory character(s), we are done for this
00149       branch. Otherwise, carry on scanning after the subpattern. */
00150 
00151       case OP_BRA:
00152       case OP_SBRA:
00153       case OP_CBRA:
00154       case OP_SCBRA:
00155       case OP_ONCE:
00156       case OP_ASSERT:
00157       rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
00158       if (rc == SSB_FAIL) return SSB_FAIL;
00159       if (rc == SSB_DONE) try_next = FALSE; else
00160         {
00161         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
00162         tcode += 1 + LINK_SIZE;
00163         }
00164       break;
00165 
00166       /* If we hit ALT or KET, it means we haven't found anything mandatory in
00167       this branch, though we might have found something optional. For ALT, we
00168       continue with the next alternative, but we have to arrange that the final
00169       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
00170       return SSB_CONTINUE: if this is the top level, that indicates failure,
00171       but after a nested subpattern, it causes scanning to continue. */
00172 
00173       case OP_ALT:
00174       yield = SSB_CONTINUE;
00175       try_next = FALSE;
00176       break;
00177 
00178       case OP_KET:
00179       case OP_KETRMAX:
00180       case OP_KETRMIN:
00181       return SSB_CONTINUE;
00182 
00183       /* Skip over callout */
00184 
00185       case OP_CALLOUT:
00186       tcode += 2 + 2*LINK_SIZE;
00187       break;
00188 
00189       /* Skip over lookbehind and negative lookahead assertions */
00190 
00191       case OP_ASSERT_NOT:
00192       case OP_ASSERTBACK:
00193       case OP_ASSERTBACK_NOT:
00194       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
00195       tcode += 1 + LINK_SIZE;
00196       break;
00197 
00198       /* Skip over an option setting, changing the caseless flag */
00199 
00200       case OP_OPT:
00201       caseless = (tcode[1] & PCRE_CASELESS) != 0;
00202       tcode += 2;
00203       break;
00204 
00205       /* BRAZERO does the bracket, but carries on. */
00206 
00207       case OP_BRAZERO:
00208       case OP_BRAMINZERO:
00209       if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
00210         return SSB_FAIL;
00211 /* =========================================================================
00212       See the comment at the head of this function concerning the next line,
00213       which was an old fudge for the benefit of OS/2.
00214       dummy = 1;
00215   ========================================================================= */
00216       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
00217       tcode += 1 + LINK_SIZE;
00218       break;
00219 
00220       /* SKIPZERO skips the bracket. */
00221 
00222       case OP_SKIPZERO:
00223       tcode++;
00224       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
00225       tcode += 1 + LINK_SIZE;
00226       break;
00227 
00228       /* Single-char * or ? sets the bit and tries the next item */
00229 
00230       case OP_STAR:
00231       case OP_MINSTAR:
00232       case OP_POSSTAR:
00233       case OP_QUERY:
00234       case OP_MINQUERY:
00235       case OP_POSQUERY:
00236       set_bit(start_bits, tcode[1], caseless, cd);
00237       tcode += 2;
00238 #ifdef SUPPORT_UTF8
00239       if (utf8 && tcode[-1] >= 0xc0)
00240         tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
00241 #endif
00242       break;
00243 
00244       /* Single-char upto sets the bit and tries the next */
00245 
00246       case OP_UPTO:
00247       case OP_MINUPTO:
00248       case OP_POSUPTO:
00249       set_bit(start_bits, tcode[3], caseless, cd);
00250       tcode += 4;
00251 #ifdef SUPPORT_UTF8
00252       if (utf8 && tcode[-1] >= 0xc0)
00253         tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
00254 #endif
00255       break;
00256 
00257       /* At least one single char sets the bit and stops */
00258 
00259       case OP_EXACT:       /* Fall through */
00260       tcode += 2;
00261 
00262       case OP_CHAR:
00263       case OP_CHARNC:
00264       case OP_PLUS:
00265       case OP_MINPLUS:
00266       case OP_POSPLUS:
00267       set_bit(start_bits, tcode[1], caseless, cd);
00268       try_next = FALSE;
00269       break;
00270 
00271       /* Single character type sets the bits and stops */
00272 
00273       case OP_NOT_DIGIT:
00274       for (c = 0; c < 32; c++)
00275         start_bits[c] |= ~cd->cbits[c+cbit_digit];
00276       try_next = FALSE;
00277       break;
00278 
00279       case OP_DIGIT:
00280       for (c = 0; c < 32; c++)
00281         start_bits[c] |= cd->cbits[c+cbit_digit];
00282       try_next = FALSE;
00283       break;
00284 
00285       /* The cbit_space table has vertical tab as whitespace; we have to
00286       discard it. */
00287 
00288       case OP_NOT_WHITESPACE:
00289       for (c = 0; c < 32; c++)
00290         {
00291         int d = cd->cbits[c+cbit_space];
00292         if (c == 1) d &= ~0x08;
00293         start_bits[c] |= ~d;
00294         }
00295       try_next = FALSE;
00296       break;
00297 
00298       /* The cbit_space table has vertical tab as whitespace; we have to
00299       discard it. */
00300 
00301       case OP_WHITESPACE:
00302       for (c = 0; c < 32; c++)
00303         {
00304         int d = cd->cbits[c+cbit_space];
00305         if (c == 1) d &= ~0x08;
00306         start_bits[c] |= d;
00307         }
00308       try_next = FALSE;
00309       break;
00310 
00311       case OP_NOT_WORDCHAR:
00312       for (c = 0; c < 32; c++)
00313         start_bits[c] |= ~cd->cbits[c+cbit_word];
00314       try_next = FALSE;
00315       break;
00316 
00317       case OP_WORDCHAR:
00318       for (c = 0; c < 32; c++)
00319         start_bits[c] |= cd->cbits[c+cbit_word];
00320       try_next = FALSE;
00321       break;
00322 
00323       /* One or more character type fudges the pointer and restarts, knowing
00324       it will hit a single character type and stop there. */
00325 
00326       case OP_TYPEPLUS:
00327       case OP_TYPEMINPLUS:
00328       tcode++;
00329       break;
00330 
00331       case OP_TYPEEXACT:
00332       tcode += 3;
00333       break;
00334 
00335       /* Zero or more repeats of character types set the bits and then
00336       try again. */
00337 
00338       case OP_TYPEUPTO:
00339       case OP_TYPEMINUPTO:
00340       case OP_TYPEPOSUPTO:
00341       tcode += 2;               /* Fall through */
00342 
00343       case OP_TYPESTAR:
00344       case OP_TYPEMINSTAR:
00345       case OP_TYPEPOSSTAR:
00346       case OP_TYPEQUERY:
00347       case OP_TYPEMINQUERY:
00348       case OP_TYPEPOSQUERY:
00349       switch(tcode[1])
00350         {
00351         case OP_ANY:
00352         case OP_ALLANY:
00353         return SSB_FAIL;
00354 
00355         case OP_NOT_DIGIT:
00356         for (c = 0; c < 32; c++)
00357           start_bits[c] |= ~cd->cbits[c+cbit_digit];
00358         break;
00359 
00360         case OP_DIGIT:
00361         for (c = 0; c < 32; c++)
00362           start_bits[c] |= cd->cbits[c+cbit_digit];
00363         break;
00364 
00365         /* The cbit_space table has vertical tab as whitespace; we have to
00366         discard it. */
00367 
00368         case OP_NOT_WHITESPACE:
00369         for (c = 0; c < 32; c++)
00370           {
00371           int d = cd->cbits[c+cbit_space];
00372           if (c == 1) d &= ~0x08;
00373           start_bits[c] |= ~d;
00374           }
00375         break;
00376 
00377         /* The cbit_space table has vertical tab as whitespace; we have to
00378         discard it. */
00379 
00380         case OP_WHITESPACE:
00381         for (c = 0; c < 32; c++)
00382           {
00383           int d = cd->cbits[c+cbit_space];
00384           if (c == 1) d &= ~0x08;
00385           start_bits[c] |= d;
00386           }
00387         break;
00388 
00389         case OP_NOT_WORDCHAR:
00390         for (c = 0; c < 32; c++)
00391           start_bits[c] |= ~cd->cbits[c+cbit_word];
00392         break;
00393 
00394         case OP_WORDCHAR:
00395         for (c = 0; c < 32; c++)
00396           start_bits[c] |= cd->cbits[c+cbit_word];
00397         break;
00398         }
00399 
00400       tcode += 2;
00401       break;
00402 
00403       /* Character class where all the information is in a bit map: set the
00404       bits and either carry on or not, according to the repeat count. If it was
00405       a negative class, and we are operating with UTF-8 characters, any byte
00406       with a value >= 0xc4 is a potentially valid starter because it starts a
00407       character with a value > 255. */
00408 
00409       case OP_NCLASS:
00410 #ifdef SUPPORT_UTF8
00411       if (utf8)
00412         {
00413         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
00414         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
00415         }
00416 #endif
00417       /* Fall through */
00418 
00419       case OP_CLASS:
00420         {
00421         tcode++;
00422 
00423         /* In UTF-8 mode, the bits in a bit map correspond to character
00424         values, not to byte values. However, the bit map we are constructing is
00425         for byte values. So we have to do a conversion for characters whose
00426         value is > 127. In fact, there are only two possible starting bytes for
00427         characters in the range 128 - 255. */
00428 
00429 #ifdef SUPPORT_UTF8
00430         if (utf8)
00431           {
00432           for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
00433           for (c = 128; c < 256; c++)
00434             {
00435             if ((tcode[c/8] && (1 << (c&7))) != 0)
00436               {
00437               int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
00438               start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
00439               c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
00440               }
00441             }
00442           }
00443 
00444         /* In non-UTF-8 mode, the two bit maps are completely compatible. */
00445 
00446         else
00447 #endif
00448           {
00449           for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
00450           }
00451 
00452         /* Advance past the bit map, and act on what follows */
00453 
00454         tcode += 32;
00455         switch (*tcode)
00456           {
00457           case OP_CRSTAR:
00458           case OP_CRMINSTAR:
00459           case OP_CRQUERY:
00460           case OP_CRMINQUERY:
00461           tcode++;
00462           break;
00463 
00464           case OP_CRRANGE:
00465           case OP_CRMINRANGE:
00466           if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
00467             else try_next = FALSE;
00468           break;
00469 
00470           default:
00471           try_next = FALSE;
00472           break;
00473           }
00474         }
00475       break; /* End of bitmap class handling */
00476 
00477       }      /* End of switch */
00478     }        /* End of try_next loop */
00479 
00480   code += GET(code, 1);   /* Advance to next branch */
00481   }
00482 while (*code == OP_ALT);
00483 return yield;
00484 }
00485 
00486 
00487 
00488 /*************************************************
00489 *          Study a compiled expression           *
00490 *************************************************/
00491 
00492 /* This function is handed a compiled expression that it must study to produce
00493 information that will speed up the matching. It returns a pcre_extra block
00494 which then gets handed back to pcre_exec().
00495 
00496 Arguments:
00497   re        points to the compiled expression
00498   options   contains option bits
00499   errorptr  points to where to place error messages;
00500             set NULL unless error
00501 
00502 Returns:    pointer to a pcre_extra block, with study_data filled in and the
00503               appropriate flag set;
00504             NULL on error or if no optimization possible
00505 */
00506 
00507 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
00508 pcre_study(const pcre *external_re, int options, const char **errorptr)
00509 {
00510 uschar start_bits[32];
00511 pcre_extra *extra;
00512 pcre_study_data *study;
00513 const uschar *tables;
00514 uschar *code;
00515 compile_data compile_block;
00516 const real_pcre *re = (const real_pcre *)external_re;
00517 
00518 *errorptr = NULL;
00519 
00520 if (re == NULL || re->magic_number != MAGIC_NUMBER)
00521   {
00522   *errorptr = "argument is not a compiled regular expression";
00523   return NULL;
00524   }
00525 
00526 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
00527   {
00528   *errorptr = "unknown or incorrect option bit(s) set";
00529   return NULL;
00530   }
00531 
00532 code = (uschar *)re + re->name_table_offset +
00533   (re->name_count * re->name_entry_size);
00534 
00535 /* For an anchored pattern, or an unanchored pattern that has a first char, or
00536 a multiline pattern that matches only at "line starts", no further processing
00537 at present. */
00538 
00539 if ((re->options & PCRE_ANCHORED) != 0 ||
00540     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
00541   return NULL;
00542 
00543 /* Set the character tables in the block that is passed around */
00544 
00545 tables = re->tables;
00546 if (tables == NULL)
00547   (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
00548   (void *)(&tables));
00549 
00550 compile_block.lcc = tables + lcc_offset;
00551 compile_block.fcc = tables + fcc_offset;
00552 compile_block.cbits = tables + cbits_offset;
00553 compile_block.ctypes = tables + ctypes_offset;
00554 
00555 /* See if we can find a fixed set of initial characters for the pattern. */
00556 
00557 memset(start_bits, 0, 32 * sizeof(uschar));
00558 if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
00559   (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;
00560 
00561 /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
00562 the latter, which is pointed to by the former, which may also get additional
00563 data set later by the calling program. At the moment, the size of
00564 pcre_study_data is fixed. We nevertheless save it in a field for returning via
00565 the pcre_fullinfo() function so that if it becomes variable in the future, we
00566 don't have to change that code. */
00567 
00568 extra = (pcre_extra *)(pcre_malloc)
00569   (sizeof(pcre_extra) + sizeof(pcre_study_data));
00570 
00571 if (extra == NULL)
00572   {
00573   *errorptr = "failed to get memory";
00574   return NULL;
00575   }
00576 
00577 study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
00578 extra->flags = PCRE_EXTRA_STUDY_DATA;
00579 extra->study_data = study;
00580 
00581 study->size = sizeof(pcre_study_data);
00582 study->options = PCRE_STUDY_MAPPED;
00583 memcpy(study->start_bits, start_bits, sizeof(start_bits));
00584 
00585 return extra;
00586 }
00587 
00588 /* End of pcre_study.c */

Generated on Tue Jul 5 14:11:57 2011 for ROOT_528-00b_version by  doxygen 1.5.1