pcre_dfa_exec.c

Go to the documentation of this file.
00001 /*************************************************
00002 *      Perl-Compatible Regular Expressions       *
00003 *************************************************/
00004 
00005 /* PCRE is a library of functions to support regular expressions whose syntax
00006 and semantics are as close as possible to those of the Perl 5 language.
00007 
00008                        Written by Philip Hazel
00009            Copyright (c) 1997-2008 University of Cambridge
00010 
00011 -----------------------------------------------------------------------------
00012 Redistribution and use in source and binary forms, with or without
00013 modification, are permitted provided that the following conditions are met:
00014 
00015     * Redistributions of source code must retain the above copyright notice,
00016       this list of conditions and the following disclaimer.
00017 
00018     * Redistributions in binary form must reproduce the above copyright
00019       notice, this list of conditions and the following disclaimer in the
00020       documentation and/or other materials provided with the distribution.
00021 
00022     * Neither the name of the University of Cambridge nor the names of its
00023       contributors may be used to endorse or promote products derived from
00024       this software without specific prior written permission.
00025 
00026 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00027 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00028 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00029 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
00030 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00031 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00032 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00033 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00034 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00035 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00036 POSSIBILITY OF SUCH DAMAGE.
00037 -----------------------------------------------------------------------------
00038 */
00039 
00040 
00041 /* This module contains the external function pcre_dfa_exec(), which is an
00042 alternative matching function that uses a sort of DFA algorithm (not a true
00043 FSM). This is NOT Perl-compatible, but it has advantages in certain
00044 applications. */
00045 
00046 
00047 #ifdef HAVE_CONFIG_H
00048 #include "config.h"
00049 #endif
00050 
00051 #define NLBLOCK md             /* Block containing newline information */
00052 #define PSSTART start_subject  /* Field containing processed string start */
00053 #define PSEND   end_subject    /* Field containing processed string end */
00054 
00055 #include "pcre_internal.h"
00056 
00057 
00058 /* For use to indent debugging output */
00059 
00060 #define SP "                   "
00061 
00062 
00063 
00064 /*************************************************
00065 *      Code parameters and static tables         *
00066 *************************************************/
00067 
00068 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
00069 into others, under special conditions. A gap of 20 between the blocks should be
00070 enough. The resulting opcodes don't have to be less than 256 because they are
00071 never stored, so we push them well clear of the normal opcodes. */
00072 
00073 #define OP_PROP_EXTRA       300
00074 #define OP_EXTUNI_EXTRA     320
00075 #define OP_ANYNL_EXTRA      340
00076 #define OP_HSPACE_EXTRA     360
00077 #define OP_VSPACE_EXTRA     380
00078 
00079 
00080 /* This table is indexed by opcode. A non-zero entry marks an opcode that is
00081 followed by a character to be tested in some way, and gives the offset from
00082 the opcode to that character; this makes it possible to centralize the loading
00083 of these characters. For Type * etc, the "character" is the opcode for \D, \d,
00084 \S, \s, \W, or \w, which will always be a small value. ***NOTE*** If the start
00085 of this table is modified, the two tables that follow must also be modified. */
00086 
00087 static const uschar coptable[] = {
00088   0,                             /* End                                    */
00089   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
00090   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
00091   0, 0, 0,                       /* Any, AllAny, Anybyte                   */
00092   0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */
00093   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
00094   0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
00095   1,                             /* Char                                   */
00096   1,                             /* Charnc                                 */
00097   1,                             /* not                                    */
00098   /* Positive single-char repeats                                          */
00099   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
00100   3, 3, 3,                       /* upto, minupto, exact                   */
00101   1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */
00102   /* Negative single-char repeats - only for chars < 256                   */
00103   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
00104   3, 3, 3,                       /* NOT upto, minupto, exact               */
00105   1, 1, 1, 3,                    /* NOT *+, ++, ?+, upto+                  */
00106   /* Positive type repeats                                                 */
00107   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
00108   3, 3, 3,                       /* Type upto, minupto, exact              */
00109   1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */
00110   /* Character class & ref repeats                                         */
00111   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
00112   0, 0,                          /* CRRANGE, CRMINRANGE                    */
00113   0,                             /* CLASS                                  */
00114   0,                             /* NCLASS                                 */
00115   0,                             /* XCLASS - variable length               */
00116   0,                             /* REF                                    */
00117   0,                             /* RECURSE                                */
00118   0,                             /* CALLOUT                                */
00119   0,                             /* Alt                                    */
00120   0,                             /* Ket                                    */
00121   0,                             /* KetRmax                                */
00122   0,                             /* KetRmin                                */
00123   0,                             /* Assert                                 */
00124   0,                             /* Assert not                             */
00125   0,                             /* Assert behind                          */
00126   0,                             /* Assert behind not                      */
00127   0,                             /* Reverse                                */
00128   0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */
00129   0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */
00130   0,                             /* CREF                                   */
00131   0,                             /* RREF                                   */
00132   0,                             /* DEF                                    */
00133   0, 0,                          /* BRAZERO, BRAMINZERO                    */
00134   0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
00135   0, 0, 0                        /* FAIL, ACCEPT, SKIPZERO                 */
00136 };
00137 
00138 /* These 2 tables, indexed by opcode, allow for compact code for testing for
00139 \D, \d, \S, \s, \W, and \w. This one holds the ctypes bit to select for each. */
00140 
00141 static const uschar toptable1[] = {
00142   0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b */
00143   ctype_digit, ctype_digit,       /* \D, \d */
00144   ctype_space, ctype_space,       /* \S, \s */
00145   ctype_word,  ctype_word,        /* \W, \w */
00146   0, 0                            /* OP_ANY, OP_ALLANY */
00147 };
00148 /* XORed with the bit selected via toptable1: set for \D, \S, \W to invert. */
00149 static const uschar toptable2[] = {
00150   0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b */
00151   ctype_digit, 0,                 /* \D (inverted), \d */
00152   ctype_space, 0,                 /* \S (inverted), \s */
00153   ctype_word,  0,                 /* \W (inverted), \w */
00154   1, 1                            /* OP_ANY, OP_ALLANY: mask 0, XOR 1 => non-zero */
00155 };
00156 
00157 
00158 /* Structure for holding data about a particular state, which is in effect the
00159 current data for an active path through the match tree. It must consist
00160 entirely of ints because the working vector we are passed, and in which these
00161 structures are stored, is a vector of ints. */
00162 
00163 typedef struct stateblock {
00164   int offset;                     /* Offset to opcode from start_code; negated while held off */
00165   int count;                      /* Count for repeats */
00166   int ims;                        /* ims flag bits in force for this state */
00167   int data;                       /* Extra data for some states, e.g. characters left to skip */
00168 } stateblock;
00169 /* Number of ints that one stateblock occupies in the workspace vector */
00170 #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
00171 
00172 
00173 #ifdef DEBUG
/*************************************************
*             Print character string             *
*************************************************/

/* Character string printing function for debugging. Each byte for which
isprint() is true is written unchanged; every other byte is written as \x
followed by two lower-case hexadecimal digits. Nothing is written when length
is zero or negative. The bytes are only read, so the pointer is const-qualified
and callers holding a const subject pointer need no cast.

Arguments:
  p            points to string (not modified)
  length       number of bytes
  f            where to print

Returns:       nothing
*/

static void
pchars(const unsigned char *p, int length, FILE *f)
{
for (; length > 0; length--, p++)
  {
  /* Unsigned so that the value matches the %x conversion exactly; it is
  always in the range 0-255, which is also what isprint() requires. */
  const unsigned int ch = *p;
  if (isprint((int)ch))
    fputc((int)ch, f);
  else
    fprintf(f, "\\x%02x", ch);
  }
}
00200 #endif
00201 
00202 
00203 
00204 /*************************************************
00205 *    Execute a Regular Expression - DFA engine   *
00206 *************************************************/
00207 
00208 /* This internal function applies a compiled pattern to a subject string,
00209 starting at a given point, using a DFA engine. This function is called from the
00210 external one, possibly multiple times if the pattern is not anchored. The
00211 function calls itself recursively for some kinds of subpattern.
00212 
00213 Arguments:
00214   md                the match_data block with fixed information
00215   this_start_code   the opening bracket of this subexpression's code
00216   current_subject   where we currently are in the subject string
00217   start_offset      start offset in the subject string
00218   offsets           vector to contain the matching string offsets
00219   offsetcount       size of same
00220   workspace         vector of workspace
00221   wscount           size of same
00222   ims               the current ims flags
00223   rlevel            function call recursion level
00224   recursing         regex recursive call level
00225 
00226 Returns:            > 0 => number of match offset pairs placed in offsets
00227                     = 0 => offsets overflowed; longest matches are present
00228                      -1 => failed to match
00229                    < -1 => some kind of unexpected problem
00230 
00231 The following macros are used for adding states to the two state vectors (one
00232 for the current character, one for the following character). */
00233 /* Append state (x,y) with the current ims to the active list, leaving data unset; if the list is full, return PCRE_ERROR_DFA_WSSIZE from the enclosing function. */
00234 #define ADD_ACTIVE(x,y) \
00235   if (active_count++ < wscount) \
00236     { \
00237     next_active_state->offset = (x); \
00238     next_active_state->count  = (y); \
00239     next_active_state->ims    = ims; \
00240     next_active_state++; \
00241     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
00242     } \
00243   else return PCRE_ERROR_DFA_WSSIZE
00244 /* As ADD_ACTIVE, but additionally stores (z) in the state's data field. */
00245 #define ADD_ACTIVE_DATA(x,y,z) \
00246   if (active_count++ < wscount) \
00247     { \
00248     next_active_state->offset = (x); \
00249     next_active_state->count  = (y); \
00250     next_active_state->ims    = ims; \
00251     next_active_state->data   = (z); \
00252     next_active_state++; \
00253     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
00254     } \
00255   else return PCRE_ERROR_DFA_WSSIZE
00256 /* As ADD_ACTIVE, but appends to the new-state list (new_count, next_new_state). */
00257 #define ADD_NEW(x,y) \
00258   if (new_count++ < wscount) \
00259     { \
00260     next_new_state->offset = (x); \
00261     next_new_state->count  = (y); \
00262     next_new_state->ims    = ims; \
00263     next_new_state++; \
00264     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
00265     } \
00266   else return PCRE_ERROR_DFA_WSSIZE
00267 /* As ADD_NEW, but additionally stores (z) in the state's data field. */
00268 #define ADD_NEW_DATA(x,y,z) \
00269   if (new_count++ < wscount) \
00270     { \
00271     next_new_state->offset = (x); \
00272     next_new_state->count  = (y); \
00273     next_new_state->ims    = ims; \
00274     next_new_state->data   = (z); \
00275     next_new_state++; \
00276     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
00277     } \
00278   else return PCRE_ERROR_DFA_WSSIZE
00279 
00280 /* And now, here is the code */
00281 
00282 static int
00283 internal_dfa_exec(
00284   dfa_match_data *md,
00285   const uschar *this_start_code,
00286   const uschar *current_subject,
00287   int start_offset,
00288   int *offsets,
00289   int offsetcount,
00290   int *workspace,
00291   int wscount,
00292   int ims,
00293   int  rlevel,
00294   int  recursing)
00295 {
00296 stateblock *active_states, *new_states, *temp_states;
00297 stateblock *next_active_state, *next_new_state;
00298 
00299 const uschar *ctypes, *lcc, *fcc;
00300 const uschar *ptr;
00301 const uschar *end_code, *first_op;
00302 
00303 int active_count, new_count, match_count;
00304 
00305 /* Some fields in the md block are frequently referenced, so we load them into
00306 independent variables in the hope that this will perform better. */
00307 
00308 const uschar *start_subject = md->start_subject;
00309 const uschar *end_subject = md->end_subject;
00310 const uschar *start_code = md->start_code;
00311 
00312 #ifdef SUPPORT_UTF8
00313 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
00314 #else
00315 BOOL utf8 = FALSE;
00316 #endif
00317 
00318 rlevel++;
00319 offsetcount &= (-2);
00320 
00321 wscount -= 2;
00322 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
00323           (2 * INTS_PER_STATEBLOCK);
00324 
00325 DPRINTF(("\n%.*s---------------------\n"
00326   "%.*sCall to internal_dfa_exec f=%d r=%d\n",
00327   rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
00328 
00329 ctypes = md->tables + ctypes_offset;
00330 lcc = md->tables + lcc_offset;
00331 fcc = md->tables + fcc_offset;
00332 
00333 match_count = PCRE_ERROR_NOMATCH;   /* A negative number */
00334 
00335 active_states = (stateblock *)(workspace + 2);
00336 next_new_state = new_states = active_states + wscount;
00337 new_count = 0;
00338 
00339 first_op = this_start_code + 1 + LINK_SIZE +
00340   ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
00341 
00342 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
00343 the alternative states onto the list, and find out where the end is. This
00344 makes it possible to use this function recursively, when we want to stop at a
00345 matching internal ket rather than at the end.
00346 
00347 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
00348 a backward assertion. In that case, we have to find out the maximum amount to
00349 move back, and set up each alternative appropriately. */
00350 
00351 if (*first_op == OP_REVERSE)
00352   {
00353   int max_back = 0;
00354   int gone_back;
00355 
00356   end_code = this_start_code;
00357   do
00358     {
00359     int back = GET(end_code, 2+LINK_SIZE);
00360     if (back > max_back) max_back = back;
00361     end_code += GET(end_code, 1);
00362     }
00363   while (*end_code == OP_ALT);
00364 
00365   /* If we can't go back the amount required for the longest lookbehind
00366   pattern, go back as far as we can; some alternatives may still be viable. */
00367 
00368 #ifdef SUPPORT_UTF8
00369   /* In character mode we have to step back character by character */
00370 
00371   if (utf8)
00372     {
00373     for (gone_back = 0; gone_back < max_back; gone_back++)
00374       {
00375       if (current_subject <= start_subject) break;
00376       current_subject--;
00377       while (current_subject > start_subject &&
00378              (*current_subject & 0xc0) == 0x80)
00379         current_subject--;
00380       }
00381     }
00382   else
00383 #endif
00384 
00385   /* In byte-mode we can do this quickly. */
00386 
00387     {
00388     gone_back = (current_subject - max_back < start_subject)?
00389       current_subject - start_subject : max_back;
00390     current_subject -= gone_back;
00391     }
00392 
00393   /* Now we can process the individual branches. */
00394 
00395   end_code = this_start_code;
00396   do
00397     {
00398     int back = GET(end_code, 2+LINK_SIZE);
00399     if (back <= gone_back)
00400       {
00401       int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
00402       ADD_NEW_DATA(-bstate, 0, gone_back - back);
00403       }
00404     end_code += GET(end_code, 1);
00405     }
00406   while (*end_code == OP_ALT);
00407  }
00408 
00409 /* This is the code for a "normal" subpattern (not a backward assertion). The
00410 start of a whole pattern is always one of these. If we are at the top level,
00411 we may be asked to restart matching from the same point that we reached for a
00412 previous partial match. We still have to scan through the top-level branches to
00413 find the end state. */
00414 
00415 else
00416   {
00417   end_code = this_start_code;
00418 
00419   /* Restarting */
00420 
00421   if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
00422     {
00423     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
00424     new_count = workspace[1];
00425     if (!workspace[0])
00426       memcpy(new_states, active_states, new_count * sizeof(stateblock));
00427     }
00428 
00429   /* Not restarting */
00430 
00431   else
00432     {
00433     int length = 1 + LINK_SIZE +
00434       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
00435     do
00436       {
00437       ADD_NEW(end_code - start_code + length, 0);
00438       end_code += GET(end_code, 1);
00439       length = 1 + LINK_SIZE;
00440       }
00441     while (*end_code == OP_ALT);
00442     }
00443   }
00444 
00445 workspace[0] = 0;    /* Bit indicating which vector is current */
00446 
00447 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
00448 
00449 /* Loop for scanning the subject */
00450 
00451 ptr = current_subject;
00452 for (;;)
00453   {
00454   int i, j;
00455   int clen, dlen;
00456   unsigned int c, d;
00457 
00458   /* Make the new state list into the active state list and empty the
00459   new state list. */
00460 
00461   temp_states = active_states;
00462   active_states = new_states;
00463   new_states = temp_states;
00464   active_count = new_count;
00465   new_count = 0;
00466 
00467   workspace[0] ^= 1;              /* Remember for the restarting feature */
00468   workspace[1] = active_count;
00469 
00470 #ifdef DEBUG
00471   printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
00472   pchars((uschar *)ptr, strlen((char *)ptr), stdout);
00473   printf("\"\n");
00474 
00475   printf("%.*sActive states: ", rlevel*2-2, SP);
00476   for (i = 0; i < active_count; i++)
00477     printf("%d/%d ", active_states[i].offset, active_states[i].count);
00478   printf("\n");
00479 #endif
00480 
00481   /* Set the pointers for adding new states */
00482 
00483   next_active_state = active_states + active_count;
00484   next_new_state = new_states;
00485 
00486   /* Load the current character from the subject outside the loop, as many
00487   different states may want to look at it, and we assume that at least one
00488   will. */
00489 
00490   if (ptr < end_subject)
00491     {
00492     clen = 1;        /* Number of bytes in the character */
00493 #ifdef SUPPORT_UTF8
00494     if (utf8) { GETCHARLEN(c, ptr, clen); } else
00495 #endif  /* SUPPORT_UTF8 */
00496     c = *ptr;
00497     }
00498   else
00499     {
00500     clen = 0;        /* This indicates the end of the subject */
00501     c = NOTACHAR;    /* This value should never actually be used */
00502     }
00503 
00504   /* Scan up the active states and act on each one. The result of an action
00505   may be to add more states to the currently active list (e.g. on hitting a
00506   parenthesis) or it may be to put states on the new list, for considering
00507   when we move the character pointer on. */
00508 
00509   for (i = 0; i < active_count; i++)
00510     {
00511     stateblock *current_state = active_states + i;
00512     const uschar *code;
00513     int state_offset = current_state->offset;
00514     int count, codevalue;
00515 
00516 #ifdef DEBUG
00517     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
00518     if (clen == 0) printf("EOL\n");
00519       else if (c > 32 && c < 127) printf("'%c'\n", c);
00520         else printf("0x%02x\n", c);
00521 #endif
00522 
00523     /* This variable is referred to implicitly in the ADD_xxx macros. */
00524 
00525     ims = current_state->ims;
00526 
00527     /* A negative offset is a special case meaning "hold off going to this
00528     (negated) state until the number of characters in the data field have
00529     been skipped". */
00530 
00531     if (state_offset < 0)
00532       {
00533       if (current_state->data > 0)
00534         {
00535         DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
00536         ADD_NEW_DATA(state_offset, current_state->count,
00537           current_state->data - 1);
00538         continue;
00539         }
00540       else
00541         {
00542         current_state->offset = state_offset = -state_offset;
00543         }
00544       }
00545 
00546     /* Check for a duplicate state with the same count, and skip if found. */
00547 
00548     for (j = 0; j < i; j++)
00549       {
00550       if (active_states[j].offset == state_offset &&
00551           active_states[j].count == current_state->count)
00552         {
00553         DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
00554         goto NEXT_ACTIVE_STATE;
00555         }
00556       }
00557 
00558     /* The state offset is the offset to the opcode */
00559 
00560     code = start_code + state_offset;
00561     codevalue = *code;
00562 
00563     /* If this opcode is followed by an inline character, load it. It is
00564     tempting to test for the presence of a subject character here, but that
00565     is wrong, because sometimes zero repetitions of the subject are
00566     permitted.
00567 
00568     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
00569     argument that is not a data character - but is always one byte long. We
00570     have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in
00571     this case. To keep the other cases fast, convert these ones to new opcodes.
00572     */
00573 
00574     if (coptable[codevalue] > 0)
00575       {
00576       dlen = 1;
00577 #ifdef SUPPORT_UTF8
00578       if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
00579 #endif  /* SUPPORT_UTF8 */
00580       d = code[coptable[codevalue]];
00581       if (codevalue >= OP_TYPESTAR)
00582         {
00583         switch(d)
00584           {
00585           case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
00586           case OP_NOTPROP:
00587           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
00588           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
00589           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
00590           case OP_NOT_HSPACE:
00591           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
00592           case OP_NOT_VSPACE:
00593           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
00594           default: break;
00595           }
00596         }
00597       }
00598     else
00599       {
00600       dlen = 0;         /* Not strictly necessary, but compilers moan */
00601       d = NOTACHAR;     /* if these variables are not set. */
00602       }
00603 
00604 
00605     /* Now process the individual opcodes */
00606 
00607     switch (codevalue)
00608       {
00609 
00610 /* ========================================================================== */
00611       /* Reached a closing bracket. If not at the end of the pattern, carry
00612       on with the next opcode. Otherwise, unless we have an empty string and
00613       PCRE_NOTEMPTY is set, save the match data, shifting up all previous
00614       matches so we always have the longest first. */
00615 
00616       case OP_KET:
00617       case OP_KETRMIN:
00618       case OP_KETRMAX:
00619       if (code != end_code)
00620         {
00621         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
00622         if (codevalue != OP_KET)
00623           {
00624           ADD_ACTIVE(state_offset - GET(code, 1), 0);
00625           }
00626         }
00627       else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
00628         {
00629         if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
00630           else if (match_count > 0 && ++match_count * 2 >= offsetcount)
00631             match_count = 0;
00632         count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
00633         if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
00634         if (offsetcount >= 2)
00635           {
00636           offsets[0] = current_subject - start_subject;
00637           offsets[1] = ptr - start_subject;
00638           DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
00639             offsets[1] - offsets[0], current_subject));
00640           }
00641         if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
00642           {
00643           DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
00644             "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
00645             match_count, rlevel*2-2, SP));
00646           return match_count;
00647           }
00648         }
00649       break;
00650 
00651 /* ========================================================================== */
00652       /* These opcodes add to the current list of states without looking
00653       at the current character. */
00654 
00655       /*-----------------------------------------------------------------*/
00656       case OP_ALT:
00657       do { code += GET(code, 1); } while (*code == OP_ALT);
00658       ADD_ACTIVE(code - start_code, 0);
00659       break;
00660 
00661       /*-----------------------------------------------------------------*/
00662       case OP_BRA:
00663       case OP_SBRA:
00664       do
00665         {
00666         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
00667         code += GET(code, 1);
00668         }
00669       while (*code == OP_ALT);
00670       break;
00671 
00672       /*-----------------------------------------------------------------*/
00673       case OP_CBRA:
00674       case OP_SCBRA:
00675       ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);
00676       code += GET(code, 1);
00677       while (*code == OP_ALT)
00678         {
00679         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);
00680         code += GET(code, 1);
00681         }
00682       break;
00683 
00684       /*-----------------------------------------------------------------*/
00685       case OP_BRAZERO:
00686       case OP_BRAMINZERO:
00687       ADD_ACTIVE(state_offset + 1, 0);
00688       code += 1 + GET(code, 2);
00689       while (*code == OP_ALT) code += GET(code, 1);
00690       ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
00691       break;
00692 
00693       /*-----------------------------------------------------------------*/
00694       case OP_SKIPZERO:
00695       code += 1 + GET(code, 2);
00696       while (*code == OP_ALT) code += GET(code, 1);
00697       ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
00698       break;
00699 
00700       /*-----------------------------------------------------------------*/
00701       case OP_CIRC:
00702       if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
00703           ((ims & PCRE_MULTILINE) != 0 &&
00704             ptr != end_subject &&
00705             WAS_NEWLINE(ptr)))
00706         { ADD_ACTIVE(state_offset + 1, 0); }
00707       break;
00708 
00709       /*-----------------------------------------------------------------*/
00710       case OP_EOD:
00711       if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
00712       break;
00713 
00714       /*-----------------------------------------------------------------*/
00715       case OP_OPT:
00716       ims = code[1];
00717       ADD_ACTIVE(state_offset + 2, 0);
00718       break;
00719 
00720       /*-----------------------------------------------------------------*/
00721       case OP_SOD:
00722       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
00723       break;
00724 
00725       /*-----------------------------------------------------------------*/
00726       case OP_SOM:
00727       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
00728       break;
00729 
00730 
00731 /* ========================================================================== */
00732       /* These opcodes inspect the next subject character, and sometimes
00733       the previous one as well, but do not have an argument. The variable
00734       clen contains the length of the current character and is zero if we are
00735       at the end of the subject. */
00736 
00737       /*-----------------------------------------------------------------*/
00738       case OP_ANY:
00739       if (clen > 0 && !IS_NEWLINE(ptr))
00740         { ADD_NEW(state_offset + 1, 0); }
00741       break;
00742 
00743       /*-----------------------------------------------------------------*/
00744       case OP_ALLANY:
00745       if (clen > 0)
00746         { ADD_NEW(state_offset + 1, 0); }
00747       break;
00748 
00749       /*-----------------------------------------------------------------*/
00750       case OP_EODN:
00751       if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
00752         { ADD_ACTIVE(state_offset + 1, 0); }
00753       break;
00754 
00755       /*-----------------------------------------------------------------*/
00756       case OP_DOLL:
00757       if ((md->moptions & PCRE_NOTEOL) == 0)
00758         {
00759         if (clen == 0 ||
00760             (IS_NEWLINE(ptr) &&
00761                ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
00762             ))
00763           { ADD_ACTIVE(state_offset + 1, 0); }
00764         }
00765       else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
00766         { ADD_ACTIVE(state_offset + 1, 0); }
00767       break;
00768 
00769       /*-----------------------------------------------------------------*/
00770 
00771       case OP_DIGIT:
00772       case OP_WHITESPACE:
00773       case OP_WORDCHAR:
00774       if (clen > 0 && c < 256 &&
00775             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
00776         { ADD_NEW(state_offset + 1, 0); }
00777       break;
00778 
00779       /*-----------------------------------------------------------------*/
00780       case OP_NOT_DIGIT:
00781       case OP_NOT_WHITESPACE:
00782       case OP_NOT_WORDCHAR:
00783       if (clen > 0 && (c >= 256 ||
00784             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
00785         { ADD_NEW(state_offset + 1, 0); }
00786       break;
00787 
00788       /*-----------------------------------------------------------------*/
00789       case OP_WORD_BOUNDARY:
00790       case OP_NOT_WORD_BOUNDARY:
00791         {
00792         int left_word, right_word;
00793 
00794         if (ptr > start_subject)
00795           {
00796           const uschar *temp = ptr - 1;
00797 #ifdef SUPPORT_UTF8
00798           if (utf8) BACKCHAR(temp);
00799 #endif
00800           GETCHARTEST(d, temp);
00801           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
00802           }
00803         else left_word = 0;
00804 
00805         if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
00806           else right_word = 0;
00807 
00808         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
00809           { ADD_ACTIVE(state_offset + 1, 0); }
00810         }
00811       break;
00812 
00813 
00814       /*-----------------------------------------------------------------*/
00815       /* Check the next character by Unicode property. We will get here only
00816       if the support is in the binary; otherwise a compile-time error occurs.
00817       */
00818 
00819 #ifdef SUPPORT_UCP
00820       case OP_PROP:
00821       case OP_NOTPROP:
00822       if (clen > 0)
00823         {
00824         BOOL OK;
00825         const ucd_record * prop = GET_UCD(c);
00826         switch(code[1])
00827           {
00828           case PT_ANY:
00829           OK = TRUE;
00830           break;
00831 
00832           case PT_LAMP:
00833           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
00834           break;
00835 
00836           case PT_GC:
00837           OK = _pcre_ucp_gentype[prop->chartype] == code[2];
00838           break;
00839 
00840           case PT_PC:
00841           OK = prop->chartype == code[2];
00842           break;
00843 
00844           case PT_SC:
00845           OK = prop->script == code[2];
00846           break;
00847 
00848           /* Should never occur, but keep compilers from grumbling. */
00849 
00850           default:
00851           OK = codevalue != OP_PROP;
00852           break;
00853           }
00854 
00855         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
00856         }
00857       break;
00858 #endif
00859 
00860 
00861 
00862 /* ========================================================================== */
00863       /* These opcodes likewise inspect the subject character, but have an
00864       argument that is not a data character. It is one of these opcodes:
00865       OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
00866       OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
00867 
00868       case OP_TYPEPLUS:
00869       case OP_TYPEMINPLUS:
00870       case OP_TYPEPOSPLUS:
00871       count = current_state->count;  /* Already matched */
00872       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
00873       if (clen > 0)
00874         {
00875         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
00876             (c < 256 &&
00877               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
00878               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
00879           {
00880           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
00881             {
00882             active_count--;            /* Remove non-match possibility */
00883             next_active_state--;
00884             }
00885           count++;
00886           ADD_NEW(state_offset, count);
00887           }
00888         }
00889       break;
00890 
00891       /*-----------------------------------------------------------------*/
00892       case OP_TYPEQUERY:
00893       case OP_TYPEMINQUERY:
00894       case OP_TYPEPOSQUERY:
00895       ADD_ACTIVE(state_offset + 2, 0);
00896       if (clen > 0)
00897         {
00898         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
00899             (c < 256 &&
00900               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
00901               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
00902           {
00903           if (codevalue == OP_TYPEPOSQUERY)
00904             {
00905             active_count--;            /* Remove non-match possibility */
00906             next_active_state--;
00907             }
00908           ADD_NEW(state_offset + 2, 0);
00909           }
00910         }
00911       break;
00912 
00913       /*-----------------------------------------------------------------*/
00914       case OP_TYPESTAR:
00915       case OP_TYPEMINSTAR:
00916       case OP_TYPEPOSSTAR:
00917       ADD_ACTIVE(state_offset + 2, 0);
00918       if (clen > 0)
00919         {
00920         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
00921             (c < 256 &&
00922               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
00923               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
00924           {
00925           if (codevalue == OP_TYPEPOSSTAR)
00926             {
00927             active_count--;            /* Remove non-match possibility */
00928             next_active_state--;
00929             }
00930           ADD_NEW(state_offset, 0);
00931           }
00932         }
00933       break;
00934 
00935       /*-----------------------------------------------------------------*/
00936       case OP_TYPEEXACT:
00937       count = current_state->count;  /* Number already matched */
00938       if (clen > 0)
00939         {
00940         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
00941             (c < 256 &&
00942               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
00943               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
00944           {
00945           if (++count >= GET2(code, 1))
00946             { ADD_NEW(state_offset + 4, 0); }
00947           else
00948             { ADD_NEW(state_offset, count); }
00949           }
00950         }
00951       break;
00952 
00953       /*-----------------------------------------------------------------*/
00954       case OP_TYPEUPTO:
00955       case OP_TYPEMINUPTO:
00956       case OP_TYPEPOSUPTO:
00957       ADD_ACTIVE(state_offset + 4, 0);
00958       count = current_state->count;  /* Number already matched */
00959       if (clen > 0)
00960         {
00961         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
00962             (c < 256 &&
00963               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
00964               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
00965           {
00966           if (codevalue == OP_TYPEPOSUPTO)
00967             {
00968             active_count--;           /* Remove non-match possibility */
00969             next_active_state--;
00970             }
00971           if (++count >= GET2(code, 1))
00972             { ADD_NEW(state_offset + 4, 0); }
00973           else
00974             { ADD_NEW(state_offset, count); }
00975           }
00976         }
00977       break;
00978 
00979 /* ========================================================================== */
00980       /* These are virtual opcodes that are used when something like
00981       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or a horizontal
00982       or vertical space type (e.g. OP_HSPACE, OP_VSPACE) as its argument. This
00983       keeps the code above fast for the other cases. The argument is in d. */
00984 
00985 #ifdef SUPPORT_UCP
00986       case OP_PROP_EXTRA + OP_TYPEPLUS:
00987       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
00988       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
00989       count = current_state->count;           /* Already matched */
00990       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
00991       if (clen > 0)
00992         {
00993         BOOL OK;
00994         const ucd_record * prop = GET_UCD(c);
00995         switch(code[2])
00996           {
00997           case PT_ANY:
00998           OK = TRUE;
00999           break;
01000 
01001           case PT_LAMP:
01002           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
01003           break;
01004 
01005           case PT_GC:
01006           OK = _pcre_ucp_gentype[prop->chartype] == code[3];
01007           break;
01008 
01009           case PT_PC:
01010           OK = prop->chartype == code[3];
01011           break;
01012 
01013           case PT_SC:
01014           OK = prop->script == code[3];
01015           break;
01016 
01017           /* Should never occur, but keep compilers from grumbling. */
01018 
01019           default:
01020           OK = codevalue != OP_PROP;
01021           break;
01022           }
01023 
01024         if (OK == (d == OP_PROP))
01025           {
01026           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
01027             {
01028             active_count--;           /* Remove non-match possibility */
01029             next_active_state--;
01030             }
01031           count++;
01032           ADD_NEW(state_offset, count);
01033           }
01034         }
01035       break;
01036 
01037       /*-----------------------------------------------------------------*/
01038       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
01039       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
01040       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
01041       count = current_state->count;  /* Already matched */
01042       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
01043       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
01044         {
01045         const uschar *nptr = ptr + clen;
01046         int ncount = 0;
01047         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
01048           {
01049           active_count--;           /* Remove non-match possibility */
01050           next_active_state--;
01051           }
01052         while (nptr < end_subject)
01053           {
01054           int nd;
01055           int ndlen = 1;
01056           GETCHARLEN(nd, nptr, ndlen);
01057           if (UCD_CATEGORY(nd) != ucp_M) break;
01058           ncount++;
01059           nptr += ndlen;
01060           }
01061         count++;
01062         ADD_NEW_DATA(-state_offset, count, ncount);
01063         }
01064       break;
01065 #endif
01066 
01067       /*-----------------------------------------------------------------*/
01068       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
01069       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
01070       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
01071       count = current_state->count;  /* Already matched */
01072       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
01073       if (clen > 0)
01074         {
01075         int ncount = 0;
01076         switch (c)
01077           {
01078           case 0x000b:
01079           case 0x000c:
01080           case 0x0085:
01081           case 0x2028:
01082           case 0x2029:
01083           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
01084           goto ANYNL01;
01085 
01086           case 0x000d:
01087           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
01088           /* Fall through */
01089 
01090           ANYNL01:
01091           case 0x000a:
01092           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
01093             {
01094             active_count--;           /* Remove non-match possibility */
01095             next_active_state--;
01096             }
01097           count++;
01098           ADD_NEW_DATA(-state_offset, count, ncount);
01099           break;
01100 
01101           default:
01102           break;
01103           }
01104         }
01105       break;
01106 
01107       /*-----------------------------------------------------------------*/
01108       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
01109       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
01110       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
01111       count = current_state->count;  /* Already matched */
01112       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
01113       if (clen > 0)
01114         {
01115         BOOL OK;
01116         switch (c)
01117           {
01118           case 0x000a:
01119           case 0x000b:
01120           case 0x000c:
01121           case 0x000d:
01122           case 0x0085:
01123           case 0x2028:
01124           case 0x2029:
01125           OK = TRUE;
01126           break;
01127 
01128           default:
01129           OK = FALSE;
01130           break;
01131           }
01132 
01133         if (OK == (d == OP_VSPACE))
01134           {
01135           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
01136             {
01137             active_count--;           /* Remove non-match possibility */
01138             next_active_state--;
01139             }
01140           count++;
01141           ADD_NEW_DATA(-state_offset, count, 0);
01142           }
01143         }
01144       break;
01145 
01146       /*-----------------------------------------------------------------*/
01147       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
01148       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
01149       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
01150       count = current_state->count;  /* Already matched */
01151       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
01152       if (clen > 0)
01153         {
01154         BOOL OK;
01155         switch (c)
01156           {
01157           case 0x09:      /* HT */
01158           case 0x20:      /* SPACE */
01159           case 0xa0:      /* NBSP */
01160           case 0x1680:    /* OGHAM SPACE MARK */
01161           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
01162           case 0x2000:    /* EN QUAD */
01163           case 0x2001:    /* EM QUAD */
01164           case 0x2002:    /* EN SPACE */
01165           case 0x2003:    /* EM SPACE */
01166           case 0x2004:    /* THREE-PER-EM SPACE */
01167           case 0x2005:    /* FOUR-PER-EM SPACE */
01168           case 0x2006:    /* SIX-PER-EM SPACE */
01169           case 0x2007:    /* FIGURE SPACE */
01170           case 0x2008:    /* PUNCTUATION SPACE */
01171           case 0x2009:    /* THIN SPACE */
01172           case 0x200A:    /* HAIR SPACE */
01173           case 0x202f:    /* NARROW NO-BREAK SPACE */
01174           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
01175           case 0x3000:    /* IDEOGRAPHIC SPACE */
01176           OK = TRUE;
01177           break;
01178 
01179           default:
01180           OK = FALSE;
01181           break;
01182           }
01183 
01184         if (OK == (d == OP_HSPACE))
01185           {
01186           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
01187             {
01188             active_count--;           /* Remove non-match possibility */
01189             next_active_state--;
01190             }
01191           count++;
01192           ADD_NEW_DATA(-state_offset, count, 0);
01193           }
01194         }
01195       break;
01196 
01197       /*-----------------------------------------------------------------*/
01198 #ifdef SUPPORT_UCP
01199       case OP_PROP_EXTRA + OP_TYPEQUERY:
01200       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
01201       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
01202       count = 4;
01203       goto QS1;
01204 
01205       case OP_PROP_EXTRA + OP_TYPESTAR:
01206       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
01207       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
01208       count = 0;
01209 
01210       QS1:
01211 
01212       ADD_ACTIVE(state_offset + 4, 0);
01213       if (clen > 0)
01214         {
01215         BOOL OK;
01216         const ucd_record * prop = GET_UCD(c);
01217         switch(code[2])
01218           {
01219           case PT_ANY:
01220           OK = TRUE;
01221           break;
01222 
01223           case PT_LAMP:
01224           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
01225           break;
01226 
01227           case PT_GC:
01228           OK = _pcre_ucp_gentype[prop->chartype] == code[3];
01229           break;
01230 
01231           case PT_PC:
01232           OK = prop->chartype == code[3];
01233           break;
01234 
01235           case PT_SC:
01236           OK = prop->script == code[3];
01237           break;
01238 
01239           /* Should never occur, but keep compilers from grumbling. */
01240 
01241           default:
01242           OK = codevalue != OP_PROP;
01243           break;
01244           }
01245 
01246         if (OK == (d == OP_PROP))
01247           {
01248           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
01249               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
01250             {
01251             active_count--;           /* Remove non-match possibility */
01252             next_active_state--;
01253             }
01254           ADD_NEW(state_offset + count, 0);
01255           }
01256         }
01257       break;
01258 
01259       /*-----------------------------------------------------------------*/
01260       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
01261       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
01262       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
01263       count = 2;
01264       goto QS2;
01265 
01266       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
01267       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
01268       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
01269       count = 0;
01270 
01271       QS2:
01272 
01273       ADD_ACTIVE(state_offset + 2, 0);
01274       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
01275         {
01276         const uschar *nptr = ptr + clen;
01277         int ncount = 0;
01278         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
01279             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
01280           {
01281           active_count--;           /* Remove non-match possibility */
01282           next_active_state--;
01283           }
01284         while (nptr < end_subject)
01285           {
01286           int nd;
01287           int ndlen = 1;
01288           GETCHARLEN(nd, nptr, ndlen);
01289           if (UCD_CATEGORY(nd) != ucp_M) break;
01290           ncount++;
01291           nptr += ndlen;
01292           }
01293         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
01294         }
01295       break;
01296 #endif
01297 
01298       /*-----------------------------------------------------------------*/
01299       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
01300       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
01301       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
01302       count = 2;
01303       goto QS3;
01304 
01305       case OP_ANYNL_EXTRA + OP_TYPESTAR:
01306       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
01307       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
01308       count = 0;
01309 
01310       QS3:
01311       ADD_ACTIVE(state_offset + 2, 0);
01312       if (clen > 0)
01313         {
01314         int ncount = 0;
01315         switch (c)
01316           {
01317           case 0x000b:
01318           case 0x000c:
01319           case 0x0085:
01320           case 0x2028:
01321           case 0x2029:
01322           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
01323           goto ANYNL02;
01324 
01325           case 0x000d:
01326           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
01327           /* Fall through */
01328 
01329           ANYNL02:
01330           case 0x000a:
01331           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
01332               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
01333             {
01334             active_count--;           /* Remove non-match possibility */
01335             next_active_state--;
01336             }
01337           ADD_NEW_DATA(-(state_offset + count), 0, ncount);
01338           break;
01339 
01340           default:
01341           break;
01342           }
01343         }
01344       break;
01345 
01346       /*-----------------------------------------------------------------*/
01347       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
01348       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
01349       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
01350       count = 2;
01351       goto QS4;
01352 
01353       case OP_VSPACE_EXTRA + OP_TYPESTAR:
01354       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
01355       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
01356       count = 0;
01357 
01358       QS4:
01359       ADD_ACTIVE(state_offset + 2, 0);
01360       if (clen > 0)
01361         {
01362         BOOL OK;
01363         switch (c)
01364           {
01365           case 0x000a:
01366           case 0x000b:
01367           case 0x000c:
01368           case 0x000d:
01369           case 0x0085:
01370           case 0x2028:
01371           case 0x2029:
01372           OK = TRUE;
01373           break;
01374 
01375           default:
01376           OK = FALSE;
01377           break;
01378           }
01379         if (OK == (d == OP_VSPACE))
01380           {
01381           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
01382               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
01383             {
01384             active_count--;           /* Remove non-match possibility */
01385             next_active_state--;
01386             }
01387           ADD_NEW_DATA(-(state_offset + count), 0, 0);
01388           }
01389         }
01390       break;
01391 
01392       /*-----------------------------------------------------------------*/
01393       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
01394       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
01395       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
01396       count = 2;
01397       goto QS5;
01398 
01399       case OP_HSPACE_EXTRA + OP_TYPESTAR:
01400       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
01401       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
01402       count = 0;
01403 
01404       QS5:
01405       ADD_ACTIVE(state_offset + 2, 0);
01406       if (clen > 0)
01407         {
01408         BOOL OK;
01409         switch (c)
01410           {
01411           case 0x09:      /* HT */
01412           case 0x20:      /* SPACE */
01413           case 0xa0:      /* NBSP */
01414           case 0x1680:    /* OGHAM SPACE MARK */
01415           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
01416           case 0x2000:    /* EN QUAD */
01417           case 0x2001:    /* EM QUAD */
01418           case 0x2002:    /* EN SPACE */
01419           case 0x2003:    /* EM SPACE */
01420           case 0x2004:    /* THREE-PER-EM SPACE */
01421           case 0x2005:    /* FOUR-PER-EM SPACE */
01422           case 0x2006:    /* SIX-PER-EM SPACE */
01423           case 0x2007:    /* FIGURE SPACE */
01424           case 0x2008:    /* PUNCTUATION SPACE */
01425           case 0x2009:    /* THIN SPACE */
01426           case 0x200A:    /* HAIR SPACE */
01427           case 0x202f:    /* NARROW NO-BREAK SPACE */
01428           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
01429           case 0x3000:    /* IDEOGRAPHIC SPACE */
01430           OK = TRUE;
01431           break;
01432 
01433           default:
01434           OK = FALSE;
01435           break;
01436           }
01437 
01438         if (OK == (d == OP_HSPACE))
01439           {
01440           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
01441               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
01442             {
01443             active_count--;           /* Remove non-match possibility */
01444             next_active_state--;
01445             }
01446           ADD_NEW_DATA(-(state_offset + count), 0, 0);
01447           }
01448         }
01449       break;
01450 
01451       /*-----------------------------------------------------------------*/
01452 #ifdef SUPPORT_UCP
01453       case OP_PROP_EXTRA + OP_TYPEEXACT:
01454       case OP_PROP_EXTRA + OP_TYPEUPTO:
01455       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
01456       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
01457       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
01458         { ADD_ACTIVE(state_offset + 6, 0); }
01459       count = current_state->count;  /* Number already matched */
01460       if (clen > 0)
01461         {
01462         BOOL OK;
01463         const ucd_record * prop = GET_UCD(c);
01464         switch(code[4])
01465           {
01466           case PT_ANY:
01467           OK = TRUE;
01468           break;
01469 
01470           case PT_LAMP:
01471           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
01472           break;
01473 
01474           case PT_GC:
01475           OK = _pcre_ucp_gentype[prop->chartype] == code[5];
01476           break;
01477 
01478           case PT_PC:
01479           OK = prop->chartype == code[5];
01480           break;
01481 
01482           case PT_SC:
01483           OK = prop->script == code[5];
01484           break;
01485 
01486           /* Should never occur, but keep compilers from grumbling. */
01487 
01488           default:
01489           OK = codevalue != OP_PROP;
01490           break;
01491           }
01492 
01493         if (OK == (d == OP_PROP))
01494           {
01495           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
01496             {
01497             active_count--;           /* Remove non-match possibility */
01498             next_active_state--;
01499             }
01500           if (++count >= GET2(code, 1))
01501             { ADD_NEW(state_offset + 6, 0); }
01502           else
01503             { ADD_NEW(state_offset, count); }
01504           }
01505         }
01506       break;
01507 
01508       /*-----------------------------------------------------------------*/
01509       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
01510       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
01511       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
01512       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
01513       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
01514         { ADD_ACTIVE(state_offset + 4, 0); }
01515       count = current_state->count;  /* Number already matched */
01516       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
01517         {
01518         const uschar *nptr = ptr + clen;
01519         int ncount = 0;
01520         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
01521           {
01522           active_count--;           /* Remove non-match possibility */
01523           next_active_state--;
01524           }
01525         while (nptr < end_subject)
01526           {
01527           int nd;
01528           int ndlen = 1;
01529           GETCHARLEN(nd, nptr, ndlen);
01530           if (UCD_CATEGORY(nd) != ucp_M) break;
01531           ncount++;
01532           nptr += ndlen;
01533           }
01534         if (++count >= GET2(code, 1))
01535           { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
01536         else
01537           { ADD_NEW_DATA(-state_offset, count, ncount); }
01538         }
01539       break;
01540 #endif
01541 
01542       /*-----------------------------------------------------------------*/
01543       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
01544       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
01545       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
01546       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
01547       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
01548         { ADD_ACTIVE(state_offset + 4, 0); }
01549       count = current_state->count;  /* Number already matched */
01550       if (clen > 0)
01551         {
01552         int ncount = 0;
01553         switch (c)
01554           {
01555           case 0x000b:
01556           case 0x000c:
01557           case 0x0085:
01558           case 0x2028:
01559           case 0x2029:
01560           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
01561           goto ANYNL03;
01562 
01563           case 0x000d:
01564           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
01565           /* Fall through */
01566 
01567           ANYNL03:
01568           case 0x000a:
01569           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
01570             {
01571             active_count--;           /* Remove non-match possibility */
01572             next_active_state--;
01573             }
01574           if (++count >= GET2(code, 1))
01575             { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
01576           else
01577             { ADD_NEW_DATA(-state_offset, count, ncount); }
01578           break;
01579 
01580           default:
01581           break;
01582           }
01583         }
01584       break;
01585 
01586       /*-----------------------------------------------------------------*/
01587       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
01588       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
01589       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
01590       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
01591       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
01592         { ADD_ACTIVE(state_offset + 4, 0); }
01593       count = current_state->count;  /* Number already matched */
01594       if (clen > 0)
01595         {
01596         BOOL OK;
01597         switch (c)
01598           {
01599           case 0x000a:
01600           case 0x000b:
01601           case 0x000c:
01602           case 0x000d:
01603           case 0x0085:
01604           case 0x2028:
01605           case 0x2029:
01606           OK = TRUE;
01607           break;
01608 
01609           default:
01610           OK = FALSE;
01611           }
01612 
01613         if (OK == (d == OP_VSPACE))
01614           {
01615           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
01616             {
01617             active_count--;           /* Remove non-match possibility */
01618             next_active_state--;
01619             }
01620           if (++count >= GET2(code, 1))
01621             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
01622           else
01623             { ADD_NEW_DATA(-state_offset, count, 0); }
01624           }
01625         }
01626       break;
01627 
01628       /*-----------------------------------------------------------------*/
01629       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
01630       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
01631       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
01632       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
01633       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
01634         { ADD_ACTIVE(state_offset + 4, 0); }
01635       count = current_state->count;  /* Number already matched */
01636       if (clen > 0)
01637         {
01638         BOOL OK;
01639         switch (c)
01640           {
01641           case 0x09:      /* HT */
01642           case 0x20:      /* SPACE */
01643           case 0xa0:      /* NBSP */
01644           case 0x1680:    /* OGHAM SPACE MARK */
01645           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
01646           case 0x2000:    /* EN QUAD */
01647           case 0x2001:    /* EM QUAD */
01648           case 0x2002:    /* EN SPACE */
01649           case 0x2003:    /* EM SPACE */
01650           case 0x2004:    /* THREE-PER-EM SPACE */
01651           case 0x2005:    /* FOUR-PER-EM SPACE */
01652           case 0x2006:    /* SIX-PER-EM SPACE */
01653           case 0x2007:    /* FIGURE SPACE */
01654           case 0x2008:    /* PUNCTUATION SPACE */
01655           case 0x2009:    /* THIN SPACE */
01656           case 0x200A:    /* HAIR SPACE */
01657           case 0x202f:    /* NARROW NO-BREAK SPACE */
01658           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
01659           case 0x3000:    /* IDEOGRAPHIC SPACE */
01660           OK = TRUE;
01661           break;
01662 
01663           default:
01664           OK = FALSE;
01665           break;
01666           }
01667 
01668         if (OK == (d == OP_HSPACE))
01669           {
01670           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
01671             {
01672             active_count--;           /* Remove non-match possibility */
01673             next_active_state--;
01674             }
01675           if (++count >= GET2(code, 1))
01676             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
01677           else
01678             { ADD_NEW_DATA(-state_offset, count, 0); }
01679           }
01680         }
01681       break;
01682 
01683 /* ========================================================================== */
01684       /* These opcodes are followed by a character that is usually compared
01685       to the current subject character; it is loaded into d. We still get
01686       here even if there is no subject character, because in some cases zero
01687       repetitions are permitted. */
01688 
01689       /*-----------------------------------------------------------------*/
01690       case OP_CHAR:
01691       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
01692       break;
01693 
01694       /*-----------------------------------------------------------------*/
01695       case OP_CHARNC:
01696       if (clen == 0) break;
01697 
01698 #ifdef SUPPORT_UTF8
01699       if (utf8)
01700         {
01701         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
01702           {
01703           unsigned int othercase;
01704           if (c < 128) othercase = fcc[c]; else
01705 
01706           /* If we have Unicode property support, we can use it to test the
01707           other case of the character. */
01708 
01709 #ifdef SUPPORT_UCP
01710           othercase = UCD_OTHERCASE(c);
01711 #else
01712           othercase = NOTACHAR;
01713 #endif
01714 
01715           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
01716           }
01717         }
01718       else
01719 #endif  /* SUPPORT_UTF8 */
01720 
01721       /* Non-UTF-8 mode */
01722         {
01723         if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
01724         }
01725       break;
01726 
01727 
01728 #ifdef SUPPORT_UCP
01729       /*-----------------------------------------------------------------*/
01730       /* This is a tricky one because it can match more than one character.
01731       Find out how many characters to skip, and then set up a negative state
01732       to wait for them to pass before continuing. */
01733 
01734       case OP_EXTUNI:
01735       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
01736         {
01737         const uschar *nptr = ptr + clen;
01738         int ncount = 0;
01739         while (nptr < end_subject)
01740           {
01741           int nclen = 1;
01742           GETCHARLEN(c, nptr, nclen);
01743           if (UCD_CATEGORY(c) != ucp_M) break;
01744           ncount++;
01745           nptr += nclen;
01746           }
01747         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
01748         }
01749       break;
01750 #endif
01751 
01752       /*-----------------------------------------------------------------*/
01753       /* This is tricky, like EXTUNI, because it too can match more than one
01754       character (when CR is followed by LF). In this case, set up a negative
01755       state to wait for one character to pass before continuing. */
01756 
01757       case OP_ANYNL:
01758       if (clen > 0) switch(c)
01759         {
01760         case 0x000b:
01761         case 0x000c:
01762         case 0x0085:
01763         case 0x2028:
01764         case 0x2029:
01765         if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
01766 
01767         case 0x000a:
01768         ADD_NEW(state_offset + 1, 0);
01769         break;
01770 
01771         case 0x000d:
01772         if (ptr + 1 < end_subject && ptr[1] == 0x0a)
01773           {
01774           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
01775           }
01776         else
01777           {
01778           ADD_NEW(state_offset + 1, 0);
01779           }
01780         break;
01781         }
01782       break;
01783 
01784       /*-----------------------------------------------------------------*/
01785       case OP_NOT_VSPACE:
01786       if (clen > 0) switch(c)
01787         {
01788         case 0x000a:
01789         case 0x000b:
01790         case 0x000c:
01791         case 0x000d:
01792         case 0x0085:
01793         case 0x2028:
01794         case 0x2029:
01795         break;
01796 
01797         default:
01798         ADD_NEW(state_offset + 1, 0);
01799         break;
01800         }
01801       break;
01802 
01803       /*-----------------------------------------------------------------*/
01804       case OP_VSPACE:
01805       if (clen > 0) switch(c)
01806         {
01807         case 0x000a:
01808         case 0x000b:
01809         case 0x000c:
01810         case 0x000d:
01811         case 0x0085:
01812         case 0x2028:
01813         case 0x2029:
01814         ADD_NEW(state_offset + 1, 0);
01815         break;
01816 
01817         default: break;
01818         }
01819       break;
01820 
01821       /*-----------------------------------------------------------------*/
01822       case OP_NOT_HSPACE:
01823       if (clen > 0) switch(c)
01824         {
01825         case 0x09:      /* HT */
01826         case 0x20:      /* SPACE */
01827         case 0xa0:      /* NBSP */
01828         case 0x1680:    /* OGHAM SPACE MARK */
01829         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
01830         case 0x2000:    /* EN QUAD */
01831         case 0x2001:    /* EM QUAD */
01832         case 0x2002:    /* EN SPACE */
01833         case 0x2003:    /* EM SPACE */
01834         case 0x2004:    /* THREE-PER-EM SPACE */
01835         case 0x2005:    /* FOUR-PER-EM SPACE */
01836         case 0x2006:    /* SIX-PER-EM SPACE */
01837         case 0x2007:    /* FIGURE SPACE */
01838         case 0x2008:    /* PUNCTUATION SPACE */
01839         case 0x2009:    /* THIN SPACE */
01840         case 0x200A:    /* HAIR SPACE */
01841         case 0x202f:    /* NARROW NO-BREAK SPACE */
01842         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
01843         case 0x3000:    /* IDEOGRAPHIC SPACE */
01844         break;
01845 
01846         default:
01847         ADD_NEW(state_offset + 1, 0);
01848         break;
01849         }
01850       break;
01851 
01852       /*-----------------------------------------------------------------*/
01853       case OP_HSPACE:
01854       if (clen > 0) switch(c)
01855         {
01856         case 0x09:      /* HT */
01857         case 0x20:      /* SPACE */
01858         case 0xa0:      /* NBSP */
01859         case 0x1680:    /* OGHAM SPACE MARK */
01860         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
01861         case 0x2000:    /* EN QUAD */
01862         case 0x2001:    /* EM QUAD */
01863         case 0x2002:    /* EN SPACE */
01864         case 0x2003:    /* EM SPACE */
01865         case 0x2004:    /* THREE-PER-EM SPACE */
01866         case 0x2005:    /* FOUR-PER-EM SPACE */
01867         case 0x2006:    /* SIX-PER-EM SPACE */
01868         case 0x2007:    /* FIGURE SPACE */
01869         case 0x2008:    /* PUNCTUATION SPACE */
01870         case 0x2009:    /* THIN SPACE */
01871         case 0x200A:    /* HAIR SPACE */
01872         case 0x202f:    /* NARROW NO-BREAK SPACE */
01873         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
01874         case 0x3000:    /* IDEOGRAPHIC SPACE */
01875         ADD_NEW(state_offset + 1, 0);
01876         break;
01877         }
01878       break;
01879 
01880       /*-----------------------------------------------------------------*/
01881       /* Match a negated single character. This is only used for one-byte
01882       characters, that is, we know that d < 256. The character we are
01883       checking (c) can be multibyte. */
01884 
01885       case OP_NOT:
01886       if (clen > 0)
01887         {
01888         unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
01889         if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
01890         }
01891       break;
01892 
01893       /*-----------------------------------------------------------------*/
01894       case OP_PLUS:
01895       case OP_MINPLUS:
01896       case OP_POSPLUS:
01897       case OP_NOTPLUS:
01898       case OP_NOTMINPLUS:
01899       case OP_NOTPOSPLUS:
01900       count = current_state->count;  /* Already matched */
01901       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
01902       if (clen > 0)
01903         {
01904         unsigned int otherd = NOTACHAR;
01905         if ((ims & PCRE_CASELESS) != 0)
01906           {
01907 #ifdef SUPPORT_UTF8
01908           if (utf8 && d >= 128)
01909             {
01910 #ifdef SUPPORT_UCP
01911             otherd = UCD_OTHERCASE(d);
01912 #endif  /* SUPPORT_UCP */
01913             }
01914           else
01915 #endif  /* SUPPORT_UTF8 */
01916           otherd = fcc[d];
01917           }
01918         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
01919           {
01920           if (count > 0 &&
01921               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
01922             {
01923             active_count--;             /* Remove non-match possibility */
01924             next_active_state--;
01925             }
01926           count++;
01927           ADD_NEW(state_offset, count);
01928           }
01929         }
01930       break;
01931 
01932       /*-----------------------------------------------------------------*/
01933       case OP_QUERY:
01934       case OP_MINQUERY:
01935       case OP_POSQUERY:
01936       case OP_NOTQUERY:
01937       case OP_NOTMINQUERY:
01938       case OP_NOTPOSQUERY:
01939       ADD_ACTIVE(state_offset + dlen + 1, 0);
01940       if (clen > 0)
01941         {
01942         unsigned int otherd = NOTACHAR;
01943         if ((ims & PCRE_CASELESS) != 0)
01944           {
01945 #ifdef SUPPORT_UTF8
01946           if (utf8 && d >= 128)
01947             {
01948 #ifdef SUPPORT_UCP
01949             otherd = UCD_OTHERCASE(d);
01950 #endif  /* SUPPORT_UCP */
01951             }
01952           else
01953 #endif  /* SUPPORT_UTF8 */
01954           otherd = fcc[d];
01955           }
01956         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
01957           {
01958           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
01959             {
01960             active_count--;            /* Remove non-match possibility */
01961             next_active_state--;
01962             }
01963           ADD_NEW(state_offset + dlen + 1, 0);
01964           }
01965         }
01966       break;
01967 
01968       /*-----------------------------------------------------------------*/
01969       case OP_STAR:
01970       case OP_MINSTAR:
01971       case OP_POSSTAR:
01972       case OP_NOTSTAR:
01973       case OP_NOTMINSTAR:
01974       case OP_NOTPOSSTAR:
01975       ADD_ACTIVE(state_offset + dlen + 1, 0);
01976       if (clen > 0)
01977         {
01978         unsigned int otherd = NOTACHAR;
01979         if ((ims & PCRE_CASELESS) != 0)
01980           {
01981 #ifdef SUPPORT_UTF8
01982           if (utf8 && d >= 128)
01983             {
01984 #ifdef SUPPORT_UCP
01985             otherd = UCD_OTHERCASE(d);
01986 #endif  /* SUPPORT_UCP */
01987             }
01988           else
01989 #endif  /* SUPPORT_UTF8 */
01990           otherd = fcc[d];
01991           }
01992         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
01993           {
01994           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
01995             {
01996             active_count--;            /* Remove non-match possibility */
01997             next_active_state--;
01998             }
01999           ADD_NEW(state_offset, 0);
02000           }
02001         }
02002       break;
02003 
02004       /*-----------------------------------------------------------------*/
02005       case OP_EXACT:
02006       case OP_NOTEXACT:
02007       count = current_state->count;  /* Number already matched */
02008       if (clen > 0)
02009         {
02010         unsigned int otherd = NOTACHAR;
02011         if ((ims & PCRE_CASELESS) != 0)
02012           {
02013 #ifdef SUPPORT_UTF8
02014           if (utf8 && d >= 128)
02015             {
02016 #ifdef SUPPORT_UCP
02017             otherd = UCD_OTHERCASE(d);
02018 #endif  /* SUPPORT_UCP */
02019             }
02020           else
02021 #endif  /* SUPPORT_UTF8 */
02022           otherd = fcc[d];
02023           }
02024         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
02025           {
02026           if (++count >= GET2(code, 1))
02027             { ADD_NEW(state_offset + dlen + 3, 0); }
02028           else
02029             { ADD_NEW(state_offset, count); }
02030           }
02031         }
02032       break;
02033 
02034       /*-----------------------------------------------------------------*/
02035       case OP_UPTO:
02036       case OP_MINUPTO:
02037       case OP_POSUPTO:
02038       case OP_NOTUPTO:
02039       case OP_NOTMINUPTO:
02040       case OP_NOTPOSUPTO:
02041       ADD_ACTIVE(state_offset + dlen + 3, 0);
02042       count = current_state->count;  /* Number already matched */
02043       if (clen > 0)
02044         {
02045         unsigned int otherd = NOTACHAR;
02046         if ((ims & PCRE_CASELESS) != 0)
02047           {
02048 #ifdef SUPPORT_UTF8
02049           if (utf8 && d >= 128)
02050             {
02051 #ifdef SUPPORT_UCP
02052             otherd = UCD_OTHERCASE(d);
02053 #endif  /* SUPPORT_UCP */
02054             }
02055           else
02056 #endif  /* SUPPORT_UTF8 */
02057           otherd = fcc[d];
02058           }
02059         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
02060           {
02061           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
02062             {
02063             active_count--;             /* Remove non-match possibility */
02064             next_active_state--;
02065             }
02066           if (++count >= GET2(code, 1))
02067             { ADD_NEW(state_offset + dlen + 3, 0); }
02068           else
02069             { ADD_NEW(state_offset, count); }
02070           }
02071         }
02072       break;
02073 
02074 
02075 /* ========================================================================== */
02076       /* These are the class-handling opcodes */
02077 
02078       case OP_CLASS:
02079       case OP_NCLASS:
02080       case OP_XCLASS:
02081         {
02082         BOOL isinclass = FALSE;
02083         int next_state_offset;
02084         const uschar *ecode;
02085 
02086         /* For a simple class, there is always just a 32-byte table, and we
02087         can set isinclass from it. */
02088 
02089         if (codevalue != OP_XCLASS)
02090           {
02091           ecode = code + 33;
02092           if (clen > 0)
02093             {
02094             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
02095               ((code[1 + c/8] & (1 << (c&7))) != 0);
02096             }
02097           }
02098 
02099         /* An extended class may have a table or a list of single characters,
02100         ranges, or both, and it may be positive or negative. There's a
02101         function that sorts all this out. */
02102 
02103         else
02104          {
02105          ecode = code + GET(code, 1);
02106          if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
02107          }
02108 
02109         /* At this point, isinclass is set for all kinds of class, and ecode
02110         points to the byte after the end of the class. If there is a
02111         quantifier, this is where it will be. */
02112 
02113         next_state_offset = ecode - start_code;
02114 
02115         switch (*ecode)
02116           {
02117           case OP_CRSTAR:
02118           case OP_CRMINSTAR:
02119           ADD_ACTIVE(next_state_offset + 1, 0);
02120           if (isinclass) { ADD_NEW(state_offset, 0); }
02121           break;
02122 
02123           case OP_CRPLUS:
02124           case OP_CRMINPLUS:
02125           count = current_state->count;  /* Already matched */
02126           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
02127           if (isinclass) { count++; ADD_NEW(state_offset, count); }
02128           break;
02129 
02130           case OP_CRQUERY:
02131           case OP_CRMINQUERY:
02132           ADD_ACTIVE(next_state_offset + 1, 0);
02133           if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
02134           break;
02135 
02136           case OP_CRRANGE:
02137           case OP_CRMINRANGE:
02138           count = current_state->count;  /* Already matched */
02139           if (count >= GET2(ecode, 1))
02140             { ADD_ACTIVE(next_state_offset + 5, 0); }
02141           if (isinclass)
02142             {
02143             int max = GET2(ecode, 3);
02144             if (++count >= max && max != 0)   /* Max 0 => no limit */
02145               { ADD_NEW(next_state_offset + 5, 0); }
02146             else
02147               { ADD_NEW(state_offset, count); }
02148             }
02149           break;
02150 
02151           default:
02152           if (isinclass) { ADD_NEW(next_state_offset, 0); }
02153           break;
02154           }
02155         }
02156       break;
02157 
02158 /* ========================================================================== */
02159       /* These are the opcodes for fancy brackets of various kinds. We have
02160       to use recursion in order to handle them. The "always failing" assertion
02161       (?!) is optimised when compiling to OP_FAIL, so we have to support that,
02162       though the other "backtracking verbs" are not supported. */
02163 
02164       case OP_FAIL:
02165       break;
02166 
02167       case OP_ASSERT:
02168       case OP_ASSERT_NOT:
02169       case OP_ASSERTBACK:
02170       case OP_ASSERTBACK_NOT:
02171         {
02172         int rc;
02173         int local_offsets[2];
02174         int local_workspace[1000];
02175         const uschar *endasscode = code + GET(code, 1);
02176 
02177         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
02178 
02179         rc = internal_dfa_exec(
02180           md,                                   /* static match data */
02181           code,                                 /* this subexpression's code */
02182           ptr,                                  /* where we currently are */
02183           ptr - start_subject,                  /* start offset */
02184           local_offsets,                        /* offset vector */
02185           sizeof(local_offsets)/sizeof(int),    /* size of same */
02186           local_workspace,                      /* workspace vector */
02187           sizeof(local_workspace)/sizeof(int),  /* size of same */
02188           ims,                                  /* the current ims flags */
02189           rlevel,                               /* function recursion level */
02190           recursing);                           /* pass on regex recursion */
02191 
02192         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
02193             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
02194         }
02195       break;
02196 
02197       /*-----------------------------------------------------------------*/
02198       case OP_COND:
02199       case OP_SCOND:
02200         {
02201         int local_offsets[1000];
02202         int local_workspace[1000];
02203         int condcode = code[LINK_SIZE+1];
02204 
02205         /* Back reference conditions are not supported */
02206 
02207         if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
02208 
02209         /* The DEFINE condition is always false */
02210 
02211         if (condcode == OP_DEF)
02212           {
02213           ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
02214           }
02215 
02216         /* The only supported version of OP_RREF is for the value RREF_ANY,
02217         which means "test if in any recursion". We can't test for specifically
02218         recursed groups. */
02219 
02220         else if (condcode == OP_RREF)
02221           {
02222           int value = GET2(code, LINK_SIZE+2);
02223           if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
02224           if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
02225             else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
02226           }
02227 
02228         /* Otherwise, the condition is an assertion */
02229 
02230         else
02231           {
02232           int rc;
02233           const uschar *asscode = code + LINK_SIZE + 1;
02234           const uschar *endasscode = asscode + GET(asscode, 1);
02235 
02236           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
02237 
02238           rc = internal_dfa_exec(
02239             md,                                   /* fixed match data */
02240             asscode,                              /* this subexpression's code */
02241             ptr,                                  /* where we currently are */
02242             ptr - start_subject,                  /* start offset */
02243             local_offsets,                        /* offset vector */
02244             sizeof(local_offsets)/sizeof(int),    /* size of same */
02245             local_workspace,                      /* workspace vector */
02246             sizeof(local_workspace)/sizeof(int),  /* size of same */
02247             ims,                                  /* the current ims flags */
02248             rlevel,                               /* function recursion level */
02249             recursing);                           /* pass on regex recursion */
02250 
02251           if ((rc >= 0) ==
02252                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
02253             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
02254           else
02255             { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
02256           }
02257         }
02258       break;
02259 
02260       /*-----------------------------------------------------------------*/
02261       case OP_RECURSE:
02262         {
02263         int local_offsets[1000];
02264         int local_workspace[1000];
02265         int rc;
02266 
02267         DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
02268           recursing + 1));
02269 
02270         rc = internal_dfa_exec(
02271           md,                                   /* fixed match data */
02272           start_code + GET(code, 1),            /* this subexpression's code */
02273           ptr,                                  /* where we currently are */
02274           ptr - start_subject,                  /* start offset */
02275           local_offsets,                        /* offset vector */
02276           sizeof(local_offsets)/sizeof(int),    /* size of same */
02277           local_workspace,                      /* workspace vector */
02278           sizeof(local_workspace)/sizeof(int),  /* size of same */
02279           ims,                                  /* the current ims flags */
02280           rlevel,                               /* function recursion level */
02281           recursing + 1);                       /* regex recurse level */
02282 
02283         DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
02284           recursing + 1, rc));
02285 
02286         /* Ran out of internal offsets */
02287 
02288         if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
02289 
02290         /* For each successfully matched substring, set up the next state with a
02291         count of characters to skip before trying it. Note that the count is in
02292         characters, not bytes. */
02293 
02294         if (rc > 0)
02295           {
02296           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
02297             {
02298             const uschar *p = start_subject + local_offsets[rc];
02299             const uschar *pp = start_subject + local_offsets[rc+1];
02300             int charcount = local_offsets[rc+1] - local_offsets[rc];
02301             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
02302             if (charcount > 0)
02303               {
02304               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
02305               }
02306             else
02307               {
02308               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
02309               }
02310             }
02311           }
02312         else if (rc != PCRE_ERROR_NOMATCH) return rc;
02313         }
02314       break;
02315 
02316       /*-----------------------------------------------------------------*/
02317       case OP_ONCE:
02318         {
02319         int local_offsets[2];
02320         int local_workspace[1000];
02321 
02322         int rc = internal_dfa_exec(
02323           md,                                   /* fixed match data */
02324           code,                                 /* this subexpression's code */
02325           ptr,                                  /* where we currently are */
02326           ptr - start_subject,                  /* start offset */
02327           local_offsets,                        /* offset vector */
02328           sizeof(local_offsets)/sizeof(int),    /* size of same */
02329           local_workspace,                      /* workspace vector */
02330           sizeof(local_workspace)/sizeof(int),  /* size of same */
02331           ims,                                  /* the current ims flags */
02332           rlevel,                               /* function recursion level */
02333           recursing);                           /* pass on regex recursion */
02334 
02335         if (rc >= 0)
02336           {
02337           const uschar *end_subpattern = code;
02338           int charcount = local_offsets[1] - local_offsets[0];
02339           int next_state_offset, repeat_state_offset;
02340 
02341           do { end_subpattern += GET(end_subpattern, 1); }
02342             while (*end_subpattern == OP_ALT);
02343           next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
02344 
02345           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
02346           arrange for the repeat state also to be added to the relevant list.
02347           Calculate the offset, or set -1 for no repeat. */
02348 
02349           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
02350                                  *end_subpattern == OP_KETRMIN)?
02351             end_subpattern - start_code - GET(end_subpattern, 1) : -1;
02352 
02353           /* If we have matched an empty string, add the next state at the
02354           current character pointer. This is important so that the duplicate
02355           checking kicks in, which is what breaks infinite loops that match an
02356           empty string. */
02357 
02358           if (charcount == 0)
02359             {
02360             ADD_ACTIVE(next_state_offset, 0);
02361             }
02362 
02363           /* Optimization: if there are no more active states, and there
02364           are no new states yet set up, then skip over the subject string
02365           right here, to save looping. Otherwise, set up the new state to swing
02366           into action when the end of the substring is reached. */
02367 
02368           else if (i + 1 >= active_count && new_count == 0)
02369             {
02370             ptr += charcount;
02371             clen = 0;
02372             ADD_NEW(next_state_offset, 0);
02373 
02374             /* If we are adding a repeat state at the new character position,
02375             we must fudge things so that it is the only current state.
02376             Otherwise, it might be a duplicate of one we processed before, and
02377             that would cause it to be skipped. */
02378 
02379             if (repeat_state_offset >= 0)
02380               {
02381               next_active_state = active_states;
02382               active_count = 0;
02383               i = -1;
02384               ADD_ACTIVE(repeat_state_offset, 0);
02385               }
02386             }
02387           else
02388             {
02389             const uschar *p = start_subject + local_offsets[0];
02390             const uschar *pp = start_subject + local_offsets[1];
02391             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
02392             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
02393             if (repeat_state_offset >= 0)
02394               { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
02395             }
02396 
02397           }
02398         else if (rc != PCRE_ERROR_NOMATCH) return rc;
02399         }
02400       break;
02401 
02402 
02403 /* ========================================================================== */
02404       /* Handle callouts */
02405 
02406       case OP_CALLOUT:
02407       if (pcre_callout != NULL)
02408         {
02409         int rrc;
02410         pcre_callout_block cb;
02411         cb.version          = 1;   /* Version 1 of the callout block */
02412         cb.callout_number   = code[1];
02413         cb.offset_vector    = offsets;
02414         cb.subject          = (PCRE_SPTR)start_subject;
02415         cb.subject_length   = end_subject - start_subject;
02416         cb.start_match      = current_subject - start_subject;
02417         cb.current_position = ptr - start_subject;
02418         cb.pattern_position = GET(code, 2);
02419         cb.next_item_length = GET(code, 2 + LINK_SIZE);
02420         cb.capture_top      = 1;
02421         cb.capture_last     = -1;
02422         cb.callout_data     = md->callout_data;
02423         if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
02424         if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
02425         }
02426       break;
02427 
02428 
02429 /* ========================================================================== */
02430       default:        /* Unsupported opcode */
02431       return PCRE_ERROR_DFA_UITEM;
02432       }
02433 
02434     NEXT_ACTIVE_STATE: continue;
02435 
02436     }      /* End of loop scanning active states */
02437 
02438   /* We have finished the processing at the current subject character. If no
02439   new states have been set for the next character, we have found all the
02440   matches that we are going to find. If we are at the top level and partial
02441   matching has been requested, check for appropriate conditions. */
02442 
02443   if (new_count <= 0)
02444     {
02445     if (match_count < 0 &&                     /* No matches found */
02446         rlevel == 1 &&                         /* Top level match function */
02447         (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */
02448         ptr >= end_subject &&                  /* Reached end of subject */
02449         ptr > current_subject)                 /* Matched non-empty string */
02450       {
02451       if (offsetcount >= 2)
02452         {
02453         offsets[0] = current_subject - start_subject;
02454         offsets[1] = end_subject - start_subject;
02455         }
02456       match_count = PCRE_ERROR_PARTIAL;
02457       }
02458 
02459     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
02460       "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
02461       rlevel*2-2, SP));
02462     break;        /* In effect, "return", but see the comment below */
02463     }
02464 
02465   /* One or more states are active for the next character. */
02466 
02467   ptr += clen;    /* Advance to next subject character */
02468   }               /* Loop to move along the subject string */
02469 
02470 /* Control gets here from "break" a few lines above. We do it this way because
02471 if we use "return" above, we have compiler trouble. Some compilers warn if
02472 there's nothing here because they think the function doesn't return a value. On
02473 the other hand, if we put a dummy statement here, some more clever compilers
02474 complain that it can't be reached. Sigh. */
02475 
02476 return match_count;
02477 }
02478 
02479 
02480 
02481 
02482 /*************************************************
02483 *    Execute a Regular Expression - DFA engine   *
02484 *************************************************/
02485 
02486 /* This external function applies a compiled re to a subject string using a DFA
02487 engine. This function calls the internal function multiple times if the pattern
02488 is not anchored.
02489 
02490 Arguments:
02491   argument_re     points to the compiled expression
02492   extra_data      points to extra data or is NULL
02493   subject         points to the subject string
02494   length          length of subject string (may contain binary zeros)
02495   start_offset    where to start in the subject string
02496   options         option bits
02497   offsets         vector of match offsets
02498   offsetcount     size of same
02499   workspace       workspace vector
02500   wscount         size of same
02501 
02502 Returns:          > 0 => number of match offset pairs placed in offsets
02503                   = 0 => offsets overflowed; longest matches are present
02504                    -1 => failed to match
02505                  < -1 => some kind of unexpected problem
02506 */
02507 
02508 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
02509 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
02510   const char *subject, int length, int start_offset, int options, int *offsets,
02511   int offsetcount, int *workspace, int wscount)
02512 {
02513 real_pcre *re = (real_pcre *)argument_re;
02514 dfa_match_data match_block;
02515 dfa_match_data *md = &match_block;
02516 BOOL utf8, anchored, startline, firstline;
02517 const uschar *current_subject, *end_subject, *lcc;
02518 
02519 pcre_study_data internal_study;
02520 const pcre_study_data *study = NULL;
02521 real_pcre internal_re;
02522 
02523 const uschar *req_byte_ptr;
02524 const uschar *start_bits = NULL;
02525 BOOL first_byte_caseless = FALSE;
02526 BOOL req_byte_caseless = FALSE;
02527 int first_byte = -1;
02528 int req_byte = -1;
02529 int req_byte2 = -1;
02530 int newline;
02531 
02532 /* Plausibility checks */
02533 
02534 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
02535 if (re == NULL || subject == NULL || workspace == NULL ||
02536    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
02537 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
02538 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
02539 
02540 /* We need to find the pointer to any study data before we test for byte
02541 flipping, so we scan the extra_data block first. This may set two fields in the
02542 match block, so we must initialize them beforehand. However, the other fields
02543 in the match block must not be set until after the byte flipping. */
02544 
02545 md->tables = re->tables;
02546 md->callout_data = NULL;
02547 
02548 if (extra_data != NULL)
02549   {
02550   unsigned int flags = extra_data->flags;
02551   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
02552     study = (const pcre_study_data *)extra_data->study_data;
02553   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
02554   if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
02555     return PCRE_ERROR_DFA_UMLIMIT;
02556   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
02557     md->callout_data = extra_data->callout_data;
02558   if ((flags & PCRE_EXTRA_TABLES) != 0)
02559     md->tables = extra_data->tables;
02560   }
02561 
02562 /* Check that the first field in the block is the magic number. If it is not,
02563 test for a regex that was compiled on a host of opposite endianness. If this is
02564 the case, flipped values are put in internal_re and internal_study if there was
02565 study data too. */
02566 
02567 if (re->magic_number != MAGIC_NUMBER)
02568   {
02569   re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
02570   if (re == NULL) return PCRE_ERROR_BADMAGIC;
02571   if (study != NULL) study = &internal_study;
02572   }
02573 
02574 /* Set some local values */
02575 
02576 current_subject = (const unsigned char *)subject + start_offset;
02577 end_subject = (const unsigned char *)subject + length;
02578 req_byte_ptr = current_subject - 1;
02579 
02580 #ifdef SUPPORT_UTF8
02581 utf8 = (re->options & PCRE_UTF8) != 0;
02582 #else
02583 utf8 = FALSE;
02584 #endif
02585 
02586 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
02587   (re->options & PCRE_ANCHORED) != 0;
02588 
02589 /* The remaining fixed data for passing around. */
02590 
02591 md->start_code = (const uschar *)argument_re +
02592     re->name_table_offset + re->name_count * re->name_entry_size;
02593 md->start_subject = (const unsigned char *)subject;
02594 md->end_subject = end_subject;
02595 md->moptions = options;
02596 md->poptions = re->options;
02597 
02598 /* If the BSR option is not set at match time, copy what was set
02599 at compile time. */
02600 
02601 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
02602   {
02603   if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
02604     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
02605 #ifdef BSR_ANYCRLF
02606   else md->moptions |= PCRE_BSR_ANYCRLF;
02607 #endif
02608   }
02609 
02610 /* Handle different types of newline. The three bits give eight cases. If
02611 nothing is set at run time, whatever was used at compile time applies. */
02612 
02613 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
02614          PCRE_NEWLINE_BITS)
02615   {
02616   case 0: newline = NEWLINE; break;   /* Compile-time default */
02617   case PCRE_NEWLINE_CR: newline = '\r'; break;
02618   case PCRE_NEWLINE_LF: newline = '\n'; break;
02619   case PCRE_NEWLINE_CR+
02620        PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
02621   case PCRE_NEWLINE_ANY: newline = -1; break;
02622   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
02623   default: return PCRE_ERROR_BADNEWLINE;
02624   }
02625 
02626 if (newline == -2)
02627   {
02628   md->nltype = NLTYPE_ANYCRLF;
02629   }
02630 else if (newline < 0)
02631   {
02632   md->nltype = NLTYPE_ANY;
02633   }
02634 else
02635   {
02636   md->nltype = NLTYPE_FIXED;
02637   if (newline > 255)
02638     {
02639     md->nllen = 2;
02640     md->nl[0] = (newline >> 8) & 255;
02641     md->nl[1] = newline & 255;
02642     }
02643   else
02644     {
02645     md->nllen = 1;
02646     md->nl[0] = newline;
02647     }
02648   }
02649 
02650 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
02651 back the character offset. */
02652 
02653 #ifdef SUPPORT_UTF8
02654 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
02655   {
02656   if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
02657     return PCRE_ERROR_BADUTF8;
02658   if (start_offset > 0 && start_offset < length)
02659     {
02660     int tb = ((uschar *)subject)[start_offset];
02661     if (tb > 127)
02662       {
02663       tb &= 0xc0;
02664       if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
02665       }
02666     }
02667   }
02668 #endif
02669 
02670 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
02671 is a feature that makes it possible to save compiled regex and re-use them
02672 in other programs later. */
02673 
02674 if (md->tables == NULL) md->tables = _pcre_default_tables;
02675 
02676 /* The lower casing table and the "must be at the start of a line" flag are
02677 used in a loop when finding where to start. */
02678 
02679 lcc = md->tables + lcc_offset;
02680 startline = (re->flags & PCRE_STARTLINE) != 0;
02681 firstline = (re->options & PCRE_FIRSTLINE) != 0;
02682 
02683 /* Set up the first character to match, if available. The first_byte value is
02684 never set for an anchored regular expression, but the anchoring may be forced
02685 at run time, so we have to test for anchoring. The first char may be unset for
02686 an unanchored pattern, of course. If there's no first char and the pattern was
02687 studied, there may be a bitmap of possible first characters. */
02688 
02689 if (!anchored)
02690   {
02691   if ((re->flags & PCRE_FIRSTSET) != 0)
02692     {
02693     first_byte = re->first_byte & 255;
02694     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
02695       first_byte = lcc[first_byte];
02696     }
02697   else
02698     {
02699     if (startline && study != NULL &&
02700          (study->options & PCRE_STUDY_MAPPED) != 0)
02701       start_bits = study->start_bits;
02702     }
02703   }
02704 
02705 /* For anchored or unanchored matches, there may be a "last known required
02706 character" set. */
02707 
02708 if ((re->flags & PCRE_REQCHSET) != 0)
02709   {
02710   req_byte = re->req_byte & 255;
02711   req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
02712   req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */
02713   }
02714 
02715 /* Call the main matching function, looping for a non-anchored regex after a
02716 failed match. Unless restarting, optimize by moving to the first match
02717 character if possible, when not anchored. Then unless wanting a partial match,
02718 check for a required later character. */
02719 
02720 for (;;)
02721   {
02722   int rc;
02723 
02724   if ((options & PCRE_DFA_RESTART) == 0)
02725     {
02726     const uschar *save_end_subject = end_subject;
02727 
02728     /* Advance to a unique first char if possible. If firstline is TRUE, the
02729     start of the match is constrained to the first line of a multiline string.
02730     Implement this by temporarily adjusting end_subject so that we stop
02731     scanning at a newline. If the match fails at the newline, later code breaks
02732     this loop. */
02733 
02734     if (firstline)
02735       {
02736       USPTR t = current_subject;
02737 #ifdef SUPPORT_UTF8
02738       if (utf8)
02739         {
02740         while (t < md->end_subject && !IS_NEWLINE(t))
02741           {
02742           t++;
02743           while (t < end_subject && (*t & 0xc0) == 0x80) t++;
02744           }
02745         }
02746       else
02747 #endif
02748       while (t < md->end_subject && !IS_NEWLINE(t)) t++;
02749       end_subject = t;
02750       }
02751 
02752     if (first_byte >= 0)
02753       {
02754       if (first_byte_caseless)
02755         while (current_subject < end_subject &&
02756                lcc[*current_subject] != first_byte)
02757           current_subject++;
02758       else
02759         while (current_subject < end_subject && *current_subject != first_byte)
02760           current_subject++;
02761       }
02762 
02763     /* Or to just after a linebreak for a multiline match if possible */
02764 
02765     else if (startline)
02766       {
02767       if (current_subject > md->start_subject + start_offset)
02768         {
02769 #ifdef SUPPORT_UTF8
02770         if (utf8)
02771           {
02772           while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
02773             {
02774             current_subject++;
02775             while(current_subject < end_subject &&
02776                   (*current_subject & 0xc0) == 0x80)
02777               current_subject++;
02778             }
02779           }
02780         else
02781 #endif
02782         while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
02783           current_subject++;
02784 
02785         /* If we have just passed a CR and the newline option is ANY or
02786         ANYCRLF, and we are now at a LF, advance the match position by one more
02787         character. */
02788 
02789         if (current_subject[-1] == '\r' &&
02790              (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
02791              current_subject < end_subject &&
02792              *current_subject == '\n')
02793           current_subject++;
02794         }
02795       }
02796 
02797     /* Or to a non-unique first char after study */
02798 
02799     else if (start_bits != NULL)
02800       {
02801       while (current_subject < end_subject)
02802         {
02803         register unsigned int c = *current_subject;
02804         if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
02805           else break;
02806         }
02807       }
02808 
02809     /* Restore fudged end_subject */
02810 
02811     end_subject = save_end_subject;
02812     }
02813 
02814   /* If req_byte is set, we know that that character must appear in the subject
02815   for the match to succeed. If the first character is set, req_byte must be
02816   later in the subject; otherwise the test starts at the match point. This
02817   optimization can save a huge amount of work in patterns with nested unlimited
02818   repeats that aren't going to match. Writing separate code for cased/caseless
02819   versions makes it go faster, as does using an autoincrement and backing off
02820   on a match.
02821 
02822   HOWEVER: when the subject string is very, very long, searching to its end can
02823   take a long time, and give bad performance on quite ordinary patterns. This
02824   showed up when somebody was matching /^C/ on a 32-megabyte string... so we
02825   don't do this when the string is sufficiently long.
02826 
02827   ALSO: this processing is disabled when partial matching is requested.
02828   */
02829 
02830   if (req_byte >= 0 &&
02831       end_subject - current_subject < REQ_BYTE_MAX &&
02832       (options & PCRE_PARTIAL) == 0)
02833     {
02834     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
02835 
02836     /* We don't need to repeat the search if we haven't yet reached the
02837     place we found it at last time. */
02838 
02839     if (p > req_byte_ptr)
02840       {
02841       if (req_byte_caseless)
02842         {
02843         while (p < end_subject)
02844           {
02845           register int pp = *p++;
02846           if (pp == req_byte || pp == req_byte2) { p--; break; }
02847           }
02848         }
02849       else
02850         {
02851         while (p < end_subject)
02852           {
02853           if (*p++ == req_byte) { p--; break; }
02854           }
02855         }
02856 
02857       /* If we can't find the required character, break the matching loop,
02858       which will cause a return or PCRE_ERROR_NOMATCH. */
02859 
02860       if (p >= end_subject) break;
02861 
02862       /* If we have found the required character, save the point where we
02863       found it, so that we don't search again next time round the loop if
02864       the start hasn't passed this character yet. */
02865 
02866       req_byte_ptr = p;
02867       }
02868     }
02869 
02870   /* OK, now we can do the business */
02871 
02872   rc = internal_dfa_exec(
02873     md,                                /* fixed match data */
02874     md->start_code,                    /* this subexpression's code */
02875     current_subject,                   /* where we currently are */
02876     start_offset,                      /* start offset in subject */
02877     offsets,                           /* offset vector */
02878     offsetcount,                       /* size of same */
02879     workspace,                         /* workspace vector */
02880     wscount,                           /* size of same */
02881     re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
02882     0,                                 /* function recurse level */
02883     0);                                /* regex recurse level */
02884 
02885   /* Anything other than "no match" means we are done, always; otherwise, carry
02886   on only if not anchored. */
02887 
02888   if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
02889 
02890   /* Advance to the next subject character unless we are at the end of a line
02891   and firstline is set. */
02892 
02893   if (firstline && IS_NEWLINE(current_subject)) break;
02894   current_subject++;
02895   if (utf8)
02896     {
02897     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
02898       current_subject++;
02899     }
02900   if (current_subject > end_subject) break;
02901 
02902   /* If we have just passed a CR and we are now at a LF, and the pattern does
02903   not contain any explicit matches for \r or \n, and the newline option is CRLF
02904   or ANY or ANYCRLF, advance the match position by one more character. */
02905 
02906   if (current_subject[-1] == '\r' &&
02907       current_subject < end_subject &&
02908       *current_subject == '\n' &&
02909       (re->flags & PCRE_HASCRORLF) == 0 &&
02910         (md->nltype == NLTYPE_ANY ||
02911          md->nltype == NLTYPE_ANYCRLF ||
02912          md->nllen == 2))
02913     current_subject++;
02914 
02915   }   /* "Bumpalong" loop */
02916 
02917 return PCRE_ERROR_NOMATCH;
02918 }
02919 
02920 /* End of pcre_dfa_exec.c */

Generated on Tue Jul 5 14:11:57 2011 for ROOT_528-00b_version by  doxygen 1.5.1