pcre_compile.c

Go to the documentation of this file.
00001 /*************************************************
00002 *      Perl-Compatible Regular Expressions       *
00003 *************************************************/
00004 
00005 /* PCRE is a library of functions to support regular expressions whose syntax
00006 and semantics are as close as possible to those of the Perl 5 language.
00007 
00008                        Written by Philip Hazel
00009            Copyright (c) 1997-2008 University of Cambridge
00010 
00011 -----------------------------------------------------------------------------
00012 Redistribution and use in source and binary forms, with or without
00013 modification, are permitted provided that the following conditions are met:
00014 
00015     * Redistributions of source code must retain the above copyright notice,
00016       this list of conditions and the following disclaimer.
00017 
00018     * Redistributions in binary form must reproduce the above copyright
00019       notice, this list of conditions and the following disclaimer in the
00020       documentation and/or other materials provided with the distribution.
00021 
00022     * Neither the name of the University of Cambridge nor the names of its
00023       contributors may be used to endorse or promote products derived from
00024       this software without specific prior written permission.
00025 
00026 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00027 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00028 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00029 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
00030 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00031 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00032 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00033 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00034 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00035 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00036 POSSIBILITY OF SUCH DAMAGE.
00037 -----------------------------------------------------------------------------
00038 */
00039 
00040 
00041 /* This module contains the external function pcre_compile(), along with
00042 supporting internal functions that are not used by other modules. */
00043 
00044 
00045 #ifdef HAVE_CONFIG_H
00046 #include "config.h"
00047 #endif
00048 
00049 #define NLBLOCK cd             /* Block containing newline information */
00050 #define PSSTART start_pattern  /* Field containing processed string start */
00051 #define PSEND   end_pattern    /* Field containing processed string end */
00052 
00053 #include "pcre_internal.h"
00054 
00055 
00056 /* When DEBUG is defined, we need the pcre_printint() function, which is also
00057 used by pcretest. DEBUG is not defined when building a production library. */
00058 
00059 #ifdef DEBUG
00060 #include "pcre_printint.src"
00061 #endif
00062 
00063 
/* Macro for setting individual bits in class bitmaps: it sets bit number b in
the byte-array bitmap a. Both arguments and the whole expansion are
parenthesized so that expression arguments, for example SETBIT(map, c + 1),
select the intended byte and bit. Note that b is evaluated twice, so it must
not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
00067 
00068 /* Maximum length value to check against when making sure that the integer that
00069 holds the compiled pattern length does not overflow. We make it a bit less than
00070 INT_MAX to allow for adding in group terminating bytes, so that we don't have
00071 to check them every time. */
00072 
00073 #define OFLOW_MAX (INT_MAX - 20)
00074 
00075 
00076 /*************************************************
00077 *      Code parameters and static tables         *
00078 *************************************************/
00079 
00080 /* This value specifies the size of stack workspace that is used during the
00081 first pre-compile phase that determines how much memory is required. The regex
00082 is partly compiled into this space, but the compiled parts are discarded as
00083 soon as they can be, so that hopefully there will never be an overrun. The code
00084 does, however, check for an overrun. The largest amount I've seen used is 218,
00085 so this number is very generous.
00086 
00087 The same workspace is used during the second, actual compile phase for
00088 remembering forward references to groups so that they can be filled in at the
00089 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
00090 is 4 there is plenty of room. */
00091 
00092 #define COMPILE_WORK_SIZE (4096)
00093 
00094 
00095 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
00096 are simple data values; negative values are for special things like \d and so
00097 on. Zero means further processing is needed (for things like \x), or the escape
00098 is invalid. */
00099 
00100 #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
00101 static const short int escapes[] = {
00102      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
00103      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
00104    '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
00105 -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
00106 -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
00107 -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
00108    '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
00109 -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
00110 -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
00111      0,      0, -ESC_z                                            /* x - z */
00112 };
00113 
00114 #else           /* This is the "abnormal" table for EBCDIC systems */
00115 static const short int escapes[] = {
00116 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
00117 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
00118 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
00119 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
00120 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
00121 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
00122 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
00123 /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
00124 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
00125 /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
00126 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
00127 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
00128 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
00129 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
00130 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
00131 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
00132 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
00133 /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
00134 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
00135 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
00136 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
00137 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
00138 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
00139 };
00140 #endif
00141 
00142 
00143 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
00144 searched linearly. Put all the names into a single string, in order to reduce
00145 the number of relocations when a shared library is dynamically linked. */
00146 
00147 typedef struct verbitem {
00148   int   len;
00149   int   op;
00150 } verbitem;
00151 
00152 static const char verbnames[] =
00153   "ACCEPT\0"
00154   "COMMIT\0"
00155   "F\0"
00156   "FAIL\0"
00157   "PRUNE\0"
00158   "SKIP\0"
00159   "THEN";
00160 
00161 static const verbitem verbs[] = {
00162   { 6, OP_ACCEPT },
00163   { 6, OP_COMMIT },
00164   { 1, OP_FAIL },
00165   { 4, OP_FAIL },
00166   { 5, OP_PRUNE },
00167   { 4, OP_SKIP  },
00168   { 4, OP_THEN  }
00169 };
00170 
00171 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
00172 
00173 
00174 /* Tables of names of POSIX character classes and their lengths. The names are
00175 now all in a single string, to reduce the number of relocations when a shared
00176 library is dynamically loaded. The list of lengths is terminated by a zero
00177 length entry. The first three must be alpha, lower, upper, as this is assumed
00178 for handling case independence. */
00179 
00180 static const char posix_names[] =
00181   "alpha\0"  "lower\0"  "upper\0"  "alnum\0"  "ascii\0"  "blank\0"
00182   "cntrl\0"  "digit\0"  "graph\0"  "print\0"  "punct\0"  "space\0"
00183   "word\0"   "xdigit";
00184 
00185 static const uschar posix_name_lengths[] = {
00186   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
00187 
00188 /* Table of class bit maps for each POSIX class. Each class is formed from a
00189 base map, with an optional addition or removal of another map. Then, for some
00190 classes, there is some additional tweaking: for [:blank:] the vertical space
00191 characters are removed, and for [:alpha:] and [:alnum:] the underscore
00192 character is removed. The triples in the table consist of the base map offset,
00193 second map offset or -1 if no second map, and a non-negative value for map
00194 addition or a negative value for map subtraction (if there are two maps). The
00195 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
00196 remove vertical space characters, 2 => remove underscore. */
00197 
00198 static const int posix_class_maps[] = {
00199   cbit_word,  cbit_digit, -2,             /* alpha */
00200   cbit_lower, -1,          0,             /* lower */
00201   cbit_upper, -1,          0,             /* upper */
00202   cbit_word,  -1,          2,             /* alnum - word without underscore */
00203   cbit_print, cbit_cntrl,  0,             /* ascii */
00204   cbit_space, -1,          1,             /* blank - a GNU extension */
00205   cbit_cntrl, -1,          0,             /* cntrl */
00206   cbit_digit, -1,          0,             /* digit */
00207   cbit_graph, -1,          0,             /* graph */
00208   cbit_print, -1,          0,             /* print */
00209   cbit_punct, -1,          0,             /* punct */
00210   cbit_space, -1,          0,             /* space */
00211   cbit_word,  -1,          0,             /* word - a Perl extension */
00212   cbit_xdigit,-1,          0              /* xdigit */
00213 };
00214 
00215 
00216 #define STRING(a)  # a
00217 #define XSTRING(s) STRING(s)
00218 
00219 /* The texts of compile-time error messages. These are "char *" because they
00220 are passed to the outside world. Do not ever re-use any error number, because
00221 they are documented. Always add a new error instead. Messages marked DEAD below
00222 are no longer used. This used to be a table of strings, but in order to reduce
00223 the number of relocations needed when a shared library is loaded dynamically,
00224 it is now one long string. We cannot use a table of offsets, because the
00225 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
00226 simply count through to the one we want - this isn't a performance issue
00227 because these strings are used only when there is a compilation error. */
00228 
00229 static const char error_texts[] =
00230   "no error\0"
00231   "\\ at end of pattern\0"
00232   "\\c at end of pattern\0"
00233   "unrecognized character follows \\\0"
00234   "numbers out of order in {} quantifier\0"
00235   /* 5 */
00236   "number too big in {} quantifier\0"
00237   "missing terminating ] for character class\0"
00238   "invalid escape sequence in character class\0"
00239   "range out of order in character class\0"
00240   "nothing to repeat\0"
00241   /* 10 */
00242   "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
00243   "internal error: unexpected repeat\0"
00244   "unrecognized character after (? or (?-\0"
00245   "POSIX named classes are supported only within a class\0"
00246   "missing )\0"
00247   /* 15 */
00248   "reference to non-existent subpattern\0"
00249   "erroffset passed as NULL\0"
00250   "unknown option bit(s) set\0"
00251   "missing ) after comment\0"
00252   "parentheses nested too deeply\0"  /** DEAD **/
00253   /* 20 */
00254   "regular expression is too large\0"
00255   "failed to get memory\0"
00256   "unmatched parentheses\0"
00257   "internal error: code overflow\0"
00258   "unrecognized character after (?<\0"
00259   /* 25 */
00260   "lookbehind assertion is not fixed length\0"
00261   "malformed number or name after (?(\0"
00262   "conditional group contains more than two branches\0"
00263   "assertion expected after (?(\0"
00264   "(?R or (?[+-]digits must be followed by )\0"
00265   /* 30 */
00266   "unknown POSIX class name\0"
00267   "POSIX collating elements are not supported\0"
00268   "this version of PCRE is not compiled with PCRE_UTF8 support\0"
00269   "spare error\0"  /** DEAD **/
00270   "character value in \\x{...} sequence is too large\0"
00271   /* 35 */
00272   "invalid condition (?(0)\0"
00273   "\\C not allowed in lookbehind assertion\0"
00274   "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
00275   "number after (?C is > 255\0"
00276   "closing ) for (?C expected\0"
00277   /* 40 */
00278   "recursive call could loop indefinitely\0"
00279   "unrecognized character after (?P\0"
00280   "syntax error in subpattern name (missing terminator)\0"
00281   "two named subpatterns have the same name\0"
00282   "invalid UTF-8 string\0"
00283   /* 45 */
00284   "support for \\P, \\p, and \\X has not been compiled\0"
00285   "malformed \\P or \\p sequence\0"
00286   "unknown property name after \\P or \\p\0"
00287   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
00288   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
00289   /* 50 */
00290   "repeated subpattern is too long\0"    /** DEAD **/
00291   "octal value is greater than \\377 (not in UTF-8 mode)\0"
00292   "internal error: overran compiling workspace\0"
00293   "internal error: previously-checked referenced subpattern not found\0"
00294   "DEFINE group contains more than one branch\0"
00295   /* 55 */
00296   "repeating a DEFINE group is not allowed\0"
00297   "inconsistent NEWLINE options\0"
00298   "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
00299   "a numbered reference must not be zero\0"
00300   "(*VERB) with an argument is not supported\0"
00301   /* 60 */
00302   "(*VERB) not recognized\0"
00303   "number is too big\0"
00304   "subpattern name expected\0"
00305   "digit expected after (?+\0"
00306   "] is an invalid data character in JavaScript compatibility mode";
00307 
00308 
00309 /* Table to identify digits and hex digits. This is used when compiling
00310 patterns. Note that the tables in chartables are dependent on the locale, and
00311 may mark arbitrary characters as digits - but the PCRE compiling code expects
00312 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
00313 a private table here. It costs 256 bytes, but it is a lot faster than doing
00314 character value tests (at least in some simple cases I timed), and in some
00315 applications one wants PCRE to compile efficiently as well as match
00316 efficiently.
00317 
00318 For convenience, we use the same bit definitions as in chartables:
00319 
00320   0x04   decimal digit
00321   0x08   hexadecimal digit
00322 
00323 Then we can use ctype_digit and ctype_xdigit in the code. */
00324 
00325 #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
00326 static const unsigned char digitab[] =
00327   {
00328   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
00329   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
00330   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
00331   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
00332   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
00333   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
00334   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
00335   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
00336   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
00337   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
00338   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
00339   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
00340   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
00341   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
00342   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
00343   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
00344   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
00345   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
00346   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
00347   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
00348   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
00349   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
00350   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
00351   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
00352   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
00353   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
00354   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
00355   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
00356   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
00357   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
00358   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
00359   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
00360 
00361 #else           /* This is the "abnormal" case, for EBCDIC systems */
00362 static const unsigned char digitab[] =
00363   {
00364   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
00365   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
00366   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
00367   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
00368   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
00369   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
00370   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
00371   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
00372   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
00373   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
00374   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
00375   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
00376   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
00377   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
00378   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
00379   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
00380   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
00381   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
00382   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
00383   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
00384   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
00385   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
00386   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
00387   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
00388   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
00389   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
00390   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
00391   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
00392   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
00393   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
00394   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
00395   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
00396 
00397 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
00398   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
00399   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
00400   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
00401   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
00402   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
00403   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
00404   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
00405   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
00406   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
00407   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
00408   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
00409   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
00410   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
00411   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
00412   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
00413   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
00414   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
00415   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
00416   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
00417   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
00418   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
00419   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
00420   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
00421   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
00422   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
00423   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
00424   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
00425   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
00426   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
00427   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
00428   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
00429   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
00430 #endif
00431 
00432 
00433 /* Definition to allow mutual recursion */
00434 
00435 static BOOL
00436   compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
00437     int *, int *, branch_chain *, compile_data *, int *);
00438 
00439 
00440 
00441 /*************************************************
00442 *            Find an error text                  *
00443 *************************************************/
00444 
00445 /* The error texts are now all in one long string, to save on relocations. As
00446 some of the text is of unknown length, we can't use a table of offsets.
00447 Instead, just count through the strings. This is not a performance issue
00448 because it happens only when there has been a compilation error.
00449 
00450 Argument:   the error number
00451 Returns:    pointer to the error string
00452 */
00453 
00454 static const char *
00455 find_error_text(int n)
00456 {
00457 const char *s = error_texts;
00458 for (; n > 0; n--) while (*s++ != 0) {};
00459 return s;
00460 }
00461 
00462 
00463 /*************************************************
00464 *            Handle escapes                      *
00465 *************************************************/
00466 
00467 /* This function is called when a \ has been encountered. It either returns a
00468 positive value for a simple escape such as \n, or a negative value which
00469 encodes one of the more complicated things such as \d. A backreference to group
00470 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
00471 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
00472 ptr is pointing at the \. On exit, it is on the final character of the escape
00473 sequence.
00474 
00475 Arguments:
00476   ptrptr         points to the pattern position pointer
00477   errorcodeptr   points to the errorcode variable
00478   bracount       number of previous extracting brackets
00479   options        the options bits
00480   isclass        TRUE if inside a character class
00481 
00482 Returns:         zero or positive => a data character
00483                  negative => a special escape sequence
00484                  on error, errorcodeptr is set
00485 */
00486 
00487 static int
00488 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
00489   int options, BOOL isclass)
00490 {
00491 BOOL utf8 = (options & PCRE_UTF8) != 0;
00492 const uschar *ptr = *ptrptr + 1;
00493 int c, i;
00494 
00495 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
00496 ptr--;                            /* Set pointer back to the last byte */
00497 
00498 /* If backslash is at the end of the pattern, it's an error. */
00499 
00500 if (c == 0) *errorcodeptr = ERR1;
00501 
00502 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
00503 in a table. A non-zero result is something that can be returned immediately.
00504 Otherwise further processing may be required. */
00505 
00506 #ifndef EBCDIC  /* ASCII coding */
00507 else if (c < '0' || c > 'z') {}                           /* Not alphanumeric */
00508 else if ((i = escapes[c - '0']) != 0) c = i;
00509 
00510 #else           /* EBCDIC coding */
00511 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
00512 else if ((i = escapes[c - 0x48]) != 0)  c = i;
00513 #endif
00514 
00515 /* Escapes that need further processing, or are illegal. */
00516 
00517 else
00518   {
00519   const uschar *oldptr;
00520   BOOL braced, negated;
00521 
00522   switch (c)
00523     {
00524     /* A number of Perl escapes are not handled by PCRE. We give an explicit
00525     error. */
00526 
00527     case 'l':
00528     case 'L':
00529     case 'N':
00530     case 'u':
00531     case 'U':
00532     *errorcodeptr = ERR37;
00533     break;
00534 
00535     /* \g must be followed by one of a number of specific things:
00536 
00537     (1) A number, either plain or braced. If positive, it is an absolute
00538     backreference. If negative, it is a relative backreference. This is a Perl
00539     5.10 feature.
00540 
00541     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
00542     is part of Perl's movement towards a unified syntax for back references. As
00543     this is synonymous with \k{name}, we fudge it up by pretending it really
00544     was \k.
00545 
00546     (3) For Oniguruma compatibility we also support \g followed by a name or a
00547     number either in angle brackets or in single quotes. However, these are
00548     (possibly recursive) subroutine calls, _not_ backreferences. Just return
00549     the -ESC_g code (cf \k). */
00550 
00551     case 'g':
00552     if (ptr[1] == '<' || ptr[1] == '\'')
00553       {
00554       c = -ESC_g;
00555       break;
00556       }
00557 
00558     /* Handle the Perl-compatible cases */
00559 
00560     if (ptr[1] == '{')
00561       {
00562       const uschar *p;
00563       for (p = ptr+2; *p != 0 && *p != '}'; p++)
00564         if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
00565       if (*p != 0 && *p != '}')
00566         {
00567         c = -ESC_k;
00568         break;
00569         }
00570       braced = TRUE;
00571       ptr++;
00572       }
00573     else braced = FALSE;
00574 
00575     if (ptr[1] == '-')
00576       {
00577       negated = TRUE;
00578       ptr++;
00579       }
00580     else negated = FALSE;
00581 
00582     c = 0;
00583     while ((digitab[ptr[1]] & ctype_digit) != 0)
00584       c = c * 10 + *(++ptr) - '0';
00585 
00586     if (c < 0)   /* Integer overflow */
00587       {
00588       *errorcodeptr = ERR61;
00589       break;
00590       }
00591 
00592     if (braced && *(++ptr) != '}')
00593       {
00594       *errorcodeptr = ERR57;
00595       break;
00596       }
00597 
00598     if (c == 0)
00599       {
00600       *errorcodeptr = ERR58;
00601       break;
00602       }
00603 
00604     if (negated)
00605       {
00606       if (c > bracount)
00607         {
00608         *errorcodeptr = ERR15;
00609         break;
00610         }
00611       c = bracount - (c - 1);
00612       }
00613 
00614     c = -(ESC_REF + c);
00615     break;
00616 
00617     /* The handling of escape sequences consisting of a string of digits
00618     starting with one that is not zero is not straightforward. By experiment,
00619     the way Perl works seems to be as follows:
00620 
00621     Outside a character class, the digits are read as a decimal number. If the
00622     number is less than 10, or if there are that many previous extracting
00623     left brackets, then it is a back reference. Otherwise, up to three octal
00624     digits are read to form an escaped byte. Thus \123 is likely to be octal
00625     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
00626     value is greater than 377, the least significant 8 bits are taken. Inside a
00627     character class, \ followed by a digit is always an octal number. */
00628 
00629     case '1': case '2': case '3': case '4': case '5':
00630     case '6': case '7': case '8': case '9':
00631 
00632     if (!isclass)
00633       {
00634       oldptr = ptr;
00635       c -= '0';
00636       while ((digitab[ptr[1]] & ctype_digit) != 0)
00637         c = c * 10 + *(++ptr) - '0';
00638       if (c < 0)    /* Integer overflow */
00639         {
00640         *errorcodeptr = ERR61;
00641         break;
00642         }
00643       if (c < 10 || c <= bracount)
00644         {
00645         c = -(ESC_REF + c);
00646         break;
00647         }
00648       ptr = oldptr;      /* Put the pointer back and fall through */
00649       }
00650 
00651     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
00652     generates a binary zero byte and treats the digit as a following literal.
00653     Thus we have to pull back the pointer by one. */
00654 
00655     if ((c = *ptr) >= '8')
00656       {
00657       ptr--;
00658       c = 0;
00659       break;
00660       }
00661 
00662     /* \0 always starts an octal number, but we may drop through to here with a
00663     larger first octal digit. The original code used just to take the least
00664     significant 8 bits of octal numbers (I think this is what early Perls used
00665     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
00666     than 3 octal digits. */
00667 
00668     case '0':
00669     c -= '0';
00670     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
00671         c = c * 8 + *(++ptr) - '0';
00672     if (!utf8 && c > 255) *errorcodeptr = ERR51;
00673     break;
00674 
00675     /* \x is complicated. \x{ddd} is a character number which can be greater
00676     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
00677     treated as a data character. */
00678 
00679     case 'x':
00680     if (ptr[1] == '{')
00681       {
00682       const uschar *pt = ptr + 2;
00683       int count = 0;
00684 
00685       c = 0;
00686       while ((digitab[*pt] & ctype_xdigit) != 0)
00687         {
00688         register int cc = *pt++;
00689         if (c == 0 && cc == '0') continue;     /* Leading zeroes */
00690         count++;
00691 
00692 #ifndef EBCDIC  /* ASCII coding */
00693         if (cc >= 'a') cc -= 32;               /* Convert to upper case */
00694         c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
00695 #else           /* EBCDIC coding */
00696         if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
00697         c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
00698 #endif
00699         }
00700 
00701       if (*pt == '}')
00702         {
00703         if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
00704         ptr = pt;
00705         break;
00706         }
00707 
00708       /* If the sequence of hex digits does not end with '}', then we don't
00709       recognize this construct; fall through to the normal \x handling. */
00710       }
00711 
00712     /* Read just a single-byte hex-defined char */
00713 
00714     c = 0;
00715     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
00716       {
00717       int cc;                               /* Some compilers don't like ++ */
00718       cc = *(++ptr);                        /* in initializers */
00719 #ifndef EBCDIC  /* ASCII coding */
00720       if (cc >= 'a') cc -= 32;              /* Convert to upper case */
00721       c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
00722 #else           /* EBCDIC coding */
00723       if (cc <= 'z') cc += 64;              /* Convert to upper case */
00724       c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
00725 #endif
00726       }
00727     break;
00728 
00729     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
00730     This coding is ASCII-specific, but then the whole concept of \cx is
00731     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
00732 
00733     case 'c':
00734     c = *(++ptr);
00735     if (c == 0)
00736       {
00737       *errorcodeptr = ERR2;
00738       break;
00739       }
00740 
00741 #ifndef EBCDIC  /* ASCII coding */
00742     if (c >= 'a' && c <= 'z') c -= 32;
00743     c ^= 0x40;
00744 #else           /* EBCDIC coding */
00745     if (c >= 'a' && c <= 'z') c += 64;
00746     c ^= 0xC0;
00747 #endif
00748     break;
00749 
00750     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
00751     other alphanumeric following \ is an error if PCRE_EXTRA was set;
00752     otherwise, for Perl compatibility, it is a literal. This code looks a bit
00753     odd, but there used to be some cases other than the default, and there may
00754     be again in future, so I haven't "optimized" it. */
00755 
00756     default:
00757     if ((options & PCRE_EXTRA) != 0) switch(c)
00758       {
00759       default:
00760       *errorcodeptr = ERR3;
00761       break;
00762       }
00763     break;
00764     }
00765   }
00766 
00767 *ptrptr = ptr;
00768 return c;
00769 }
00770 
00771 
00772 
00773 #ifdef SUPPORT_UCP
00774 /*************************************************
00775 *               Handle \P and \p                 *
00776 *************************************************/
00777 
00778 /* This function is called after \P or \p has been encountered, provided that
00779 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
00780 pointing at the P or p. On exit, it is pointing at the final character of the
00781 escape sequence.
00782 
00783 Argument:
00784   ptrptr         points to the pattern position pointer
00785   negptr         points to a boolean that is set TRUE for negation else FALSE
00786   dptr           points to an int that is set to the detailed property value
00787   errorcodeptr   points to the error code variable
00788 
00789 Returns:         type value from ucp_type_table, or -1 for an invalid type
00790 */
00791 
00792 static int
00793 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
00794 {
00795 int c, i, bot, top;
00796 const uschar *ptr = *ptrptr;
00797 char name[32];
00798 
00799 c = *(++ptr);
00800 if (c == 0) goto ERROR_RETURN;
00801 
00802 *negptr = FALSE;
00803 
00804 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
00805 negation. */
00806 
00807 if (c == '{')
00808   {
00809   if (ptr[1] == '^')
00810     {
00811     *negptr = TRUE;
00812     ptr++;
00813     }
00814   for (i = 0; i < (int)sizeof(name) - 1; i++)
00815     {
00816     c = *(++ptr);
00817     if (c == 0) goto ERROR_RETURN;
00818     if (c == '}') break;
00819     name[i] = c;
00820     }
00821   if (c !='}') goto ERROR_RETURN;
00822   name[i] = 0;
00823   }
00824 
00825 /* Otherwise there is just one following character */
00826 
00827 else
00828   {
00829   name[0] = c;
00830   name[1] = 0;
00831   }
00832 
00833 *ptrptr = ptr;
00834 
00835 /* Search for a recognized property name using binary chop */
00836 
00837 bot = 0;
00838 top = _pcre_utt_size;
00839 
00840 while (bot < top)
00841   {
00842   i = (bot + top) >> 1;
00843   c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
00844   if (c == 0)
00845     {
00846     *dptr = _pcre_utt[i].value;
00847     return _pcre_utt[i].type;
00848     }
00849   if (c > 0) bot = i + 1; else top = i;
00850   }
00851 
00852 *errorcodeptr = ERR47;
00853 *ptrptr = ptr;
00854 return -1;
00855 
00856 ERROR_RETURN:
00857 *errorcodeptr = ERR46;
00858 *ptrptr = ptr;
00859 return -1;
00860 }
00861 #endif
00862 
00863 
00864 
00865 
00866 /*************************************************
00867 *            Check for counted repeat            *
00868 *************************************************/
00869 
00870 /* This function is called when a '{' is encountered in a place where it might
00871 start a quantifier. It looks ahead to see if it really is a quantifier or not.
00872 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
00873 where the ddds are digits.
00874 
00875 Arguments:
00876   p         pointer to the first char after '{'
00877 
00878 Returns:    TRUE or FALSE
00879 */
00880 
00881 static BOOL
00882 is_counted_repeat(const uschar *p)
00883 {
00884 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
00885 while ((digitab[*p] & ctype_digit) != 0) p++;
00886 if (*p == '}') return TRUE;
00887 
00888 if (*p++ != ',') return FALSE;
00889 if (*p == '}') return TRUE;
00890 
00891 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
00892 while ((digitab[*p] & ctype_digit) != 0) p++;
00893 
00894 return (*p == '}');
00895 }
00896 
00897 
00898 
00899 /*************************************************
00900 *         Read repeat counts                     *
00901 *************************************************/
00902 
00903 /* Read an item of the form {n,m} and return the values. This is called only
00904 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
00905 so the syntax is guaranteed to be correct, but we need to check the values.
00906 
00907 Arguments:
00908   p              pointer to first char after '{'
00909   minp           pointer to int for min
00910   maxp           pointer to int for max
00911                  returned as -1 if no max
00912   errorcodeptr   points to error code variable
00913 
00914 Returns:         pointer to '}' on success;
00915                  current ptr on error, with errorcodeptr set non-zero
00916 */
00917 
00918 static const uschar *
00919 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
00920 {
00921 int min = 0;
00922 int max = -1;
00923 
00924 /* Read the minimum value and do a paranoid check: a negative value indicates
00925 an integer overflow. */
00926 
00927 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
00928 if (min < 0 || min > 65535)
00929   {
00930   *errorcodeptr = ERR5;
00931   return p;
00932   }
00933 
00934 /* Read the maximum value if there is one, and again do a paranoid on its size.
00935 Also, max must not be less than min. */
00936 
00937 if (*p == '}') max = min; else
00938   {
00939   if (*(++p) != '}')
00940     {
00941     max = 0;
00942     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
00943     if (max < 0 || max > 65535)
00944       {
00945       *errorcodeptr = ERR5;
00946       return p;
00947       }
00948     if (max < min)
00949       {
00950       *errorcodeptr = ERR4;
00951       return p;
00952       }
00953     }
00954   }
00955 
00956 /* Fill in the required variables, and pass back the pointer to the terminating
00957 '}'. */
00958 
00959 *minp = min;
00960 *maxp = max;
00961 return p;
00962 }
00963 
00964 
00965 
00966 /*************************************************
00967 *       Find forward referenced subpattern       *
00968 *************************************************/
00969 
00970 /* This function scans along a pattern's text looking for capturing
00971 subpatterns, and counting them. If it finds a named pattern that matches the
00972 name it is given, it returns its number. Alternatively, if the name is NULL, it
00973 returns when it reaches a given numbered subpattern. This is used for forward
00974 references to subpatterns. We know that if (?P< is encountered, the name will
00975 be terminated by '>' because that is checked in the first pass.
00976 
00977 Arguments:
00978   ptr          current position in the pattern
00979   cd           compile background data
00980   name         name to seek, or NULL if seeking a numbered subpattern
00981   lorn         name length, or subpattern number if name is NULL
00982   xmode        TRUE if we are in /x mode
00983 
00984 Returns:       the number of the named subpattern, or -1 if not found
00985 */
00986 
00987 static int
00988 find_parens(const uschar *ptr, compile_data *cd, const uschar *name, int lorn,
00989   BOOL xmode)
00990 {
00991 const uschar *thisname;
00992 int count = cd->bracount;
00993 
00994 for (; *ptr != 0; ptr++)
00995   {
00996   int term;
00997 
00998   /* Skip over backslashed characters and also entire \Q...\E */
00999 
01000   if (*ptr == '\\')
01001     {
01002     if (*(++ptr) == 0) return -1;
01003     if (*ptr == 'Q') for (;;)
01004       {
01005       while (*(++ptr) != 0 && *ptr != '\\') {};
01006       if (*ptr == 0) return -1;
01007       if (*(++ptr) == 'E') break;
01008       }
01009     continue;
01010     }
01011 
01012   /* Skip over character classes; this logic must be similar to the way they
01013   are handled for real. If the first character is '^', skip it. Also, if the
01014   first few characters (either before or after ^) are \Q\E or \E we skip them
01015   too. This makes for compatibility with Perl. */
01016 
01017   if (*ptr == '[')
01018     {
01019     BOOL negate_class = FALSE;
01020     for (;;)
01021       {
01022       int c = *(++ptr);
01023       if (c == '\\')
01024         {
01025         if (ptr[1] == 'E') ptr++;
01026           else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
01027             else break;
01028         }
01029       else if (!negate_class && c == '^')
01030         negate_class = TRUE;
01031       else break;
01032       }
01033 
01034     /* If the next character is ']', it is a data character that must be
01035     skipped, except in JavaScript compatibility mode. */
01036 
01037     if (ptr[1] == ']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
01038       ptr++;
01039 
01040     while (*(++ptr) != ']')
01041       {
01042       if (*ptr == 0) return -1;
01043       if (*ptr == '\\')
01044         {
01045         if (*(++ptr) == 0) return -1;
01046         if (*ptr == 'Q') for (;;)
01047           {
01048           while (*(++ptr) != 0 && *ptr != '\\') {};
01049           if (*ptr == 0) return -1;
01050           if (*(++ptr) == 'E') break;
01051           }
01052         continue;
01053         }
01054       }
01055     continue;
01056     }
01057 
01058   /* Skip comments in /x mode */
01059 
01060   if (xmode && *ptr == '#')
01061     {
01062     while (*(++ptr) != 0 && *ptr != '\n') {};
01063     if (*ptr == 0) return -1;
01064     continue;
01065     }
01066 
01067   /* An opening parens must now be a real metacharacter */
01068 
01069   if (*ptr != '(') continue;
01070   if (ptr[1] != '?' && ptr[1] != '*')
01071     {
01072     count++;
01073     if (name == NULL && count == lorn) return count;
01074     continue;
01075     }
01076 
01077   ptr += 2;
01078   if (*ptr == 'P') ptr++;                      /* Allow optional P */
01079 
01080   /* We have to disambiguate (?<! and (?<= from (?<name> */
01081 
01082   if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
01083        *ptr != '\'')
01084     continue;
01085 
01086   count++;
01087 
01088   if (name == NULL && count == lorn) return count;
01089   term = *ptr++;
01090   if (term == '<') term = '>';
01091   thisname = ptr;
01092   while (*ptr != term) ptr++;
01093   if (name != NULL && lorn == ptr - thisname &&
01094       strncmp((const char *)name, (const char *)thisname, lorn) == 0)
01095     return count;
01096   }
01097 
01098 return -1;
01099 }
01100 
01101 
01102 
01103 /*************************************************
01104 *      Find first significant op code            *
01105 *************************************************/
01106 
01107 /* This is called by several functions that scan a compiled expression looking
01108 for a fixed first character, or an anchoring op code etc. It skips over things
01109 that do not influence this. For some calls, a change of option is important.
01110 For some calls, it makes sense to skip negative forward and all backward
01111 assertions, and also the \b assertion; for others it does not.
01112 
01113 Arguments:
01114   code         pointer to the start of the group
01115   options      pointer to external options
01116   optbit       the option bit whose changing is significant, or
01117                  zero if none are
01118   skipassert   TRUE if certain assertions are to be skipped
01119 
01120 Returns:       pointer to the first significant opcode
01121 */
01122 
01123 static const uschar*
01124 first_significant_code(const uschar *code, int *options, int optbit,
01125   BOOL skipassert)
01126 {
01127 for (;;)
01128   {
01129   switch ((int)*code)
01130     {
01131     case OP_OPT:
01132     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
01133       *options = (int)code[1];
01134     code += 2;
01135     break;
01136 
01137     case OP_ASSERT_NOT:
01138     case OP_ASSERTBACK:
01139     case OP_ASSERTBACK_NOT:
01140     if (!skipassert) return code;
01141     do code += GET(code, 1); while (*code == OP_ALT);
01142     code += _pcre_OP_lengths[*code];
01143     break;
01144 
01145     case OP_WORD_BOUNDARY:
01146     case OP_NOT_WORD_BOUNDARY:
01147     if (!skipassert) return code;
01148     /* Fall through */
01149 
01150     case OP_CALLOUT:
01151     case OP_CREF:
01152     case OP_RREF:
01153     case OP_DEF:
01154     code += _pcre_OP_lengths[*code];
01155     break;
01156 
01157     default:
01158     return code;
01159     }
01160   }
01161 /* Control never reaches here */
01162 }
01163 
01164 
01165 
01166 
01167 /*************************************************
01168 *        Find the fixed length of a pattern      *
01169 *************************************************/
01170 
01171 /* Scan a pattern and compute the fixed length of subject that will match it,
01172 if the length is fixed. This is needed for dealing with backward assertions.
01173 In UTF8 mode, the result is in characters rather than bytes.
01174 
01175 Arguments:
01176   code     points to the start of the pattern (the bracket)
01177   options  the compiling options
01178 
01179 Returns:   the fixed length, or -1 if there is no fixed length,
01180              or -2 if \C was encountered
01181 */
01182 
01183 static int
01184 find_fixedlength(uschar *code, int options)
01185 {
01186 int length = -1;
01187 
01188 register int branchlength = 0;
01189 register uschar *cc = code + 1 + LINK_SIZE;
01190 
01191 /* Scan along the opcodes for this branch. If we get to the end of the
01192 branch, check the length against that of the other branches. */
01193 
01194 for (;;)
01195   {
01196   int d;
01197   register int op = *cc;
01198   switch (op)
01199     {
01200     case OP_CBRA:
01201     case OP_BRA:
01202     case OP_ONCE:
01203     case OP_COND:
01204     d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
01205     if (d < 0) return d;
01206     branchlength += d;
01207     do cc += GET(cc, 1); while (*cc == OP_ALT);
01208     cc += 1 + LINK_SIZE;
01209     break;
01210 
01211     /* Reached end of a branch; if it's a ket it is the end of a nested
01212     call. If it's ALT it is an alternation in a nested call. If it is
01213     END it's the end of the outer call. All can be handled by the same code. */
01214 
01215     case OP_ALT:
01216     case OP_KET:
01217     case OP_KETRMAX:
01218     case OP_KETRMIN:
01219     case OP_END:
01220     if (length < 0) length = branchlength;
01221       else if (length != branchlength) return -1;
01222     if (*cc != OP_ALT) return length;
01223     cc += 1 + LINK_SIZE;
01224     branchlength = 0;
01225     break;
01226 
01227     /* Skip over assertive subpatterns */
01228 
01229     case OP_ASSERT:
01230     case OP_ASSERT_NOT:
01231     case OP_ASSERTBACK:
01232     case OP_ASSERTBACK_NOT:
01233     do cc += GET(cc, 1); while (*cc == OP_ALT);
01234     /* Fall through */
01235 
01236     /* Skip over things that don't match chars */
01237 
01238     case OP_REVERSE:
01239     case OP_CREF:
01240     case OP_RREF:
01241     case OP_DEF:
01242     case OP_OPT:
01243     case OP_CALLOUT:
01244     case OP_SOD:
01245     case OP_SOM:
01246     case OP_EOD:
01247     case OP_EODN:
01248     case OP_CIRC:
01249     case OP_DOLL:
01250     case OP_NOT_WORD_BOUNDARY:
01251     case OP_WORD_BOUNDARY:
01252     cc += _pcre_OP_lengths[*cc];
01253     break;
01254 
01255     /* Handle literal characters */
01256 
01257     case OP_CHAR:
01258     case OP_CHARNC:
01259     case OP_NOT:
01260     branchlength++;
01261     cc += 2;
01262 #ifdef SUPPORT_UTF8
01263     if ((options & PCRE_UTF8) != 0)
01264       {
01265       while ((*cc & 0xc0) == 0x80) cc++;
01266       }
01267 #endif
01268     break;
01269 
01270     /* Handle exact repetitions. The count is already in characters, but we
01271     need to skip over a multibyte character in UTF8 mode.  */
01272 
01273     case OP_EXACT:
01274     branchlength += GET2(cc,1);
01275     cc += 4;
01276 #ifdef SUPPORT_UTF8
01277     if ((options & PCRE_UTF8) != 0)
01278       {
01279       while((*cc & 0x80) == 0x80) cc++;
01280       }
01281 #endif
01282     break;
01283 
01284     case OP_TYPEEXACT:
01285     branchlength += GET2(cc,1);
01286     if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
01287     cc += 4;
01288     break;
01289 
01290     /* Handle single-char matchers */
01291 
01292     case OP_PROP:
01293     case OP_NOTPROP:
01294     cc += 2;
01295     /* Fall through */
01296 
01297     case OP_NOT_DIGIT:
01298     case OP_DIGIT:
01299     case OP_NOT_WHITESPACE:
01300     case OP_WHITESPACE:
01301     case OP_NOT_WORDCHAR:
01302     case OP_WORDCHAR:
01303     case OP_ANY:
01304     case OP_ALLANY:
01305     branchlength++;
01306     cc++;
01307     break;
01308 
01309     /* The single-byte matcher isn't allowed */
01310 
01311     case OP_ANYBYTE:
01312     return -2;
01313 
01314     /* Check a class for variable quantification */
01315 
01316 #ifdef SUPPORT_UTF8
01317     case OP_XCLASS:
01318     cc += GET(cc, 1) - 33;
01319     /* Fall through */
01320 #endif
01321 
01322     case OP_CLASS:
01323     case OP_NCLASS:
01324     cc += 33;
01325 
01326     switch (*cc)
01327       {
01328       case OP_CRSTAR:
01329       case OP_CRMINSTAR:
01330       case OP_CRQUERY:
01331       case OP_CRMINQUERY:
01332       return -1;
01333 
01334       case OP_CRRANGE:
01335       case OP_CRMINRANGE:
01336       if (GET2(cc,1) != GET2(cc,3)) return -1;
01337       branchlength += GET2(cc,1);
01338       cc += 5;
01339       break;
01340 
01341       default:
01342       branchlength++;
01343       }
01344     break;
01345 
01346     /* Anything else is variable length */
01347 
01348     default:
01349     return -1;
01350     }
01351   }
01352 /* Control never gets here */
01353 }
01354 
01355 
01356 
01357 
01358 /*************************************************
01359 *    Scan compiled regex for numbered bracket    *
01360 *************************************************/
01361 
01362 /* This little function scans through a compiled pattern until it finds a
01363 capturing bracket with the given number.
01364 
01365 Arguments:
01366   code        points to start of expression
01367   utf8        TRUE in UTF-8 mode
01368   number      the required bracket number
01369 
01370 Returns:      pointer to the opcode for the bracket, or NULL if not found
01371 */
01372 
01373 static const uschar *
01374 find_bracket(const uschar *code, BOOL utf8, int number)
01375 {
01376 for (;;)
01377   {
01378   register int c = *code;
01379   if (c == OP_END) return NULL;
01380 
01381   /* XCLASS is used for classes that cannot be represented just by a bit
01382   map. This includes negated single high-valued characters. The length in
01383   the table is zero; the actual length is stored in the compiled code. */
01384 
01385   if (c == OP_XCLASS) code += GET(code, 1);
01386 
01387   /* Handle capturing bracket */
01388 
01389   else if (c == OP_CBRA)
01390     {
01391     int n = GET2(code, 1+LINK_SIZE);
01392     if (n == number) return (uschar *)code;
01393     code += _pcre_OP_lengths[c];
01394     }
01395 
01396   /* Otherwise, we can get the item's length from the table, except that for
01397   repeated character types, we have to test for \p and \P, which have an extra
01398   two bytes of parameters. */
01399 
01400   else
01401     {
01402     switch(c)
01403       {
01404       case OP_TYPESTAR:
01405       case OP_TYPEMINSTAR:
01406       case OP_TYPEPLUS:
01407       case OP_TYPEMINPLUS:
01408       case OP_TYPEQUERY:
01409       case OP_TYPEMINQUERY:
01410       case OP_TYPEPOSSTAR:
01411       case OP_TYPEPOSPLUS:
01412       case OP_TYPEPOSQUERY:
01413       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
01414       break;
01415 
01416       case OP_TYPEUPTO:
01417       case OP_TYPEMINUPTO:
01418       case OP_TYPEEXACT:
01419       case OP_TYPEPOSUPTO:
01420       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
01421       break;
01422       }
01423 
01424     /* Add in the fixed length from the table */
01425 
01426     code += _pcre_OP_lengths[c];
01427 
01428   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
01429   a multi-byte character. The length in the table is a minimum, so we have to
01430   arrange to skip the extra bytes. */
01431 
01432 #ifdef SUPPORT_UTF8
01433     if (utf8) switch(c)
01434       {
01435       case OP_CHAR:
01436       case OP_CHARNC:
01437       case OP_EXACT:
01438       case OP_UPTO:
01439       case OP_MINUPTO:
01440       case OP_POSUPTO:
01441       case OP_STAR:
01442       case OP_MINSTAR:
01443       case OP_POSSTAR:
01444       case OP_PLUS:
01445       case OP_MINPLUS:
01446       case OP_POSPLUS:
01447       case OP_QUERY:
01448       case OP_MINQUERY:
01449       case OP_POSQUERY:
01450       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
01451       break;
01452       }
01453 #else
01454     (void)(utf8);  /* Keep compiler happy by referencing function argument */
01455 #endif
01456     }
01457   }
01458 }
01459 
01460 
01461 
01462 /*************************************************
01463 *   Scan compiled regex for recursion reference  *
01464 *************************************************/
01465 
01466 /* This little function scans through a compiled pattern until it finds an
01467 instance of OP_RECURSE.
01468 
01469 Arguments:
01470   code        points to start of expression
01471   utf8        TRUE in UTF-8 mode
01472 
01473 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
01474 */
01475 
01476 static const uschar *
01477 find_recurse(const uschar *code, BOOL utf8)
01478 {
01479 for (;;)
01480   {
01481   register int c = *code;
01482   if (c == OP_END) return NULL;
01483   if (c == OP_RECURSE) return code;
01484 
01485   /* XCLASS is used for classes that cannot be represented just by a bit
01486   map. This includes negated single high-valued characters. The length in
01487   the table is zero; the actual length is stored in the compiled code. */
01488 
01489   if (c == OP_XCLASS) code += GET(code, 1);
01490 
01491   /* Otherwise, we can get the item's length from the table, except that for
01492   repeated character types, we have to test for \p and \P, which have an extra
01493   two bytes of parameters. */
01494 
01495   else
01496     {
01497     switch(c)
01498       {
01499       case OP_TYPESTAR:
01500       case OP_TYPEMINSTAR:
01501       case OP_TYPEPLUS:
01502       case OP_TYPEMINPLUS:
01503       case OP_TYPEQUERY:
01504       case OP_TYPEMINQUERY:
01505       case OP_TYPEPOSSTAR:
01506       case OP_TYPEPOSPLUS:
01507       case OP_TYPEPOSQUERY:
01508       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
01509       break;
01510 
01511       case OP_TYPEPOSUPTO:
01512       case OP_TYPEUPTO:
01513       case OP_TYPEMINUPTO:
01514       case OP_TYPEEXACT:
01515       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
01516       break;
01517       }
01518 
01519     /* Add in the fixed length from the table */
01520 
01521     code += _pcre_OP_lengths[c];
01522 
01523     /* In UTF-8 mode, opcodes that are followed by a character may be followed
01524     by a multi-byte character. The length in the table is a minimum, so we have
01525     to arrange to skip the extra bytes. */
01526 
01527 #ifdef SUPPORT_UTF8
01528     if (utf8) switch(c)
01529       {
01530       case OP_CHAR:
01531       case OP_CHARNC:
01532       case OP_EXACT:
01533       case OP_UPTO:
01534       case OP_MINUPTO:
01535       case OP_POSUPTO:
01536       case OP_STAR:
01537       case OP_MINSTAR:
01538       case OP_POSSTAR:
01539       case OP_PLUS:
01540       case OP_MINPLUS:
01541       case OP_POSPLUS:
01542       case OP_QUERY:
01543       case OP_MINQUERY:
01544       case OP_POSQUERY:
01545       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
01546       break;
01547       }
01548 #else
01549     (void)(utf8);  /* Keep compiler happy by referencing function argument */
01550 #endif
01551     }
01552   }
01553 }
01554 
01555 
01556 
01557 /*************************************************
01558 *    Scan compiled branch for non-emptiness      *
01559 *************************************************/
01560 
01561 /* This function scans through a branch of a compiled pattern to see whether it
01562 can match the empty string or not. It is called from could_be_empty()
01563 below and from compile_branch() when checking for an unlimited repeat of a
01564 group that can match nothing. Note that first_significant_code() skips over
01565 backward and negative forward assertions when its final argument is TRUE. If we
01566 hit an unclosed bracket, we return "empty" - this means we've struck an inner
01567 bracket whose current branch will already have been scanned.
01568 
01569 Arguments:
01570   code        points to start of search
01571   endcode     points to where to stop
01572   utf8        TRUE if in UTF8 mode
01573 
01574 Returns:      TRUE if what is matched could be empty
01575 */
01576 
01577 static BOOL
01578 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
01579 {
01580 register int c;
01581 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
01582      code < endcode;
01583      code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
01584   {
01585   const uschar *ccode;
01586 
01587   c = *code;
01588 
01589   /* Skip over forward assertions; the other assertions are skipped by
01590   first_significant_code() with a TRUE final argument. */
01591 
01592   if (c == OP_ASSERT)
01593     {
01594     do code += GET(code, 1); while (*code == OP_ALT);
01595     c = *code;
01596     continue;
01597     }
01598 
01599   /* Groups with zero repeats can of course be empty; skip them. */
01600 
01601   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
01602     {
01603     code += _pcre_OP_lengths[c];
01604     do code += GET(code, 1); while (*code == OP_ALT);
01605     c = *code;
01606     continue;
01607     }
01608 
01609   /* For other groups, scan the branches. */
01610 
01611   if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
01612     {
01613     BOOL empty_branch;
01614     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
01615 
01616     /* Scan a closed bracket */
01617 
01618     empty_branch = FALSE;
01619     do
01620       {
01621       if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
01622         empty_branch = TRUE;
01623       code += GET(code, 1);
01624       }
01625     while (*code == OP_ALT);
01626     if (!empty_branch) return FALSE;   /* All branches are non-empty */
01627     c = *code;
01628     continue;
01629     }
01630 
01631   /* Handle the other opcodes */
01632 
01633   switch (c)
01634     {
01635     /* Check for quantifiers after a class. XCLASS is used for classes that
01636     cannot be represented just by a bit map. This includes negated single
01637     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
01638     actual length is stored in the compiled code, so we must update "code"
01639     here. */
01640 
01641 #ifdef SUPPORT_UTF8
01642     case OP_XCLASS:
01643     ccode = code += GET(code, 1);
01644     goto CHECK_CLASS_REPEAT;
01645 #endif
01646 
01647     case OP_CLASS:
01648     case OP_NCLASS:
01649     ccode = code + 33;
01650 
01651 #ifdef SUPPORT_UTF8
01652     CHECK_CLASS_REPEAT:
01653 #endif
01654 
01655     switch (*ccode)
01656       {
01657       case OP_CRSTAR:            /* These could be empty; continue */
01658       case OP_CRMINSTAR:
01659       case OP_CRQUERY:
01660       case OP_CRMINQUERY:
01661       break;
01662 
01663       default:                   /* Non-repeat => class must match */
01664       case OP_CRPLUS:            /* These repeats aren't empty */
01665       case OP_CRMINPLUS:
01666       return FALSE;
01667 
01668       case OP_CRRANGE:
01669       case OP_CRMINRANGE:
01670       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
01671       break;
01672       }
01673     break;
01674 
01675     /* Opcodes that must match a character */
01676 
01677     case OP_PROP:
01678     case OP_NOTPROP:
01679     case OP_EXTUNI:
01680     case OP_NOT_DIGIT:
01681     case OP_DIGIT:
01682     case OP_NOT_WHITESPACE:
01683     case OP_WHITESPACE:
01684     case OP_NOT_WORDCHAR:
01685     case OP_WORDCHAR:
01686     case OP_ANY:
01687     case OP_ALLANY:
01688     case OP_ANYBYTE:
01689     case OP_CHAR:
01690     case OP_CHARNC:
01691     case OP_NOT:
01692     case OP_PLUS:
01693     case OP_MINPLUS:
01694     case OP_POSPLUS:
01695     case OP_EXACT:
01696     case OP_NOTPLUS:
01697     case OP_NOTMINPLUS:
01698     case OP_NOTPOSPLUS:
01699     case OP_NOTEXACT:
01700     case OP_TYPEPLUS:
01701     case OP_TYPEMINPLUS:
01702     case OP_TYPEPOSPLUS:
01703     case OP_TYPEEXACT:
01704     return FALSE;
01705 
01706     /* These are going to continue, as they may be empty, but we have to
01707     fudge the length for the \p and \P cases. */
01708 
01709     case OP_TYPESTAR:
01710     case OP_TYPEMINSTAR:
01711     case OP_TYPEPOSSTAR:
01712     case OP_TYPEQUERY:
01713     case OP_TYPEMINQUERY:
01714     case OP_TYPEPOSQUERY:
01715     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
01716     break;
01717 
01718     /* Same for these */
01719 
01720     case OP_TYPEUPTO:
01721     case OP_TYPEMINUPTO:
01722     case OP_TYPEPOSUPTO:
01723     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
01724     break;
01725 
01726     /* End of branch */
01727 
01728     case OP_KET:
01729     case OP_KETRMAX:
01730     case OP_KETRMIN:
01731     case OP_ALT:
01732     return TRUE;
01733 
01734     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
01735     MINUPTO, and POSUPTO may be followed by a multibyte character */
01736 
01737 #ifdef SUPPORT_UTF8
01738     case OP_STAR:
01739     case OP_MINSTAR:
01740     case OP_POSSTAR:
01741     case OP_QUERY:
01742     case OP_MINQUERY:
01743     case OP_POSQUERY:
01744     case OP_UPTO:
01745     case OP_MINUPTO:
01746     case OP_POSUPTO:
01747     if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
01748     break;
01749 #endif
01750     }
01751   }
01752 
01753 return TRUE;
01754 }
01755 
01756 
01757 
01758 /*************************************************
01759 *    Scan compiled regex for non-emptiness       *
01760 *************************************************/
01761 
01762 /* This function is called to check for left recursive calls. We want to check
01763 the current branch of the current pattern to see if it could match the empty
01764 string. If it could, we must look outwards for branches at other levels,
01765 stopping when we pass beyond the bracket which is the subject of the recursion.
01766 
01767 Arguments:
01768   code        points to start of the recursion
01769   endcode     points to where to stop (current RECURSE item)
01770   bcptr       points to the chain of current (unclosed) branch starts (may be NULL)
01771   utf8        TRUE if in UTF-8 mode
01772 
01773 Returns:      TRUE if every branch examined could be empty (or none was examined);
01774               FALSE as soon as one examined branch cannot match the empty string */
01775 
01776 static BOOL
01777 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
01778   BOOL utf8)
01779 {
01780 while (bcptr != NULL && bcptr->current >= code)   /* Only branches starting at or after code */
01781   {
01782   if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
01783   bcptr = bcptr->outer;                            /* Move out one level in the chain */
01784   }
01785 return TRUE;
01786 }
01787 
01788 
01789 
01790 /*************************************************
01791 *           Check for POSIX class syntax         *
01792 *************************************************/
01793 
01794 /* This function is called when the sequence "[:" or "[." or "[=" is
01795 encountered in a character class. It checks whether this is followed by a
01796 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
01797 reach an unescaped ']' without the special preceding character, return FALSE.
01798 
01799 Originally, this function only recognized a sequence of letters between the
01800 terminators, but it seems that Perl recognizes any sequence of characters,
01801 though of course unknown POSIX names are subsequently rejected. Perl gives an
01802 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
01803 didn't consider this to be a POSIX class. Likewise for [:1234:].
01804 
01805 The problem in trying to be exactly like Perl is in the handling of escapes. We
01806 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
01807 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
01808 below handles the special case of \], but does not try to do any other escape
01809 processing. This makes it different from Perl for cases such as [:l\ower:]
01810 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
01811 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
01812 I think.
01813 
01814 Arguments:
01815   ptr      pointer to the initial [
01816   endptr   where to return the end pointer (set only when TRUE is returned; it
01817            then points at the terminator character just before the closing ])
01818 Returns:   TRUE if the terminator followed by ] is found; FALSE otherwise
01819 */
01820 
01821 static BOOL
01822 check_posix_syntax(const uschar *ptr, const uschar **endptr)
01823 {
01824 int terminator;          /* Don't combine these lines; the Solaris cc */
01825 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
01826 for (++ptr; *ptr != 0; ptr++)   /* Scan from just after the terminator character */
01827   {
01828   if (*ptr == '\\' && ptr[1] == ']') ptr++; else   /* Step over an escaped ] */
01829     {
01830     if (*ptr == ']') return FALSE;   /* Unescaped ] reached before terminator+] */
01831     if (*ptr == terminator && ptr[1] == ']')
01832       {
01833       *endptr = ptr;
01834       return TRUE;
01835       }
01836     }
01837   }
01838 return FALSE;   /* Reached the zero byte that ends the pattern */
01839 }
01840 
01841 
01842 
01843 
01844 /*************************************************
01845 *          Check POSIX class name                *
01846 *************************************************/
01847 
01848 /* This function is called to check the name given in a POSIX-style class entry
01849 such as [:alnum:]. The candidate is compared with successive names held in
01850 posix_names, using posix_name_lengths (ended by a zero entry) in parallel.
01851 Arguments:
01852   ptr        points to the first letter
01853   len        the length of the name
01854 
01855 Returns:     a value representing the name (its index in posix_name_lengths), or -1 if unknown
01856 */
01857 
01858 static int
01859 check_posix_name(const uschar *ptr, int len)
01860 {
01861 const char *pn = posix_names;
01862 register int yield = 0;
01863 while (posix_name_lengths[yield] != 0)   /* A zero length marks the end of the table */
01864   {
01865   if (len == posix_name_lengths[yield] &&
01866     strncmp((const char *)ptr, pn, len) == 0) return yield;
01867   pn += posix_name_lengths[yield] + 1;   /* Skip this name plus the one byte that follows it */
01868   yield++;
01869   }
01870 return -1;
01871 }
01872 
01873 
01874 /*************************************************
01875 *    Adjust OP_RECURSE items in repeated group   *
01876 *************************************************/
01877 
01878 /* OP_RECURSE items contain an offset from the start of the regex to the group
01879 that is referenced. This means that groups can be replicated for fixed
01880 repetition simply by copying (because the recursion is allowed to refer to
01881 earlier groups that are outside the current group). However, when a group is
01882 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
01883 inserted before it, after it has been compiled. This means that any OP_RECURSE
01884 items within it that refer to the group itself or any contained groups have to
01885 have their offsets adjusted. That is one of the jobs of this function. Before it
01886 is called, the partially compiled regex must be temporarily terminated with
01887 OP_END.
01888 
01889 This function has been extended with the possibility of forward references for
01890 recursions and subroutine calls. It must also check the list of such references
01891 for the group we are dealing with. If it finds that one of the recursions in
01892 the current group is on this list, it adjusts the offset in the list, not the
01893 value in the reference (which is a group number).
01894 
01895 Arguments:
01896   group      points to the start of the group
01897   adjust     the amount by which the group is to be moved
01898   utf8       TRUE in UTF-8 mode
01899   cd         contains pointers to tables etc. (start_code and hwm are used here)
01900   save_hwm   the hwm forward reference pointer at the start of the group
01901 
01902 Returns:     nothing
01903 */
01904 
01905 static void
01906 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
01907   uschar *save_hwm)
01908 {
01909 uschar *ptr = group;
01910 
01911 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
01912   {
01913   int offset;
01914   uschar *hc;
01915 
01916   /* See if this recursion is on the forward reference list. If so, adjust the
01917   reference. The list entries scanned are those from save_hwm up to cd->hwm. */
01918 
01919   for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
01920     {
01921     offset = GET(hc, 0);
01922     if (cd->start_code + offset == ptr + 1)   /* Entry addresses this item's offset field */
01923       {
01924       PUT(hc, 0, offset + adjust);
01925       break;
01926       }
01927     }
01928 
01929   /* Otherwise, adjust the recursion offset if it's after the start of this
01930   group. */
01931 
01932   if (hc >= cd->hwm)   /* The scan above finished without a match */
01933     {
01934     offset = GET(ptr, 1);
01935     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
01936     }
01937 
01938   ptr += 1 + LINK_SIZE;   /* Step past this item: one opcode byte plus the offset */
01939   }
01940 }
01941 
01942 
01943 
01944 /*************************************************
01945 *        Insert an automatic callout point       *
01946 *************************************************/
01947 
01948 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
01949 callout points before each pattern item. The item written is: OP_CALLOUT, the
01950 byte 255, the pattern offset, and a length that is zero until completed later.
01951 Arguments:
01952   code           current code pointer
01953   ptr            current pattern pointer
01954   cd             pointers to tables etc (start_pattern is used here)
01955 
01956 Returns:         new code pointer (just past the item that was written)
01957 */
01958 
01959 static uschar *
01960 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
01961 {
01962 *code++ = OP_CALLOUT;
01963 *code++ = 255;                          /* Byte written for every automatic callout */
01964 PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */
01965 PUT(code, LINK_SIZE, 0);                /* Default length */
01966 return code + 2*LINK_SIZE;
01967 }
01968 
01969 
01970 
01971 /*************************************************
01972 *         Complete a callout item                *
01973 *************************************************/
01974 
01975 /* A callout item contains the length of the next item in the pattern, which
01976 we can't fill in till after we have reached the relevant point. This is used
01977 for both automatic and manual callouts. The length is the current pattern
01978 offset minus the offset already stored at byte 2 of the callout item.
01979 Arguments:
01980   previous_callout   points to previous callout item
01981   ptr                current pattern pointer
01982   cd                 pointers to tables etc (start_pattern is used here)
01983 
01984 Returns:             nothing
01985 */
01986 
01987 static void
01988 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
01989 {
01990 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
01991 PUT(previous_callout, 2 + LINK_SIZE, length);   /* Length field follows the offset field */
01992 }
01993 
01994 
01995 
01996 #ifdef SUPPORT_UCP
01997 /*************************************************
01998 *           Get othercase range                  *
01999 *************************************************/
02000 
02001 /* This function is passed the start and end of a class range, in UTF-8 mode
02002 with UCP support. It searches up the characters, looking for internal ranges of
02003 characters in the "other" case. Each call returns the next one, updating the
02004 start address. Nothing is written through the pointers when FALSE is returned.
02005 
02006 Arguments:
02007   cptr        points to starting character value; updated to the next unscanned value
02008   d           end value
02009   ocptr       where to put start of othercase range
02010   odptr       where to put end of othercase range
02011 
02012 Yield:        TRUE when range returned; FALSE when no more
02013 */
02014 
02015 static BOOL
02016 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
02017   unsigned int *odptr)
02018 {
02019 unsigned int c, othercase, next;
02020 
02021 for (c = *cptr; c <= d; c++)   /* Find the first character that has a different other case */
02022   { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
02023 
02024 if (c > d) return FALSE;
02025 
02026 *ocptr = othercase;
02027 next = othercase + 1;
02028 
02029 for (++c; c <= d; c++)         /* Extend while the other-case values stay consecutive */
02030   {
02031   if (UCD_OTHERCASE(c) != next) break;
02032   next++;
02033   }
02034 
02035 *odptr = next - 1;
02036 *cptr = c;
02037 
02038 return TRUE;
02039 }
02040 #endif  /* SUPPORT_UCP */
02041 
02042 
02043 
02044 /*************************************************
02045 *     Check if auto-possessifying is possible    *
02046 *************************************************/
02047 
02048 /* This function is called for unlimited repeats of certain items, to see
02049 whether the next thing could possibly match the repeated item. If not, it makes
02050 sense to automatically possessify the repeated item.
02051 
02052 Arguments:
02053   op_code       the repeated op code
02054   item          data for this item, depends on the opcode
02055   utf8          TRUE in UTF-8 mode
02056   utf8_char     used for utf8 character bytes, NULL if not relevant
02057   ptr           next character in pattern
02058   options       options bits
02059   cd            contains pointers to tables etc.
02060 
02061 Returns:        TRUE if possessifying is wanted
02062 */
02063 
02064 static BOOL
02065 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
02066   const uschar *ptr, int options, compile_data *cd)
02067 {
02068 int next;
02069 
02070 /* Skip whitespace and comments in extended mode */
02071 
02072 if ((options & PCRE_EXTENDED) != 0)
02073   {
02074   for (;;)
02075     {
02076     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
02077     if (*ptr == '#')
02078       {
02079       while (*(++ptr) != 0)
02080         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
02081       }
02082     else break;
02083     }
02084   }
02085 
02086 /* If the next item is one that we can handle, get its value. A non-negative
02087 value is a character, a negative value is an escape value. */
02088 
02089 if (*ptr == '\\')
02090   {
02091   int temperrorcode = 0;
02092   next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
02093   if (temperrorcode != 0) return FALSE;   /* Escape was rejected: do not possessify */
02094   ptr++;    /* Point after the escape sequence */
02095   }
02096 
02097 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
02098   {
02099 #ifdef SUPPORT_UTF8
02100   if (utf8) { GETCHARINC(next, ptr); } else
02101 #endif
02102   next = *ptr++;
02103   }
02104 
02105 else return FALSE;   /* Next is a metacharacter other than backslash */
02106 
02107 /* Skip whitespace and comments in extended mode */
02108 
02109 if ((options & PCRE_EXTENDED) != 0)
02110   {
02111   for (;;)
02112     {
02113     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
02114     if (*ptr == '#')
02115       {
02116       while (*(++ptr) != 0)
02117         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
02118       }
02119     else break;
02120     }
02121   }
02122 
02123 /* If the next thing is itself optional, we have to give up. */
02124 
02125 if (*ptr == '*' || *ptr == '?' || strncmp((char *)ptr, "{0,", 3) == 0)
02126   return FALSE;
02127 
02128 /* Now compare the next item with the previous opcode. If the previous is a
02129 positive single character match, "item" either contains the character or, if
02130 "item" is greater than 127 in utf8 mode, the character's bytes are in
02131 utf8_char. */
02132 
02133 
02134 /* Handle cases when the next item is a character. */
02135 
02136 if (next >= 0) switch(op_code)
02137   {
02138   case OP_CHAR:
02139 #ifdef SUPPORT_UTF8
02140   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
02141 #else
02142   (void)(utf8_char);  /* Keep compiler happy by referencing function argument */
02143 #endif
02144   return item != next;
02145 
02146   /* For CHARNC (caseless character) we must check the other case. If we have
02147   Unicode property support, we can use it to test the other case of
02148   high-valued characters. */
02149 
02150   case OP_CHARNC:
02151 #ifdef SUPPORT_UTF8
02152   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
02153 #endif
02154   if (item == next) return FALSE;
02155 #ifdef SUPPORT_UTF8
02156   if (utf8)
02157     {
02158     unsigned int othercase;
02159     if (next < 128) othercase = cd->fcc[next]; else
02160 #ifdef SUPPORT_UCP
02161     othercase = UCD_OTHERCASE((unsigned int)next);
02162 #else
02163     othercase = NOTACHAR;
02164 #endif
02165     return (unsigned int)item != othercase;
02166     }
02167   else
02168 #endif  /* SUPPORT_UTF8 */
02169   return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
02170 
02171   /* For OP_NOT, "item" must be a single-byte character. */
02172 
02173   case OP_NOT:
02174   if (item == next) return TRUE;
02175   if ((options & PCRE_CASELESS) == 0) return FALSE;
02176 #ifdef SUPPORT_UTF8
02177   if (utf8)
02178     {
02179     unsigned int othercase;
02180     if (next < 128) othercase = cd->fcc[next]; else
02181 #ifdef SUPPORT_UCP
02182     othercase = UCD_OTHERCASE(next);
02183 #else
02184     othercase = NOTACHAR;
02185 #endif
02186     return (unsigned int)item == othercase;
02187     }
02188   else
02189 #endif  /* SUPPORT_UTF8 */
02190   return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
02191 
02192   case OP_DIGIT:
02193   return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
02194 
02195   case OP_NOT_DIGIT:
02196   return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
02197 
02198   case OP_WHITESPACE:
02199   return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
02200 
02201   case OP_NOT_WHITESPACE:
02202   return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
02203 
02204   case OP_WORDCHAR:
02205   return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
02206 
02207   case OP_NOT_WORDCHAR:
02208   return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
02209 
02210   case OP_HSPACE:
02211   case OP_NOT_HSPACE:
02212   switch(next)
02213     {
02214     case 0x09:
02215     case 0x20:
02216     case 0xa0:
02217     case 0x1680:
02218     case 0x180e:
02219     case 0x2000:
02220     case 0x2001:
02221     case 0x2002:
02222     case 0x2003:
02223     case 0x2004:
02224     case 0x2005:
02225     case 0x2006:
02226     case 0x2007:
02227     case 0x2008:
02228     case 0x2009:
02229     case 0x200A:
02230     case 0x202f:
02231     case 0x205f:
02232     case 0x3000:
02233     return op_code != OP_HSPACE;
02234     default:
02235     return op_code == OP_HSPACE;
02236     }
02237 
02238   case OP_VSPACE:
02239   case OP_NOT_VSPACE:
02240   switch(next)
02241     {
02242     case 0x0a:
02243     case 0x0b:
02244     case 0x0c:
02245     case 0x0d:
02246     case 0x85:
02247     case 0x2028:
02248     case 0x2029:
02249     return op_code != OP_VSPACE;
02250     default:
02251     return op_code == OP_VSPACE;
02252     }
02253 
02254   default:
02255   return FALSE;
02256   }
02257 
02258 
02259 /* Handle the case when the next item is \d, \s, etc. */
02260 
02261 switch(op_code)
02262   {
02263   case OP_CHAR:
02264   case OP_CHARNC:
02265 #ifdef SUPPORT_UTF8
02266   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
02267 #endif
02268   switch(-next)   /* next is negative here; negate it to get the ESC_ value */
02269     {
02270     case ESC_d:
02271     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
02272 
02273     case ESC_D:
02274     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
02275 
02276     case ESC_s:
02277     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
02278 
02279     case ESC_S:
02280     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
02281 
02282     case ESC_w:
02283     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
02284 
02285     case ESC_W:
02286     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
02287 
02288     case ESC_h:
02289     case ESC_H:
02290     switch(item)
02291       {
02292       case 0x09:
02293       case 0x20:
02294       case 0xa0:
02295       case 0x1680:
02296       case 0x180e:
02297       case 0x2000:
02298       case 0x2001:
02299       case 0x2002:
02300       case 0x2003:
02301       case 0x2004:
02302       case 0x2005:
02303       case 0x2006:
02304       case 0x2007:
02305       case 0x2008:
02306       case 0x2009:
02307       case 0x200A:
02308       case 0x202f:
02309       case 0x205f:
02310       case 0x3000:
02311       return -next != ESC_h;
02312       default:
02313       return -next == ESC_h;
02314       }
02315 
02316     case ESC_v:
02317     case ESC_V:
02318     switch(item)
02319       {
02320       case 0x0a:
02321       case 0x0b:
02322       case 0x0c:
02323       case 0x0d:
02324       case 0x85:
02325       case 0x2028:
02326       case 0x2029:
02327       return -next != ESC_v;
02328       default:
02329       return -next == ESC_v;
02330       }
02331 
02332     default:
02333     return FALSE;
02334     }
02335 
02336   case OP_DIGIT:
02337   return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
02338          next == -ESC_h || next == -ESC_v;
02339 
02340   case OP_NOT_DIGIT:
02341   return next == -ESC_d;
02342 
02343   case OP_WHITESPACE:
02344   return next == -ESC_S || next == -ESC_d || next == -ESC_w;
02345 
02346   case OP_NOT_WHITESPACE:
02347   return next == -ESC_s || next == -ESC_h || next == -ESC_v;
02348 
02349   case OP_HSPACE:
02350   return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
02351 
02352   case OP_NOT_HSPACE:
02353   return next == -ESC_h;
02354 
02355   /* Can't have \S in here because VT matches \S (Perl anomaly) */
02356   case OP_VSPACE:
02357   return next == -ESC_V || next == -ESC_d || next == -ESC_w;
02358 
02359   case OP_NOT_VSPACE:
02360   return next == -ESC_v;
02361 
02362   case OP_WORDCHAR:
02363   return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
02364 
02365   case OP_NOT_WORDCHAR:
02366   return next == -ESC_w || next == -ESC_d;
02367 
02368   default:
02369   return FALSE;
02370   }
02371 
02372 /* Control does not reach here */
02373 }
02374 
02375 
02376 
02377 /*************************************************
02378 *           Compile one branch                   *
02379 *************************************************/
02380 
02381 /* Scan the pattern, compiling it into a vector. If the options are
02382 changed during the branch, the pointer is used to change the external options
02383 bits. This function is used during the pre-compile phase when we are trying
02384 to find out the amount of memory needed, as well as during the real compile
02385 phase. The value of lengthptr distinguishes the two phases.
02386 
02387 Arguments:
02388   optionsptr     pointer to the option bits
02389   codeptr        points to the pointer to the current code point
02390   ptrptr         points to the current pattern pointer
02391   errorcodeptr   points to error code variable
02392   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
02393   reqbyteptr     set to the last literal character required, else < 0
02394   bcptr          points to current branch chain
02395   cd             contains pointers to tables etc.
02396   lengthptr      NULL during the real compile phase
02397                  points to length accumulator during pre-compile phase
02398 
02399 Returns:         TRUE on success
02400                  FALSE, with *errorcodeptr set non-zero on error
02401 */
02402 
02403 static BOOL
02404 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
02405   int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
02406   compile_data *cd, int *lengthptr)
02407 {
02408 int repeat_type, op_type;
02409 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
02410 int bravalue = 0;
02411 int greedy_default, greedy_non_default;
02412 int firstbyte, reqbyte;
02413 int zeroreqbyte, zerofirstbyte;
02414 int req_caseopt, reqvary, tempreqvary;
02415 int options = *optionsptr;
02416 int after_manual_callout = 0;
02417 int length_prevgroup = 0;
02418 register int c;
02419 register uschar *code = *codeptr;
02420 uschar *last_code = code;
02421 uschar *orig_code = code;
02422 uschar *tempcode;
02423 BOOL inescq = FALSE;
02424 BOOL groupsetfirstbyte = FALSE;
02425 const uschar *ptr = *ptrptr;
02426 const uschar *tempptr;
02427 uschar *previous = NULL;
02428 uschar *previous_callout = NULL;
02429 uschar *save_hwm = NULL;
02430 uschar classbits[32];
02431 
02432 #ifdef SUPPORT_UTF8
02433 BOOL class_utf8;
02434 BOOL utf8 = (options & PCRE_UTF8) != 0;
02435 uschar *class_utf8data;
02436 uschar *class_utf8data_base;
02437 uschar utf8_char[6];
02438 #else
02439 BOOL utf8 = FALSE;
02440 uschar *utf8_char = NULL;
02441 #endif
02442 
02443 #ifdef DEBUG
02444 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
02445 #endif
02446 
02447 /* Set up the default and non-default settings for greediness */
02448 
02449 greedy_default = ((options & PCRE_UNGREEDY) != 0);
02450 greedy_non_default = greedy_default ^ 1;
02451 
02452 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
02453 matching encountered yet". It gets changed to REQ_NONE if we hit something that
02454 matches a non-fixed first char; reqbyte just remains unset if we never
02455 find one.
02456 
02457 When we hit a repeat whose minimum is zero, we may have to adjust these values
02458 to take the zero repeat into account. This is implemented by setting them to
02459 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
02460 item types that can be repeated set these backoff variables appropriately. */
02461 
02462 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
02463 
02464 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
02465 according to the current setting of the caseless flag. REQ_CASELESS is a bit
02466 value > 255. It is added into the firstbyte or reqbyte variables to record the
02467 case status of the value. This is used only for ASCII characters. */
02468 
02469 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
02470 
02471 /* Switch on next character until the end of the branch */
02472 
02473 for (;; ptr++)
02474   {
02475   BOOL negate_class;
02476   BOOL should_flip_negation;
02477   BOOL possessive_quantifier;
02478   BOOL is_quantifier;
02479   BOOL is_recurse;
02480   BOOL reset_bracount;
02481   int class_charcount;
02482   int class_lastchar;
02483   int newoptions;
02484   int recno;
02485   int refsign;
02486   int skipbytes;
02487   int subreqbyte;
02488   int subfirstbyte;
02489   int terminator;
02490   int mclength;
02491   uschar mcbuffer[8];
02492 
02493   /* Get next byte in the pattern */
02494 
02495   c = *ptr;
02496 
02497   /* If we are in the pre-compile phase, accumulate the length used for the
02498   previous cycle of this loop. */
02499 
02500   if (lengthptr != NULL)
02501     {
02502 #ifdef DEBUG
02503     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
02504 #endif
02505     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
02506       {
02507       *errorcodeptr = ERR52;
02508       goto FAILED;
02509       }
02510 
02511     /* There is at least one situation where code goes backwards: this is the
02512     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
02513     the class is simply eliminated. However, it is created first, so we have to
02514     allow memory for it. Therefore, don't ever reduce the length at this point.
02515     */
02516 
02517     if (code < last_code) code = last_code;
02518 
02519     /* Paranoid check for integer overflow */
02520 
02521     if (OFLOW_MAX - *lengthptr < code - last_code)
02522       {
02523       *errorcodeptr = ERR20;
02524       goto FAILED;
02525       }
02526 
02527     *lengthptr += code - last_code;
02528     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
02529 
02530     /* If "previous" is set and it is not at the start of the work space, move
02531     it back to there, in order to avoid filling up the work space. Otherwise,
02532     if "previous" is NULL, reset the current code pointer to the start. */
02533 
02534     if (previous != NULL)
02535       {
02536       if (previous > orig_code)
02537         {
02538         memmove(orig_code, previous, code - previous);
02539         code -= previous - orig_code;
02540         previous = orig_code;
02541         }
02542       }
02543     else code = orig_code;
02544 
02545     /* Remember where this code item starts so we can pick up the length
02546     next time round. */
02547 
02548     last_code = code;
02549     }
02550 
02551   /* In the real compile phase, just check the workspace used by the forward
02552   reference list. */
02553 
02554   else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
02555     {
02556     *errorcodeptr = ERR52;
02557     goto FAILED;
02558     }
02559 
02560   /* If in \Q...\E, check for the end; if not, we have a literal */
02561 
02562   if (inescq && c != 0)
02563     {
02564     if (c == '\\' && ptr[1] == 'E')
02565       {
02566       inescq = FALSE;
02567       ptr++;
02568       continue;
02569       }
02570     else
02571       {
02572       if (previous_callout != NULL)
02573         {
02574         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
02575           complete_callout(previous_callout, ptr, cd);
02576         previous_callout = NULL;
02577         }
02578       if ((options & PCRE_AUTO_CALLOUT) != 0)
02579         {
02580         previous_callout = code;
02581         code = auto_callout(code, ptr, cd);
02582         }
02583       goto NORMAL_CHAR;
02584       }
02585     }
02586 
02587   /* Fill in length of a previous callout, except when the next thing is
02588   a quantifier. */
02589 
02590   is_quantifier = c == '*' || c == '+' || c == '?' ||
02591     (c == '{' && is_counted_repeat(ptr+1));
02592 
02593   if (!is_quantifier && previous_callout != NULL &&
02594        after_manual_callout-- <= 0)
02595     {
02596     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
02597       complete_callout(previous_callout, ptr, cd);
02598     previous_callout = NULL;
02599     }
02600 
02601   /* In extended mode, skip white space and comments */
02602 
02603   if ((options & PCRE_EXTENDED) != 0)
02604     {
02605     if ((cd->ctypes[c] & ctype_space) != 0) continue;
02606     if (c == '#')
02607       {
02608       while (*(++ptr) != 0)
02609         {
02610         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
02611         }
02612       if (*ptr != 0) continue;
02613 
02614       /* Else fall through to handle end of string */
02615       c = 0;
02616       }
02617     }
02618 
02619   /* No auto callout for quantifiers. */
02620 
02621   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
02622     {
02623     previous_callout = code;
02624     code = auto_callout(code, ptr, cd);
02625     }
02626 
02627   switch(c)
02628     {
02629     /* ===================================================================*/
02630     case 0:                        /* The branch terminates at string end */
02631     case '|':                      /* or | or ) */
02632     case ')':
02633     *firstbyteptr = firstbyte;
02634     *reqbyteptr = reqbyte;
02635     *codeptr = code;
02636     *ptrptr = ptr;
02637     if (lengthptr != NULL)
02638       {
02639       if (OFLOW_MAX - *lengthptr < code - last_code)
02640         {
02641         *errorcodeptr = ERR20;
02642         goto FAILED;
02643         }
02644       *lengthptr += code - last_code;   /* To include callout length */
02645       DPRINTF((">> end branch\n"));
02646       }
02647     return TRUE;
02648 
02649 
02650     /* ===================================================================*/
02651     /* Handle single-character metacharacters. In multiline mode, ^ disables
02652     the setting of any following char as a first character. */
02653 
02654     case '^':
02655     if ((options & PCRE_MULTILINE) != 0)
02656       {
02657       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
02658       }
02659     previous = NULL;
02660     *code++ = OP_CIRC;
02661     break;
02662 
02663     case '$':
02664     previous = NULL;
02665     *code++ = OP_DOLL;
02666     break;
02667 
02668     /* There can never be a first char if '.' is first, whatever happens about
02669     repeats. The value of reqbyte doesn't change either. */
02670 
02671     case '.':
02672     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
02673     zerofirstbyte = firstbyte;
02674     zeroreqbyte = reqbyte;
02675     previous = code;
02676     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
02677     break;
02678 
02679 
02680     /* ===================================================================*/
02681     /* Character classes. If the included characters are all < 256, we build a
02682     32-byte bitmap of the permitted characters, except in the special case
02683     where there is only one such character. For negated classes, we build the
02684     map as usual, then invert it at the end. However, we use a different opcode
02685     so that data characters > 255 can be handled correctly.
02686 
02687     If the class contains characters outside the 0-255 range, a different
02688     opcode is compiled. It may optionally have a bit map for characters < 256,
02689     but those above are explicitly listed afterwards. A flag byte tells
02690     whether the bitmap is present, and whether this is a negated class or not.
02691 
02692     In JavaScript compatibility mode, an isolated ']' causes an error. In
02693     default (Perl) mode, it is treated as a data character. */
02694 
02695     case ']':
02696     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
02697       {
02698       *errorcodeptr = ERR64;
02699       goto FAILED;
02700       }
02701     goto NORMAL_CHAR;
02702 
02703     case '[':
02704     previous = code;
02705 
02706     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
02707     they are encountered at the top level, so we'll do that too. */
02708 
02709     if ((ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
02710         check_posix_syntax(ptr, &tempptr))
02711       {
02712       *errorcodeptr = (ptr[1] == ':')? ERR13 : ERR31;
02713       goto FAILED;
02714       }
02715 
02716     /* If the first character is '^', set the negation flag and skip it. Also,
02717     if the first few characters (either before or after ^) are \Q\E or \E we
02718     skip them too. This makes for compatibility with Perl. */
02719 
02720     negate_class = FALSE;
02721     for (;;)
02722       {
02723       c = *(++ptr);
02724       if (c == '\\')
02725         {
02726         if (ptr[1] == 'E') ptr++;
02727           else if (strncmp((const char *)ptr+1, "Q\\E", 3) == 0) ptr += 3;
02728             else break;
02729         }
02730       else if (!negate_class && c == '^')
02731         negate_class = TRUE;
02732       else break;
02733       }
02734 
02735     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
02736     an initial ']' is taken as a data character -- the code below handles
02737     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
02738     [^] must match any character, so generate OP_ALLANY. */
02739 
02740     if (c ==']' && (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
02741       {
02742       *code++ = negate_class? OP_ALLANY : OP_FAIL;
02743       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
02744       zerofirstbyte = firstbyte;
02745       break;
02746       }
02747 
02748     /* If a class contains a negative special such as \S, we need to flip the
02749     negation flag at the end, so that support for characters > 255 works
02750     correctly (they are all included in the class). */
02751 
02752     should_flip_negation = FALSE;
02753 
02754     /* Keep a count of chars with values < 256 so that we can optimize the case
02755     of just a single character (as long as it's < 256). However, for higher
02756     valued UTF-8 characters, we don't yet do any optimization. */
02757 
02758     class_charcount = 0;
02759     class_lastchar = -1;
02760 
02761     /* Initialize the 32-char bit map to all zeros. We build the map in a
02762     temporary bit of memory, in case the class contains only 1 character (less
02763     than 256), because in that case the compiled code doesn't use the bit map.
02764     */
02765 
02766     memset(classbits, 0, 32 * sizeof(uschar));
02767 
02768 #ifdef SUPPORT_UTF8
02769     class_utf8 = FALSE;                       /* No chars >= 256 */
02770     class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
02771     class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
02772 #endif
02773 
02774     /* Process characters until ] is reached. By writing this as a "do" it
02775     means that an initial ] is taken as a data character. At the start of the
02776     loop, c contains the first byte of the character. */
02777 
02778     if (c != 0) do
02779       {
02780       const uschar *oldptr;
02781 
02782 #ifdef SUPPORT_UTF8
02783       if (utf8 && c > 127)
02784         {                           /* Braces are required because the */
02785         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
02786         }
02787 
02788       /* In the pre-compile phase, accumulate the length of any UTF-8 extra
02789       data and reset the pointer. This is so that very large classes that
02790       contain a zillion UTF-8 characters no longer overwrite the work space
02791       (which is on the stack). */
02792 
02793       if (lengthptr != NULL)
02794         {
02795         *lengthptr += class_utf8data - class_utf8data_base;
02796         class_utf8data = class_utf8data_base;
02797         }
02798 
02799 #endif
02800 
02801       /* Inside \Q...\E everything is literal except \E */
02802 
02803       if (inescq)
02804         {
02805         if (c == '\\' && ptr[1] == 'E')     /* If we are at \E */
02806           {
02807           inescq = FALSE;                   /* Reset literal state */
02808           ptr++;                            /* Skip the 'E' */
02809           continue;                         /* Carry on with next */
02810           }
02811         goto CHECK_RANGE;                   /* Could be range if \E follows */
02812         }
02813 
02814       /* Handle POSIX class names. Perl allows a negation extension of the
02815       form [:^name:]. A square bracket that doesn't match the syntax is
02816       treated as a literal. We also recognize the POSIX constructions
02817       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
02818       5.6 and 5.8 do. */
02819 
02820       if (c == '[' &&
02821           (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
02822           check_posix_syntax(ptr, &tempptr))
02823         {
02824         BOOL local_negate = FALSE;
02825         int posix_class, taboffset, tabopt;
02826         register const uschar *cbits = cd->cbits;
02827         uschar pbits[32];
02828 
02829         if (ptr[1] != ':')
02830           {
02831           *errorcodeptr = ERR31;
02832           goto FAILED;
02833           }
02834 
02835         ptr += 2;
02836         if (*ptr == '^')
02837           {
02838           local_negate = TRUE;
02839           should_flip_negation = TRUE;  /* Note negative special */
02840           ptr++;
02841           }
02842 
02843         posix_class = check_posix_name(ptr, tempptr - ptr);
02844         if (posix_class < 0)
02845           {
02846           *errorcodeptr = ERR30;
02847           goto FAILED;
02848           }
02849 
02850         /* If matching is caseless, upper and lower are converted to
02851         alpha. This relies on the fact that the class table starts with
02852         alpha, lower, upper as the first 3 entries. */
02853 
02854         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
02855           posix_class = 0;
02856 
02857         /* We build the bit map for the POSIX class in a chunk of local store
02858         because we may be adding and subtracting from it, and we don't want to
02859         subtract bits that may be in the main map already. At the end we or the
02860         result into the bit map that is being built. */
02861 
02862         posix_class *= 3;
02863 
02864         /* Copy in the first table (always present) */
02865 
02866         memcpy(pbits, cbits + posix_class_maps[posix_class],
02867           32 * sizeof(uschar));
02868 
02869         /* If there is a second table, add or remove it as required. */
02870 
02871         taboffset = posix_class_maps[posix_class + 1];
02872         tabopt = posix_class_maps[posix_class + 2];
02873 
02874         if (taboffset >= 0)
02875           {
02876           if (tabopt >= 0)
02877             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
02878           else
02879             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
02880           }
02881 
02882         /* Now see if we need to remove any special characters. An option
02883         value of 1 removes vertical space and 2 removes underscore. */
02884 
02885         if (tabopt < 0) tabopt = -tabopt;
02886         if (tabopt == 1) pbits[1] &= ~0x3c;
02887           else if (tabopt == 2) pbits[11] &= 0x7f;
02888 
02889         /* Add the POSIX table or its complement into the main table that is
02890         being built and we are done. */
02891 
02892         if (local_negate)
02893           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
02894         else
02895           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
02896 
02897         ptr = tempptr + 1;
02898         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
02899         continue;    /* End of POSIX syntax handling */
02900         }
02901 
02902       /* Backslash may introduce a single character, or it may introduce one
02903       of the specials, which just set a flag. The sequence \b is a special
02904       case. Inside a class (and only there) it is treated as backspace.
02905       Elsewhere it marks a word boundary. Other escapes have preset maps ready
02906       to 'or' into the one we are building. We assume they have more than one
02907       character in them, so set class_charcount bigger than one. */
02908 
02909       if (c == '\\')
02910         {
02911         c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
02912         if (*errorcodeptr != 0) goto FAILED;
02913 
02914         if (-c == ESC_b) c = '\b';       /* \b is backspace in a class */
02915         else if (-c == ESC_X) c = 'X';   /* \X is literal X in a class */
02916         else if (-c == ESC_R) c = 'R';   /* \R is literal R in a class */
02917         else if (-c == ESC_Q)            /* Handle start of quoted string */
02918           {
02919           if (ptr[1] == '\\' && ptr[2] == 'E')
02920             {
02921             ptr += 2; /* avoid empty string */
02922             }
02923           else inescq = TRUE;
02924           continue;
02925           }
02926         else if (-c == ESC_E) continue;  /* Ignore orphan \E */
02927 
02928         if (c < 0)
02929           {
02930           register const uschar *cbits = cd->cbits;
02931           class_charcount += 2;     /* Greater than 1 is what matters */
02932 
02933           /* Save time by not doing this in the pre-compile phase. */
02934 
02935           if (lengthptr == NULL) switch (-c)
02936             {
02937             case ESC_d:
02938             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
02939             continue;
02940 
02941             case ESC_D:
02942             should_flip_negation = TRUE;
02943             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
02944             continue;
02945 
02946             case ESC_w:
02947             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
02948             continue;
02949 
02950             case ESC_W:
02951             should_flip_negation = TRUE;
02952             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
02953             continue;
02954 
02955             case ESC_s:
02956             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
02957             classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
02958             continue;
02959 
02960             case ESC_S:
02961             should_flip_negation = TRUE;
02962             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
02963             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
02964             continue;
02965 
02966             default:    /* Not recognized; fall through */
02967             break;      /* Need "default" setting to stop compiler warning. */
02968             }
02969 
02970           /* In the pre-compile phase, just do the recognition. */
02971 
02972           else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
02973                    c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
02974 
02975           /* We need to deal with \H, \h, \V, and \v in both phases because
02976           they use extra memory. */
02977 
02978           if (-c == ESC_h)
02979             {
02980             SETBIT(classbits, 0x09); /* VT */
02981             SETBIT(classbits, 0x20); /* SPACE */
02982             SETBIT(classbits, 0xa0); /* NSBP */
02983 #ifdef SUPPORT_UTF8
02984             if (utf8)
02985               {
02986               class_utf8 = TRUE;
02987               *class_utf8data++ = XCL_SINGLE;
02988               class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
02989               *class_utf8data++ = XCL_SINGLE;
02990               class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
02991               *class_utf8data++ = XCL_RANGE;
02992               class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
02993               class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
02994               *class_utf8data++ = XCL_SINGLE;
02995               class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
02996               *class_utf8data++ = XCL_SINGLE;
02997               class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
02998               *class_utf8data++ = XCL_SINGLE;
02999               class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
03000               }
03001 #endif
03002             continue;
03003             }
03004 
03005           if (-c == ESC_H)
03006             {
03007             for (c = 0; c < 32; c++)
03008               {
03009               int x = 0xff;
03010               switch (c)
03011                 {
03012                 case 0x09/8: x ^= 1 << (0x09%8); break;
03013                 case 0x20/8: x ^= 1 << (0x20%8); break;
03014                 case 0xa0/8: x ^= 1 << (0xa0%8); break;
03015                 default: break;
03016                 }
03017               classbits[c] |= x;
03018               }
03019 
03020 #ifdef SUPPORT_UTF8
03021             if (utf8)
03022               {
03023               class_utf8 = TRUE;
03024               *class_utf8data++ = XCL_RANGE;
03025               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
03026               class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
03027               *class_utf8data++ = XCL_RANGE;
03028               class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
03029               class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
03030               *class_utf8data++ = XCL_RANGE;
03031               class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
03032               class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
03033               *class_utf8data++ = XCL_RANGE;
03034               class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
03035               class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
03036               *class_utf8data++ = XCL_RANGE;
03037               class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
03038               class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
03039               *class_utf8data++ = XCL_RANGE;
03040               class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
03041               class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
03042               *class_utf8data++ = XCL_RANGE;
03043               class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
03044               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
03045               }
03046 #endif
03047             continue;
03048             }
03049 
03050           if (-c == ESC_v)
03051             {
03052             SETBIT(classbits, 0x0a); /* LF */
03053             SETBIT(classbits, 0x0b); /* VT */
03054             SETBIT(classbits, 0x0c); /* FF */
03055             SETBIT(classbits, 0x0d); /* CR */
03056             SETBIT(classbits, 0x85); /* NEL */
03057 #ifdef SUPPORT_UTF8
03058             if (utf8)
03059               {
03060               class_utf8 = TRUE;
03061               *class_utf8data++ = XCL_RANGE;
03062               class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
03063               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
03064               }
03065 #endif
03066             continue;
03067             }
03068 
03069           if (-c == ESC_V)
03070             {
03071             for (c = 0; c < 32; c++)
03072               {
03073               int x = 0xff;
03074               switch (c)
03075                 {
03076                 case 0x0a/8: x ^= 1 << (0x0a%8);
03077                              x ^= 1 << (0x0b%8);
03078                              x ^= 1 << (0x0c%8);
03079                              x ^= 1 << (0x0d%8);
03080                              break;
03081                 case 0x85/8: x ^= 1 << (0x85%8); break;
03082                 default: break;
03083                 }
03084               classbits[c] |= x;
03085               }
03086 
03087 #ifdef SUPPORT_UTF8
03088             if (utf8)
03089               {
03090               class_utf8 = TRUE;
03091               *class_utf8data++ = XCL_RANGE;
03092               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
03093               class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
03094               *class_utf8data++ = XCL_RANGE;
03095               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
03096               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
03097               }
03098 #endif
03099             continue;
03100             }
03101 
03102           /* We need to deal with \P and \p in both phases. */
03103 
03104 #ifdef SUPPORT_UCP
03105           if (-c == ESC_p || -c == ESC_P)
03106             {
03107             BOOL negated;
03108             int pdata;
03109             int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
03110             if (ptype < 0) goto FAILED;
03111             class_utf8 = TRUE;
03112             *class_utf8data++ = ((-c == ESC_p) != negated)?
03113               XCL_PROP : XCL_NOTPROP;
03114             *class_utf8data++ = ptype;
03115             *class_utf8data++ = pdata;
03116             class_charcount -= 2;   /* Not a < 256 character */
03117             continue;
03118             }
03119 #endif
03120           /* Unrecognized escapes are faulted if PCRE is running in its
03121           strict mode. By default, for compatibility with Perl, they are
03122           treated as literals. */
03123 
03124           if ((options & PCRE_EXTRA) != 0)
03125             {
03126             *errorcodeptr = ERR7;
03127             goto FAILED;
03128             }
03129 
03130           class_charcount -= 2;  /* Undo the default count from above */
03131           c = *ptr;              /* Get the final character and fall through */
03132           }
03133 
03134         /* Fall through if we have a single character (c >= 0). This may be
03135         greater than 256 in UTF-8 mode. */
03136 
03137         }   /* End of backslash handling */
03138 
03139       /* A single character may be followed by '-' to form a range. However,
03140       Perl does not permit ']' to be the end of the range. A '-' character
03141       at the end is treated as a literal. Perl ignores orphaned \E sequences
03142       entirely. The code for handling \Q and \E is messy. */
03143 
03144       CHECK_RANGE:
03145       while (ptr[1] == '\\' && ptr[2] == 'E')
03146         {
03147         inescq = FALSE;
03148         ptr += 2;
03149         }
03150 
03151       oldptr = ptr;
03152 
03153       /* Remember \r or \n */
03154 
03155       if (c == '\r' || c == '\n') cd->external_flags |= PCRE_HASCRORLF;
03156 
03157       /* Check for range */
03158 
03159       if (!inescq && ptr[1] == '-')
03160         {
03161         int d;
03162         ptr += 2;
03163         while (*ptr == '\\' && ptr[1] == 'E') ptr += 2;
03164 
03165         /* If we hit \Q (not followed by \E) at this point, go into escaped
03166         mode. */
03167 
03168         while (*ptr == '\\' && ptr[1] == 'Q')
03169           {
03170           ptr += 2;
03171           if (*ptr == '\\' && ptr[1] == 'E') { ptr += 2; continue; }
03172           inescq = TRUE;
03173           break;
03174           }
03175 
03176         if (*ptr == 0 || (!inescq && *ptr == ']'))
03177           {
03178           ptr = oldptr;
03179           goto LONE_SINGLE_CHARACTER;
03180           }
03181 
03182 #ifdef SUPPORT_UTF8
03183         if (utf8)
03184           {                           /* Braces are required because the */
03185           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
03186           }
03187         else
03188 #endif
03189         d = *ptr;  /* Not UTF-8 mode */
03190 
03191         /* The second part of a range can be a single-character escape, but
03192         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
03193         in such circumstances. */
03194 
03195         if (!inescq && d == '\\')
03196           {
03197           d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
03198           if (*errorcodeptr != 0) goto FAILED;
03199 
03200           /* \b is backspace; \X is literal X; \R is literal R; any other
03201           special means the '-' was literal */
03202 
03203           if (d < 0)
03204             {
03205             if (d == -ESC_b) d = '\b';
03206             else if (d == -ESC_X) d = 'X';
03207             else if (d == -ESC_R) d = 'R'; else
03208               {
03209               ptr = oldptr;
03210               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
03211               }
03212             }
03213           }
03214 
03215         /* Check that the two values are in the correct order. Optimize
03216         one-character ranges */
03217 
03218         if (d < c)
03219           {
03220           *errorcodeptr = ERR8;
03221           goto FAILED;
03222           }
03223 
03224         if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
03225 
03226         /* Remember \r or \n */
03227 
03228         if (d == '\r' || d == '\n') cd->external_flags |= PCRE_HASCRORLF;
03229 
03230         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
03231         matching, we have to use an XCLASS with extra data items. Caseless
03232         matching for characters > 127 is available only if UCP support is
03233         available. */
03234 
03235 #ifdef SUPPORT_UTF8
03236         if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
03237           {
03238           class_utf8 = TRUE;
03239 
03240           /* With UCP support, we can find the other case equivalents of
03241           the relevant characters. There may be several ranges. Optimize how
03242           they fit with the basic range. */
03243 
03244 #ifdef SUPPORT_UCP
03245           if ((options & PCRE_CASELESS) != 0)
03246             {
03247             unsigned int occ, ocd;
03248             unsigned int cc = c;
03249             unsigned int origd = d;
03250             while (get_othercase_range(&cc, origd, &occ, &ocd))
03251               {
03252               if (occ >= (unsigned int)c &&
03253                   ocd <= (unsigned int)d)
03254                 continue;                          /* Skip embedded ranges */
03255 
03256               if (occ < (unsigned int)c  &&
03257                   ocd >= (unsigned int)c - 1)      /* Extend the basic range */
03258                 {                                  /* if there is overlap,   */
03259                 c = occ;                           /* noting that if occ < c */
03260                 continue;                          /* we can't have ocd > d  */
03261                 }                                  /* because a subrange is  */
03262               if (ocd > (unsigned int)d &&
03263                   occ <= (unsigned int)d + 1)      /* always shorter than    */
03264                 {                                  /* the basic range.       */
03265                 d = ocd;
03266                 continue;
03267                 }
03268 
03269               if (occ == ocd)
03270                 {
03271                 *class_utf8data++ = XCL_SINGLE;
03272                 }
03273               else
03274                 {
03275                 *class_utf8data++ = XCL_RANGE;
03276                 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
03277                 }
03278               class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
03279               }
03280             }
03281 #endif  /* SUPPORT_UCP */
03282 
03283           /* Now record the original range, possibly modified for UCP caseless
03284           overlapping ranges. */
03285 
03286           *class_utf8data++ = XCL_RANGE;
03287           class_utf8data += _pcre_ord2utf8(c, class_utf8data);
03288           class_utf8data += _pcre_ord2utf8(d, class_utf8data);
03289 
03290           /* With UCP support, we are done. Without UCP support, there is no
03291           caseless matching for UTF-8 characters > 127; we can use the bit map
03292           for the smaller ones. */
03293 
03294 #ifdef SUPPORT_UCP
03295           continue;    /* With next character in the class */
03296 #else
03297           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
03298 
03299           /* Adjust upper limit and fall through to set up the map */
03300 
03301           d = 127;
03302 
03303 #endif  /* SUPPORT_UCP */
03304           }
03305 #endif  /* SUPPORT_UTF8 */
03306 
03307         /* We use the bit map for all cases when not in UTF-8 mode; else
03308         ranges that lie entirely within 0-127 when there is UCP support; else
03309         for partial ranges without UCP support. */
03310 
03311         class_charcount += d - c + 1;
03312         class_lastchar = d;
03313 
03314         /* We can save a bit of time by skipping this in the pre-compile. */
03315 
03316         if (lengthptr == NULL) for (; c <= d; c++)
03317           {
03318           classbits[c/8] |= (1 << (c&7));
03319           if ((options & PCRE_CASELESS) != 0)
03320             {
03321             int uc = cd->fcc[c];           /* flip case */
03322             classbits[uc/8] |= (1 << (uc&7));
03323             }
03324           }
03325 
03326         continue;   /* Go get the next char in the class */
03327         }
03328 
03329       /* Handle a lone single character - we can get here for a normal
03330       non-escape char, or after \ that introduces a single character or for an
03331       apparent range that isn't. */
03332 
03333       LONE_SINGLE_CHARACTER:
03334 
03335       /* Handle a character that cannot go in the bit map */
03336 
03337 #ifdef SUPPORT_UTF8
03338       if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
03339         {
03340         class_utf8 = TRUE;
03341         *class_utf8data++ = XCL_SINGLE;
03342         class_utf8data += _pcre_ord2utf8(c, class_utf8data);
03343 
03344 #ifdef SUPPORT_UCP
03345         if ((options & PCRE_CASELESS) != 0)
03346           {
03347           unsigned int othercase;
03348           if ((othercase = UCD_OTHERCASE(c)) != c)
03349             {
03350             *class_utf8data++ = XCL_SINGLE;
03351             class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
03352             }
03353           }
03354 #endif  /* SUPPORT_UCP */
03355 
03356         }
03357       else
03358 #endif  /* SUPPORT_UTF8 */
03359 
03360       /* Handle a single-byte character */
03361         {
03362         classbits[c/8] |= (1 << (c&7));
03363         if ((options & PCRE_CASELESS) != 0)
03364           {
03365           c = cd->fcc[c];   /* flip case */
03366           classbits[c/8] |= (1 << (c&7));
03367           }
03368         class_charcount++;
03369         class_lastchar = c;
03370         }
03371       }
03372 
03373     /* Loop until ']' reached. This "while" is the end of the "do" above. */
03374 
03375     while ((c = *(++ptr)) != 0 && (c != ']' || inescq));
03376 
03377     if (c == 0)                          /* Missing terminating ']' */
03378       {
03379       *errorcodeptr = ERR6;
03380       goto FAILED;
03381       }
03382 
03383 
03384 /* This code has been disabled because it would mean that \s counts as
03385 an explicit \r or \n reference, and that's not really what is wanted. Now
03386 we set the flag only if there is a literal "\r" or "\n" in the class. */
03387 
03388 #if 0
03389     /* Remember whether \r or \n are in this class */
03390 
03391     if (negate_class)
03392       {
03393       if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
03394       }
03395     else
03396       {
03397       if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
03398       }
03399 #endif
03400 
03401 
03402     /* If class_charcount is 1, we saw precisely one character whose value is
03403     less than 256. As long as there were no characters >= 128 and there was no
03404     use of \p or \P, in other words, no use of any XCLASS features, we can
03405     optimize.
03406 
03407     In UTF-8 mode, we can optimize the negative case only if there were no
03408     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
03409     operate on single-bytes only. This is an historical hangover. Maybe one day
03410     we can tidy these opcodes to handle multi-byte characters.
03411 
03412     The optimization throws away the bit map. We turn the item into a
03413     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
03414     that OP_NOT does not support multibyte characters. In the positive case, it
03415     can cause firstbyte to be set. Otherwise, there can be no first char if
03416     this item is first, whatever repeat count may follow. In the case of
03417     reqbyte, save the previous value for reinstating. */
03418 
03419 #ifdef SUPPORT_UTF8
03420     if (class_charcount == 1 && !class_utf8 &&
03421       (!utf8 || !negate_class || class_lastchar < 128))
03422 #else
03423     if (class_charcount == 1)
03424 #endif
03425       {
03426       zeroreqbyte = reqbyte;
03427 
03428       /* The OP_NOT opcode works on one-byte characters only. */
03429 
03430       if (negate_class)
03431         {
03432         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
03433         zerofirstbyte = firstbyte;
03434         *code++ = OP_NOT;
03435         *code++ = class_lastchar;
03436         break;
03437         }
03438 
03439       /* For a single, positive character, get the value into mcbuffer, and
03440       then we can handle this with the normal one-character code. */
03441 
03442 #ifdef SUPPORT_UTF8
03443       if (utf8 && class_lastchar > 127)
03444         mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
03445       else
03446 #endif
03447         {
03448         mcbuffer[0] = class_lastchar;
03449         mclength = 1;
03450         }
03451       goto ONE_CHAR;
03452       }       /* End of 1-char optimization */
03453 
03454     /* The general case - not the one-char optimization. If this is the first
03455     thing in the branch, there can be no first char setting, whatever the
03456     repeat count. Any reqbyte setting must remain unchanged after any kind of
03457     repeat. */
03458 
03459     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
03460     zerofirstbyte = firstbyte;
03461     zeroreqbyte = reqbyte;
03462 
03463     /* If there are characters with values > 255, we have to compile an
03464     extended class, with its own opcode, unless there was a negated special
03465     such as \S in the class, because in that case all characters > 255 are in
03466     the class, so any that were explicitly given as well can be ignored. If
03467     (when there are explicit characters > 255 that must be listed) there are no
03468     characters < 256, we can omit the bitmap in the actual compiled code. */
03469 
03470 #ifdef SUPPORT_UTF8
03471     if (class_utf8 && !should_flip_negation)
03472       {
03473       *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
03474       *code++ = OP_XCLASS;
03475       code += LINK_SIZE;
03476       *code = negate_class? XCL_NOT : 0;
03477 
03478       /* If the map is required, move up the extra data to make room for it;
03479       otherwise just move the code pointer to the end of the extra data. */
03480 
03481       if (class_charcount > 0)
03482         {
03483         *code++ |= XCL_MAP;
03484         memmove(code + 32, code, class_utf8data - code);
03485         memcpy(code, classbits, 32);
03486         code = class_utf8data + 32;
03487         }
03488       else code = class_utf8data;
03489 
03490       /* Now fill in the complete length of the item */
03491 
03492       PUT(previous, 1, code - previous);
03493       break;   /* End of class handling */
03494       }
03495 #endif
03496 
03497     /* If there are no characters > 255, set the opcode to OP_CLASS or
03498     OP_NCLASS, depending on whether the whole class was negated and whether
03499     there were negative specials such as \S in the class. Then copy the 32-byte
03500     map into the code vector, negating it if necessary. */
03501 
03502     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
03503     if (negate_class)
03504       {
03505       if (lengthptr == NULL)    /* Save time in the pre-compile phase */
03506         for (c = 0; c < 32; c++) code[c] = ~classbits[c];
03507       }
03508     else
03509       {
03510       memcpy(code, classbits, 32);
03511       }
03512     code += 32;
03513     break;
03514 
03515 
03516     /* ===================================================================*/
03517     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
03518     has been tested above. */
03519 
03520     case '{':
03521     if (!is_quantifier) goto NORMAL_CHAR;
03522     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
03523     if (*errorcodeptr != 0) goto FAILED;
03524     goto REPEAT;
03525 
03526     case '*':
03527     repeat_min = 0;
03528     repeat_max = -1;
03529     goto REPEAT;
03530 
03531     case '+':
03532     repeat_min = 1;
03533     repeat_max = -1;
03534     goto REPEAT;
03535 
03536     case '?':
03537     repeat_min = 0;
03538     repeat_max = 1;
03539 
03540     REPEAT:
03541     if (previous == NULL)
03542       {
03543       *errorcodeptr = ERR9;
03544       goto FAILED;
03545       }
03546 
03547     if (repeat_min == 0)
03548       {
03549       firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
03550       reqbyte = zeroreqbyte;        /* Ditto */
03551       }
03552 
03553     /* Remember whether this is a variable length repeat */
03554 
03555     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
03556 
03557     op_type = 0;                    /* Default single-char op codes */
03558     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
03559 
03560     /* Save start of previous item, in case we have to move it up to make space
03561     for an inserted OP_ONCE for the additional '+' extension. */
03562 
03563     tempcode = previous;
03564 
03565     /* If the next character is '+', we have a possessive quantifier. This
03566     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
03567     If the next character is '?' this is a minimizing repeat, by default,
03568     but if PCRE_UNGREEDY is set, it works the other way round. We change the
03569     repeat type to the non-default. */
03570 
03571     if (ptr[1] == '+')
03572       {
03573       repeat_type = 0;                  /* Force greedy */
03574       possessive_quantifier = TRUE;
03575       ptr++;
03576       }
03577     else if (ptr[1] == '?')
03578       {
03579       repeat_type = greedy_non_default;
03580       ptr++;
03581       }
03582     else repeat_type = greedy_default;
03583 
03584     /* If previous was a character match, abolish the item and generate a
03585     repeat item instead. If a char item has a minimum of more than one, ensure
03586     that it is set in reqbyte - it might not be if a sequence such as x{3} is
03587     the first thing in a branch because the x will have gone into firstbyte
03588     instead.  */
03589 
03590     if (*previous == OP_CHAR || *previous == OP_CHARNC)
03591       {
03592       /* Deal with UTF-8 characters that take up more than one byte. It's
03593       easier to write this out separately than try to macrify it. Use c to
03594       hold the length of the character in bytes, plus 0x80 to flag that it's a
03595       length rather than a small character. */
03596 
03597 #ifdef SUPPORT_UTF8
03598       if (utf8 && (code[-1] & 0x80) != 0)
03599         {
03600         uschar *lastchar = code - 1;
03601         while((*lastchar & 0xc0) == 0x80) lastchar--;
03602         c = code - lastchar;            /* Length of UTF-8 character */
03603         memcpy(utf8_char, lastchar, c); /* Save the char */
03604         c |= 0x80;                      /* Flag c as a length */
03605         }
03606       else
03607 #endif
03608 
03609       /* Handle the case of a single byte - either with no UTF8 support, or
03610       with UTF-8 disabled, or for a UTF-8 character < 128. */
03611 
03612         {
03613         c = code[-1];
03614         if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
03615         }
03616 
03617       /* If the repetition is unlimited, it pays to see if the next thing on
03618       the line is something that cannot possibly match this character. If so,
03619       automatically possessifying this item gains some performance in the case
03620       where the match fails. */
03621 
03622       if (!possessive_quantifier &&
03623           repeat_max < 0 &&
03624           check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
03625             options, cd))
03626         {
03627         repeat_type = 0;    /* Force greedy */
03628         possessive_quantifier = TRUE;
03629         }
03630 
03631       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
03632       }
03633 
03634     /* If previous was a single negated character ([^a] or similar), we use
03635     one of the special opcodes, replacing it. The code is shared with single-
03636     character repeats by setting op_type to add a suitable offset into
03637     repeat_type. We can also test for auto-possessification. OP_NOT is
03638     currently used only for single-byte chars. */
03639 
03640     else if (*previous == OP_NOT)
03641       {
03642       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
03643       c = previous[1];
03644       if (!possessive_quantifier &&
03645           repeat_max < 0 &&
03646           check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
03647         {
03648         repeat_type = 0;    /* Force greedy */
03649         possessive_quantifier = TRUE;
03650         }
03651       goto OUTPUT_SINGLE_REPEAT;
03652       }
03653 
03654     /* If previous was a character type match (\d or similar), abolish it and
03655     create a suitable repeat item. The code is shared with single-character
03656     repeats by setting op_type to add a suitable offset into repeat_type. Note
03657     that the Unicode property types will be present only when SUPPORT_UCP is
03658     defined, but we don't wrap the little bits of code here because it just
03659     makes it horribly messy. */
03660 
03661     else if (*previous < OP_EODN)
03662       {
03663       uschar *oldcode;
03664       int prop_type, prop_value;
03665       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
03666       c = *previous;
03667 
03668       if (!possessive_quantifier &&
03669           repeat_max < 0 &&
03670           check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
03671         {
03672         repeat_type = 0;    /* Force greedy */
03673         possessive_quantifier = TRUE;
03674         }
03675 
03676       OUTPUT_SINGLE_REPEAT:
03677       if (*previous == OP_PROP || *previous == OP_NOTPROP)
03678         {
03679         prop_type = previous[1];
03680         prop_value = previous[2];
03681         }
03682       else prop_type = prop_value = -1;
03683 
03684       oldcode = code;
03685       code = previous;                  /* Usually overwrite previous item */
03686 
03687       /* If the maximum is zero then the minimum must also be zero; Perl allows
03688       this case, so we do too - by simply omitting the item altogether. */
03689 
03690       if (repeat_max == 0) goto END_REPEAT;
03691 
03692       /* All real repeats make it impossible to handle partial matching (maybe
03693       one day we will be able to remove this restriction). */
03694 
03695       if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
03696 
03697       /* Combine the op_type with the repeat_type */
03698 
03699       repeat_type += op_type;
03700 
03701       /* A minimum of zero is handled either as the special case * or ?, or as
03702       an UPTO, with the maximum given. */
03703 
03704       if (repeat_min == 0)
03705         {
03706         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
03707           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
03708         else
03709           {
03710           *code++ = OP_UPTO + repeat_type;
03711           PUT2INC(code, 0, repeat_max);
03712           }
03713         }
03714 
03715       /* A repeat minimum of 1 is optimized into some special cases. If the
03716       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
03717       left in place and, if the maximum is greater than 1, we use OP_UPTO with
03718       one less than the maximum. */
03719 
03720       else if (repeat_min == 1)
03721         {
03722         if (repeat_max == -1)
03723           *code++ = OP_PLUS + repeat_type;
03724         else
03725           {
03726           code = oldcode;                 /* leave previous item in place */
03727           if (repeat_max == 1) goto END_REPEAT;
03728           *code++ = OP_UPTO + repeat_type;
03729           PUT2INC(code, 0, repeat_max - 1);
03730           }
03731         }
03732 
03733       /* The case {n,n} is just an EXACT, while the general case {n,m} is
03734       handled as an EXACT followed by an UPTO. */
03735 
03736       else
03737         {
03738         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
03739         PUT2INC(code, 0, repeat_min);
03740 
03741         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
03742         we have to insert the character for the previous code. For a repeated
03743         Unicode property match, there are two extra bytes that define the
03744         required property. In UTF-8 mode, long characters have their length in
03745         c, with the 0x80 bit as a flag. */
03746 
03747         if (repeat_max < 0)
03748           {
03749 #ifdef SUPPORT_UTF8
03750           if (utf8 && c >= 128)
03751             {
03752             memcpy(code, utf8_char, c & 7);
03753             code += c & 7;
03754             }
03755           else
03756 #endif
03757             {
03758             *code++ = c;
03759             if (prop_type >= 0)
03760               {
03761               *code++ = prop_type;
03762               *code++ = prop_value;
03763               }
03764             }
03765           *code++ = OP_STAR + repeat_type;
03766           }
03767 
03768         /* Else insert an UPTO if the max is greater than the min, again
03769         preceded by the character, for the previously inserted code. If the
03770         UPTO is just for 1 instance, we can use QUERY instead. */
03771 
03772         else if (repeat_max != repeat_min)
03773           {
03774 #ifdef SUPPORT_UTF8
03775           if (utf8 && c >= 128)
03776             {
03777             memcpy(code, utf8_char, c & 7);
03778             code += c & 7;
03779             }
03780           else
03781 #endif
03782           *code++ = c;
03783           if (prop_type >= 0)
03784             {
03785             *code++ = prop_type;
03786             *code++ = prop_value;
03787             }
03788           repeat_max -= repeat_min;
03789 
03790           if (repeat_max == 1)
03791             {
03792             *code++ = OP_QUERY + repeat_type;
03793             }
03794           else
03795             {
03796             *code++ = OP_UPTO + repeat_type;
03797             PUT2INC(code, 0, repeat_max);
03798             }
03799           }
03800         }
03801 
03802       /* The character or character type itself comes last in all cases. */
03803 
03804 #ifdef SUPPORT_UTF8
03805       if (utf8 && c >= 128)
03806         {
03807         memcpy(code, utf8_char, c & 7);
03808         code += c & 7;
03809         }
03810       else
03811 #endif
03812       *code++ = c;
03813 
03814       /* For a repeated Unicode property match, there are two extra bytes that
03815       define the required property. */
03816 
03817 #ifdef SUPPORT_UCP
03818       if (prop_type >= 0)
03819         {
03820         *code++ = prop_type;
03821         *code++ = prop_value;
03822         }
03823 #endif
03824       }
03825 
03826     /* If previous was a character class or a back reference, we put the repeat
03827     stuff after it, but just skip the item if the repeat was {0,0}. */
03828 
03829     else if (*previous == OP_CLASS ||
03830              *previous == OP_NCLASS ||
03831 #ifdef SUPPORT_UTF8
03832              *previous == OP_XCLASS ||
03833 #endif
03834              *previous == OP_REF)
03835       {
03836       if (repeat_max == 0)
03837         {
03838         code = previous;
03839         goto END_REPEAT;
03840         }
03841 
03842       /* All real repeats make it impossible to handle partial matching (maybe
03843       one day we will be able to remove this restriction). */
03844 
03845       if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
03846 
03847       if (repeat_min == 0 && repeat_max == -1)
03848         *code++ = OP_CRSTAR + repeat_type;
03849       else if (repeat_min == 1 && repeat_max == -1)
03850         *code++ = OP_CRPLUS + repeat_type;
03851       else if (repeat_min == 0 && repeat_max == 1)
03852         *code++ = OP_CRQUERY + repeat_type;
03853       else
03854         {
03855         *code++ = OP_CRRANGE + repeat_type;
03856         PUT2INC(code, 0, repeat_min);
03857         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
03858         PUT2INC(code, 0, repeat_max);
03859         }
03860       }
03861 
03862     /* If previous was a bracket group, we may have to replicate it in certain
03863     cases. */
03864 
03865     else if (*previous == OP_BRA  || *previous == OP_CBRA ||
03866              *previous == OP_ONCE || *previous == OP_COND)
03867       {
03868       register int i;
03869       int ketoffset = 0;
03870       int len = code - previous;
03871       uschar *bralink = NULL;
03872 
03873       /* Repeating a DEFINE group is pointless */
03874 
03875       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
03876         {
03877         *errorcodeptr = ERR55;
03878         goto FAILED;
03879         }
03880 
03881       /* If the maximum repeat count is unlimited, find the end of the bracket
03882       by scanning through from the start, and compute the offset back to it
03883       from the current code pointer. There may be an OP_OPT setting following
03884       the final KET, so we can't find the end just by going back from the code
03885       pointer. */
03886 
03887       if (repeat_max == -1)
03888         {
03889         register uschar *ket = previous;
03890         do ket += GET(ket, 1); while (*ket != OP_KET);
03891         ketoffset = code - ket;
03892         }
03893 
03894       /* The case of a zero minimum is special because of the need to stick
03895       OP_BRAZERO in front of it, and because the group appears once in the
03896       data, whereas in other cases it appears the minimum number of times. For
03897       this reason, it is simplest to treat this case separately, as otherwise
03898       the code gets far too messy. There are several special subcases when the
03899       minimum is zero. */
03900 
03901       if (repeat_min == 0)
03902         {
03903         /* If the maximum is also zero, we used to just omit the group from the
03904         output altogether, like this:
03905 
03906         ** if (repeat_max == 0)
03907         **   {
03908         **   code = previous;
03909         **   goto END_REPEAT;
03910         **   }
03911 
03912         However, that fails when a group is referenced as a subroutine from
03913         elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
03914         so that it is skipped on execution. As we don't have a list of which
03915         groups are referenced, we cannot do this selectively.
03916 
03917         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
03918         and do no more at this point. However, we do need to adjust any
03919         OP_RECURSE calls inside the group that refer to the group itself or any
03920         internal or forward referenced group, because the offset is from the
03921         start of the whole regex. Temporarily terminate the pattern while doing
03922         this. */
03923 
03924         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
03925           {
03926           *code = OP_END;
03927           adjust_recurse(previous, 1, utf8, cd, save_hwm);
03928           memmove(previous+1, previous, len);
03929           code++;
03930           if (repeat_max == 0)
03931             {
03932             *previous++ = OP_SKIPZERO;
03933             goto END_REPEAT;
03934             }
03935           *previous++ = OP_BRAZERO + repeat_type;
03936           }
03937 
03938         /* If the maximum is greater than 1 and limited, we have to replicate
03939         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
03940         The first one has to be handled carefully because it's the original
03941         copy, which has to be moved up. The remainder can be handled by code
03942         that is common with the non-zero minimum case below. We have to
03943         adjust the value of repeat_max, since one less copy is required. Once
03944         again, we may have to adjust any OP_RECURSE calls inside the group. */
03945 
03946         else
03947           {
03948           int offset;
03949           *code = OP_END;
03950           adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
03951           memmove(previous + 2 + LINK_SIZE, previous, len);
03952           code += 2 + LINK_SIZE;
03953           *previous++ = OP_BRAZERO + repeat_type;
03954           *previous++ = OP_BRA;
03955 
03956           /* We chain together the bracket offset fields that have to be
03957           filled in later when the ends of the brackets are reached. */
03958 
03959           offset = (bralink == NULL)? 0 : previous - bralink;
03960           bralink = previous;
03961           PUTINC(previous, 0, offset);
03962           }
03963 
03964         repeat_max--;
03965         }
03966 
03967       /* If the minimum is greater than zero, replicate the group as many
03968       times as necessary, and adjust the maximum to the number of subsequent
03969       copies that we need. If we set a first char from the group, and didn't
03970       set a required char, copy the latter from the former. If there are any
03971       forward reference subroutine calls in the group, there will be entries on
03972       the workspace list; replicate these with an appropriate increment. */
03973 
03974       else
03975         {
03976         if (repeat_min > 1)
03977           {
03978           /* In the pre-compile phase, we don't actually do the replication. We
03979           just adjust the length as if we had. Do some paranoid checks for
03980           potential integer overflow. */
03981 
03982           if (lengthptr != NULL)
03983             {
03984             int delta = (repeat_min - 1)*length_prevgroup;
03985             if ((double)(repeat_min - 1)*(double)length_prevgroup >
03986                                                             (double)INT_MAX ||
03987                 OFLOW_MAX - *lengthptr < delta)
03988               {
03989               *errorcodeptr = ERR20;
03990               goto FAILED;
03991               }
03992             *lengthptr += delta;
03993             }
03994 
03995           /* This is compiling for real */
03996 
03997           else
03998             {
03999             if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
04000             for (i = 1; i < repeat_min; i++)
04001               {
04002               uschar *hc;
04003               uschar *this_hwm = cd->hwm;
04004               memcpy(code, previous, len);
04005               for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
04006                 {
04007                 PUT(cd->hwm, 0, GET(hc, 0) + len);
04008                 cd->hwm += LINK_SIZE;
04009                 }
04010               save_hwm = this_hwm;
04011               code += len;
04012               }
04013             }
04014           }
04015 
04016         if (repeat_max > 0) repeat_max -= repeat_min;
04017         }
04018 
04019       /* This code is common to both the zero and non-zero minimum cases. If
04020       the maximum is limited, it replicates the group in a nested fashion,
04021       remembering the bracket starts on a stack. In the case of a zero minimum,
04022       the first one was set up above. In all cases the repeat_max now specifies
04023       the number of additional copies needed. Again, we must remember to
04024       replicate entries on the forward reference list. */
04025 
04026       if (repeat_max >= 0)
04027         {
04028         /* In the pre-compile phase, we don't actually do the replication. We
04029         just adjust the length as if we had. For each repetition we must add 1
04030         to the length for BRAZERO and for all but the last repetition we must
04031         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
04032         paranoid checks to avoid integer overflow. */
04033 
04034         if (lengthptr != NULL && repeat_max > 0)
04035           {
04036           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
04037                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
04038           if ((double)repeat_max *
04039                 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
04040                   > (double)INT_MAX ||
04041               OFLOW_MAX - *lengthptr < delta)
04042             {
04043             *errorcodeptr = ERR20;
04044             goto FAILED;
04045             }
04046           *lengthptr += delta;
04047           }
04048 
04049         /* This is compiling for real */
04050 
04051         else for (i = repeat_max - 1; i >= 0; i--)
04052           {
04053           uschar *hc;
04054           uschar *this_hwm = cd->hwm;
04055 
04056           *code++ = OP_BRAZERO + repeat_type;
04057 
04058           /* All but the final copy start a new nesting, maintaining the
04059           chain of brackets outstanding. */
04060 
04061           if (i != 0)
04062             {
04063             int offset;
04064             *code++ = OP_BRA;
04065             offset = (bralink == NULL)? 0 : code - bralink;
04066             bralink = code;
04067             PUTINC(code, 0, offset);
04068             }
04069 
04070           memcpy(code, previous, len);
04071           for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
04072             {
04073             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
04074             cd->hwm += LINK_SIZE;
04075             }
04076           save_hwm = this_hwm;
04077           code += len;
04078           }
04079 
04080         /* Now chain through the pending brackets, and fill in their length
04081         fields (which are holding the chain links pro tem). */
04082 
04083         while (bralink != NULL)
04084           {
04085           int oldlinkoffset;
04086           int offset = code - bralink + 1;
04087           uschar *bra = code - offset;
04088           oldlinkoffset = GET(bra, 1);
04089           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
04090           *code++ = OP_KET;
04091           PUTINC(code, 0, offset);
04092           PUT(bra, 1, offset);
04093           }
04094         }
04095 
04096       /* If the maximum is unlimited, set a repeater in the final copy. We
04097       can't just offset backwards from the current code point, because we
04098       don't know if there's been an options resetting after the ket. The
04099       correct offset was computed above.
04100 
04101       Then, when we are doing the actual compile phase, check to see whether
04102       this group is a non-atomic one that could match an empty string. If so,
04103       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
04104       that runtime checking can be done. [This check is also applied to
04105       atomic groups at runtime, but in a different way.] */
04106 
04107       else
04108         {
04109         uschar *ketcode = code - ketoffset;
04110         uschar *bracode = ketcode - GET(ketcode, 1);
04111         *ketcode = OP_KETRMAX + repeat_type;
04112         if (lengthptr == NULL && *bracode != OP_ONCE)
04113           {
04114           uschar *scode = bracode;
04115           do
04116             {
04117             if (could_be_empty_branch(scode, ketcode, utf8))
04118               {
04119               *bracode += OP_SBRA - OP_BRA;
04120               break;
04121               }
04122             scode += GET(scode, 1);
04123             }
04124           while (*scode == OP_ALT);
04125           }
04126         }
04127       }
04128 
04129     /* If previous is OP_FAIL, it was generated by an empty class [] in
04130     JavaScript mode. The other ways in which OP_FAIL can be generated, that is
04131     by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
04132     error above. We can just ignore the repeat in JS case. */
04133 
04134     else if (*previous == OP_FAIL) goto END_REPEAT;
04135 
04136     /* Else there's some kind of shambles */
04137 
04138     else
04139       {
04140       *errorcodeptr = ERR11;
04141       goto FAILED;
04142       }
04143 
04144     /* If the character following a repeat is '+', or if certain optimization
04145     tests above succeeded, possessive_quantifier is TRUE. For some of the
04146     simpler opcodes, there is a special alternative opcode for this. For
04147     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
04148     The '+' notation is just syntactic sugar, taken from Sun's Java package,
04149     but the special opcodes can optimize it a bit. The repeated item starts at
04150     tempcode, not at previous, which might be the first part of a string whose
04151     (former) last char we repeated.
04152 
04153     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
04154     an 'upto' may follow. We skip over an 'exact' item, and then test the
04155     length of what remains before proceeding. */
04156 
04157     if (possessive_quantifier)
04158       {
04159       int len;
04160       if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
04161           *tempcode == OP_NOTEXACT)
04162         tempcode += _pcre_OP_lengths[*tempcode] +
04163           ((*tempcode == OP_TYPEEXACT &&
04164              (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
04165       len = code - tempcode;
04166       if (len > 0) switch (*tempcode)
04167         {
04168         case OP_STAR:  *tempcode = OP_POSSTAR; break;
04169         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
04170         case OP_QUERY: *tempcode = OP_POSQUERY; break;
04171         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
04172 
04173         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
04174         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
04175         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
04176         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
04177 
04178         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
04179         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
04180         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
04181         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
04182 
04183         default:
04184         memmove(tempcode + 1+LINK_SIZE, tempcode, len);
04185         code += 1 + LINK_SIZE;
04186         len += 1 + LINK_SIZE;
04187         tempcode[0] = OP_ONCE;
04188         *code++ = OP_KET;
04189         PUTINC(code, 0, len);
04190         PUT(tempcode, 1, len);
04191         break;
04192         }
04193       }
04194 
04195     /* In all cases we no longer have a previous item. We also set the
04196     "follows varying string" flag for subsequently encountered reqbytes if
04197     it isn't already set and we have just passed a varying length item. */
04198 
04199     END_REPEAT:
04200     previous = NULL;
04201     cd->req_varyopt |= reqvary;
04202     break;
04203 
04204 
04205     /* ===================================================================*/
04206     /* Start of nested parenthesized sub-expression, or comment or lookahead or
04207     lookbehind or option setting or condition or all the other extended
04208     parenthesis forms.  */
04209 
04210     case '(':
04211     newoptions = options;
04212     skipbytes = 0;
04213     bravalue = OP_CBRA;
04214     save_hwm = cd->hwm;
04215     reset_bracount = FALSE;
04216 
04217     /* First deal with various "verbs" that can be introduced by '*'. */
04218 
04219     if (*(++ptr) == '*' && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
04220       {
04221       int i, namelen;
04222       const char *vn = verbnames;
04223       const uschar *name = ++ptr;
04224       previous = NULL;
04225       while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
04226       if (*ptr == ':')
04227         {
04228         *errorcodeptr = ERR59;   /* Not supported */
04229         goto FAILED;
04230         }
04231       if (*ptr != ')')
04232         {
04233         *errorcodeptr = ERR60;
04234         goto FAILED;
04235         }
04236       namelen = ptr - name;
04237       for (i = 0; i < verbcount; i++)
04238         {
04239         if (namelen == verbs[i].len &&
04240             strncmp((char *)name, vn, namelen) == 0)
04241           {
04242           *code = verbs[i].op;
04243           if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
04244           break;
04245           }
04246         vn += verbs[i].len + 1;
04247         }
04248       if (i < verbcount) continue;
04249       *errorcodeptr = ERR60;
04250       goto FAILED;
04251       }
04252 
04253     /* Deal with the extended parentheses; all are introduced by '?', and the
04254     appearance of any of them means that this is not a capturing group. */
04255 
04256     else if (*ptr == '?')
04257       {
04258       int i, set, unset, namelen;
04259       int *optset;
04260       const uschar *name;
04261       uschar *slot;
04262 
04263       switch (*(++ptr))
04264         {
04265         case '#':                 /* Comment; skip to ket */
04266         ptr++;
04267         while (*ptr != 0 && *ptr != ')') ptr++;
04268         if (*ptr == 0)
04269           {
04270           *errorcodeptr = ERR18;
04271           goto FAILED;
04272           }
04273         continue;
04274 
04275 
04276         /* ------------------------------------------------------------ */
04277         case '|':                 /* Reset capture count for each branch */
04278         reset_bracount = TRUE;
04279         /* Fall through */
04280 
04281         /* ------------------------------------------------------------ */
04282         case ':':                 /* Non-capturing bracket */
04283         bravalue = OP_BRA;
04284         ptr++;
04285         break;
04286 
04287 
04288         /* ------------------------------------------------------------ */
04289         case '(':
04290         bravalue = OP_COND;       /* Conditional group */
04291 
04292         /* A condition can be an assertion, a number (referring to a numbered
04293         group), a name (referring to a named group), or 'R', referring to
04294         recursion. R<digits> and R&name are also permitted for recursion tests.
04295 
04296         There are several syntaxes for testing a named group: (?(name)) is used
04297         by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
04298 
04299         There are two unfortunate ambiguities, caused by history. (a) 'R' can
04300         be the recursive thing or the name 'R' (and similarly for 'R' followed
04301         by digits), and (b) a number could be a name that consists of digits.
04302         In both cases, we look for a name first; if not found, we try the other
04303         cases. */
04304 
04305         /* For conditions that are assertions, check the syntax, and then exit
04306         the switch. This will take control down to where bracketed groups,
04307         including assertions, are processed. */
04308 
04309         if (ptr[1] == '?' && (ptr[2] == '=' || ptr[2] == '!' || ptr[2] == '<'))
04310           break;
04311 
04312         /* Most other conditions use OP_CREF (a couple change to OP_RREF
04313         below), and all need to skip 3 bytes at the start of the group. */
04314 
04315         code[1+LINK_SIZE] = OP_CREF;
04316         skipbytes = 3;
04317         refsign = -1;
04318 
04319         /* Check for a test for recursion in a named group. */
04320 
04321         if (ptr[1] == 'R' && ptr[2] == '&')
04322           {
04323           terminator = -1;
04324           ptr += 2;
04325           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
04326           }
04327 
04328         /* Check for a test for a named group's having been set, using the Perl
04329         syntax (?(<name>) or (?('name') */
04330 
04331         else if (ptr[1] == '<')
04332           {
04333           terminator = '>';
04334           ptr++;
04335           }
04336         else if (ptr[1] == '\'')
04337           {
04338           terminator = '\'';
04339           ptr++;
04340           }
04341         else
04342           {
04343           terminator = 0;
04344           if (ptr[1] == '-' || ptr[1] == '+') refsign = *(++ptr);
04345           }
04346 
04347         /* We now expect to read a name; anything else is an error */
04348 
04349         if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
04350           {
04351           ptr += 1;  /* To get the right offset */
04352           *errorcodeptr = ERR28;
04353           goto FAILED;
04354           }
04355 
04356         /* Read the name, but also get it as a number if it's all digits */
04357 
04358         recno = 0;
04359         name = ++ptr;
04360         while ((cd->ctypes[*ptr] & ctype_word) != 0)
04361           {
04362           if (recno >= 0)
04363             recno = ((digitab[*ptr] & ctype_digit) != 0)?
04364               recno * 10 + *ptr - '0' : -1;
04365           ptr++;
04366           }
04367         namelen = ptr - name;
04368 
04369         if ((terminator > 0 && *ptr++ != terminator) || *ptr++ != ')')
04370           {
04371           ptr--;      /* Error offset */
04372           *errorcodeptr = ERR26;
04373           goto FAILED;
04374           }
04375 
04376         /* Do no further checking in the pre-compile phase. */
04377 
04378         if (lengthptr != NULL) break;
04379 
04380         /* In the real compile we do the work of looking for the actual
04381         reference. If the string started with "+" or "-" we require the rest to
04382         be digits, in which case recno will be set. */
04383 
04384         if (refsign > 0)
04385           {
04386           if (recno <= 0)
04387             {
04388             *errorcodeptr = ERR58;
04389             goto FAILED;
04390             }
04391           recno = (refsign == '-')?
04392             cd->bracount - recno + 1 : recno +cd->bracount;
04393           if (recno <= 0 || recno > cd->final_bracount)
04394             {
04395             *errorcodeptr = ERR15;
04396             goto FAILED;
04397             }
04398           PUT2(code, 2+LINK_SIZE, recno);
04399           break;
04400           }
04401 
04402         /* Otherwise (did not start with "+" or "-"), start by looking for the
04403         name. */
04404 
04405         slot = cd->name_table;
04406         for (i = 0; i < cd->names_found; i++)
04407           {
04408           if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
04409           slot += cd->name_entry_size;
04410           }
04411 
04412         /* Found a previous named subpattern */
04413 
04414         if (i < cd->names_found)
04415           {
04416           recno = GET2(slot, 0);
04417           PUT2(code, 2+LINK_SIZE, recno);
04418           }
04419 
04420         /* Search the pattern for a forward reference */
04421 
04422         else if ((i = find_parens(ptr, cd, name, namelen,
04423                         (options & PCRE_EXTENDED) != 0)) > 0)
04424           {
04425           PUT2(code, 2+LINK_SIZE, i);
04426           }
04427 
04428         /* If terminator == 0 it means that the name followed directly after
04429         the opening parenthesis [e.g. (?(abc)...] and in this case there are
04430         some further alternatives to try. For the cases where terminator != 0
04431         [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
04432         now checked all the possibilities, so give an error. */
04433 
04434         else if (terminator != 0)
04435           {
04436           *errorcodeptr = ERR15;
04437           goto FAILED;
04438           }
04439 
04440         /* Check for (?(R) for recursion. Allow digits after R to specify a
04441         specific group number. */
04442 
04443         else if (*name == 'R')
04444           {
04445           recno = 0;
04446           for (i = 1; i < namelen; i++)
04447             {
04448             if ((digitab[name[i]] & ctype_digit) == 0)
04449               {
04450               *errorcodeptr = ERR15;
04451               goto FAILED;
04452               }
04453             recno = recno * 10 + name[i] - '0';
04454             }
04455           if (recno == 0) recno = RREF_ANY;
04456           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
04457           PUT2(code, 2+LINK_SIZE, recno);
04458           }
04459 
04460         /* Similarly, check for the (?(DEFINE) "condition", which is always
04461         false. */
04462 
04463         else if (namelen == 6 && strncmp((char *)name, "DEFINE", 6) == 0)
04464           {
04465           code[1+LINK_SIZE] = OP_DEF;
04466           skipbytes = 1;
04467           }
04468 
04469         /* Check for the "name" actually being a subpattern number. We are
04470         in the second pass here, so final_bracount is set. */
04471 
04472         else if (recno > 0 && recno <= cd->final_bracount)
04473           {
04474           PUT2(code, 2+LINK_SIZE, recno);
04475           }
04476 
04477         /* Either an unidentified subpattern, or a reference to (?(0) */
04478 
04479         else
04480           {
04481           *errorcodeptr = (recno == 0)? ERR35: ERR15;
04482           goto FAILED;
04483           }
04484         break;
04485 
04486 
04487         /* ------------------------------------------------------------ */
04488         case '=':                 /* Positive lookahead */
04489         bravalue = OP_ASSERT;
04490         ptr++;
04491         break;
04492 
04493 
04494         /* ------------------------------------------------------------ */
04495         case '!':                 /* Negative lookahead */
04496         ptr++;
04497         if (*ptr == ')')          /* Optimize (?!) */
04498           {
04499           *code++ = OP_FAIL;
04500           previous = NULL;
04501           continue;
04502           }
04503         bravalue = OP_ASSERT_NOT;
04504         break;
04505 
04506 
04507         /* ------------------------------------------------------------ */
04508         case '<':                 /* Lookbehind or named define */
04509         switch (ptr[1])
04510           {
04511           case '=':               /* Positive lookbehind */
04512           bravalue = OP_ASSERTBACK;
04513           ptr += 2;
04514           break;
04515 
04516           case '!':               /* Negative lookbehind */
04517           bravalue = OP_ASSERTBACK_NOT;
04518           ptr += 2;
04519           break;
04520 
04521           default:                /* Could be name define, else bad */
04522           if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
04523           ptr++;                  /* Correct offset for error */
04524           *errorcodeptr = ERR24;
04525           goto FAILED;
04526           }
04527         break;
04528 
04529 
04530         /* ------------------------------------------------------------ */
04531         case '>':                 /* One-time brackets */
04532         bravalue = OP_ONCE;
04533         ptr++;
04534         break;
04535 
04536 
04537         /* ------------------------------------------------------------ */
04538         case 'C':                 /* Callout - may be followed by digits; */
04539         previous_callout = code;  /* Save for later completion */
04540         after_manual_callout = 1; /* Skip one item before completing */
04541         *code++ = OP_CALLOUT;
04542           {
04543           int n = 0;
04544           while ((digitab[*(++ptr)] & ctype_digit) != 0)
04545             n = n * 10 + *ptr - '0';
04546           if (*ptr != ')')
04547             {
04548             *errorcodeptr = ERR39;
04549             goto FAILED;
04550             }
04551           if (n > 255)
04552             {
04553             *errorcodeptr = ERR38;
04554             goto FAILED;
04555             }
04556           *code++ = n;
04557           PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
04558           PUT(code, LINK_SIZE, 0);                    /* Default length */
04559           code += 2 * LINK_SIZE;
04560           }
04561         previous = NULL;
04562         continue;
04563 
04564 
04565         /* ------------------------------------------------------------ */
04566         case 'P':                 /* Python-style named subpattern handling */
04567         if (*(++ptr) == '=' || *ptr == '>')  /* Reference or recursion */
04568           {
04569           is_recurse = *ptr == '>';
04570           terminator = ')';
04571           goto NAMED_REF_OR_RECURSE;
04572           }
04573         else if (*ptr != '<')    /* Test for Python-style definition */
04574           {
04575           *errorcodeptr = ERR41;
04576           goto FAILED;
04577           }
04578         /* Fall through to handle (?P< as (?< is handled */
04579 
04580 
04581         /* ------------------------------------------------------------ */
04582         DEFINE_NAME:    /* Come here from (?< handling */
04583         case '\'':
04584           {
04585           terminator = (*ptr == '<')? '>' : '\'';
04586           name = ++ptr;
04587 
04588           while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
04589           namelen = ptr - name;
04590 
04591           /* In the pre-compile phase, just do a syntax check. */
04592 
04593           if (lengthptr != NULL)
04594             {
04595             if (*ptr != terminator)
04596               {
04597               *errorcodeptr = ERR42;
04598               goto FAILED;
04599               }
04600             if (cd->names_found >= MAX_NAME_COUNT)
04601               {
04602               *errorcodeptr = ERR49;
04603               goto FAILED;
04604               }
04605             if (namelen + 3 > cd->name_entry_size)
04606               {
04607               cd->name_entry_size = namelen + 3;
04608               if (namelen > MAX_NAME_SIZE)
04609                 {
04610                 *errorcodeptr = ERR48;
04611                 goto FAILED;
04612                 }
04613               }
04614             }
04615 
04616           /* In the real compile, create the entry in the table */
04617 
04618           else
04619             {
04620             slot = cd->name_table;
04621             for (i = 0; i < cd->names_found; i++)
04622               {
04623               int crc = memcmp(name, slot+2, namelen);
04624               if (crc == 0)
04625                 {
04626                 if (slot[2+namelen] == 0)
04627                   {
04628                   if ((options & PCRE_DUPNAMES) == 0)
04629                     {
04630                     *errorcodeptr = ERR43;
04631                     goto FAILED;
04632                     }
04633                   }
04634                 else crc = -1;      /* Current name is substring */
04635                 }
04636               if (crc < 0)
04637                 {
04638                 memmove(slot + cd->name_entry_size, slot,
04639                   (cd->names_found - i) * cd->name_entry_size);
04640                 break;
04641                 }
04642               slot += cd->name_entry_size;
04643               }
04644 
04645             PUT2(slot, 0, cd->bracount + 1);
04646             memcpy(slot + 2, name, namelen);
04647             slot[2+namelen] = 0;
04648             }
04649           }
04650 
04651         /* In both cases, count the number of names we've encountered. */
04652 
04653         ptr++;                    /* Move past > or ' */
04654         cd->names_found++;
04655         goto NUMBERED_GROUP;
04656 
04657 
04658         /* ------------------------------------------------------------ */
04659         case '&':                 /* Perl recursion/subroutine syntax */
04660         terminator = ')';
04661         is_recurse = TRUE;
04662         /* Fall through */
04663 
04664         /* We come here from the Python syntax above that handles both
04665         references (?P=name) and recursion (?P>name), as well as falling
04666         through from the Perl recursion syntax (?&name). We also come here from
04667         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
04668         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
04669 
04670         NAMED_REF_OR_RECURSE:
04671         name = ++ptr;
04672         while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
04673         namelen = ptr - name;
04674 
04675         /* In the pre-compile phase, do a syntax check and set a dummy
04676         reference number. */
04677 
04678         if (lengthptr != NULL)
04679           {
04680           if (namelen == 0)
04681             {
04682             *errorcodeptr = ERR62;
04683             goto FAILED;
04684             }
04685           if (*ptr != terminator)
04686             {
04687             *errorcodeptr = ERR42;
04688             goto FAILED;
04689             }
04690           if (namelen > MAX_NAME_SIZE)
04691             {
04692             *errorcodeptr = ERR48;
04693             goto FAILED;
04694             }
04695           recno = 0;
04696           }
04697 
04698         /* In the real compile, seek the name in the table. We check the name
04699         first, and then check that we have reached the end of the name in the
04700         table. That way, if the name is longer than any in the table,
04701         the comparison will fail without reading beyond the table entry. */
04702 
04703         else
04704           {
04705           slot = cd->name_table;
04706           for (i = 0; i < cd->names_found; i++)
04707             {
04708             if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
04709                 slot[2+namelen] == 0)
04710               break;
04711             slot += cd->name_entry_size;
04712             }
04713 
04714           if (i < cd->names_found)         /* Back reference */
04715             {
04716             recno = GET2(slot, 0);
04717             }
04718           else if ((recno =                /* Forward back reference */
04719                     find_parens(ptr, cd, name, namelen,
04720                       (options & PCRE_EXTENDED) != 0)) <= 0)
04721             {
04722             *errorcodeptr = ERR15;
04723             goto FAILED;
04724             }
04725           }
04726 
04727         /* In both phases, we can now go to the code that handles numerical
04728         recursion or backreferences. */
04729 
04730         if (is_recurse) goto HANDLE_RECURSION;
04731           else goto HANDLE_REFERENCE;
04732 
04733 
04734         /* ------------------------------------------------------------ */
04735         case 'R':                 /* Recursion */
04736         ptr++;                    /* Same as (?0)      */
04737         /* Fall through */
04738 
04739 
04740         /* ------------------------------------------------------------ */
04741         case '-': case '+':
04742         case '0': case '1': case '2': case '3': case '4':   /* Recursion or */
04743         case '5': case '6': case '7': case '8': case '9':   /* subroutine */
04744           {
04745           const uschar *called;
04746           terminator = ')';
04747 
04748           /* Come here from the \g<...> and \g'...' code (Oniguruma
04749           compatibility). However, the syntax has been checked to ensure that
04750           the ... are a (signed) number, so that neither ERR63 nor ERR29 will
04751           be raised on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
04752           ever be taken. */
04753 
04754           HANDLE_NUMERICAL_RECURSION:
04755 
04756           if ((refsign = *ptr) == '+')
04757             {
04758             ptr++;
04759             if ((digitab[*ptr] & ctype_digit) == 0)
04760               {
04761               *errorcodeptr = ERR63;
04762               goto FAILED;
04763               }
04764             }
04765           else if (refsign == '-')
04766             {
04767             if ((digitab[ptr[1]] & ctype_digit) == 0)
04768               goto OTHER_CHAR_AFTER_QUERY;
04769             ptr++;
04770             }
04771 
04772           recno = 0;
04773           while((digitab[*ptr] & ctype_digit) != 0)
04774             recno = recno * 10 + *ptr++ - '0';
04775 
04776           if (*ptr != terminator)
04777             {
04778             *errorcodeptr = ERR29;
04779             goto FAILED;
04780             }
04781 
04782           if (refsign == '-')
04783             {
04784             if (recno == 0)
04785               {
04786               *errorcodeptr = ERR58;
04787               goto FAILED;
04788               }
04789             recno = cd->bracount - recno + 1;
04790             if (recno <= 0)
04791               {
04792               *errorcodeptr = ERR15;
04793               goto FAILED;
04794               }
04795             }
04796           else if (refsign == '+')
04797             {
04798             if (recno == 0)
04799               {
04800               *errorcodeptr = ERR58;
04801               goto FAILED;
04802               }
04803             recno += cd->bracount;
04804             }
04805 
04806           /* Come here from code above that handles a named recursion */
04807 
04808           HANDLE_RECURSION:
04809 
04810           previous = code;
04811           called = cd->start_code;
04812 
04813           /* When we are actually compiling, find the bracket that is being
04814           referenced. Temporarily end the regex in case it doesn't exist before
04815           this point. If we end up with a forward reference, first check that
04816           the bracket does occur later so we can give the error (and position)
04817           now. Then remember this forward reference in the workspace so it can
04818           be filled in at the end. */
04819 
04820           if (lengthptr == NULL)
04821             {
04822             *code = OP_END;
04823             if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
04824 
04825             /* Forward reference */
04826 
04827             if (called == NULL)
04828               {
04829               if (find_parens(ptr, cd, NULL, recno,
04830                     (options & PCRE_EXTENDED) != 0) < 0)
04831                 {
04832                 *errorcodeptr = ERR15;
04833                 goto FAILED;
04834                 }
04835               called = cd->start_code + recno;
04836               PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
04837               }
04838 
04839             /* If not a forward reference, and the subpattern is still open,
04840             this is a recursive call. We check to see if this is a left
04841             recursion that could loop for ever, and diagnose that case. */
04842 
04843             else if (GET(called, 1) == 0 &&
04844                      could_be_empty(called, code, bcptr, utf8))
04845               {
04846               *errorcodeptr = ERR40;
04847               goto FAILED;
04848               }
04849             }
04850 
04851           /* Insert the recursion/subroutine item, automatically wrapped inside
04852           "once" brackets. Set up a "previous group" length so that a
04853           subsequent quantifier will work. */
04854 
04855           *code = OP_ONCE;
04856           PUT(code, 1, 2 + 2*LINK_SIZE);
04857           code += 1 + LINK_SIZE;
04858 
04859           *code = OP_RECURSE;
04860           PUT(code, 1, called - cd->start_code);
04861           code += 1 + LINK_SIZE;
04862 
04863           *code = OP_KET;
04864           PUT(code, 1, 2 + 2*LINK_SIZE);
04865           code += 1 + LINK_SIZE;
04866 
04867           length_prevgroup = 3 + 3*LINK_SIZE;
04868           }
04869 
04870         /* Can't determine a first byte now */
04871 
04872         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
04873         continue;
04874 
04875 
04876         /* ------------------------------------------------------------ */
04877         default:              /* Other characters: check option setting */
04878         OTHER_CHAR_AFTER_QUERY:
04879         set = unset = 0;
04880         optset = &set;
04881 
04882         while (*ptr != ')' && *ptr != ':')
04883           {
04884           switch (*ptr++)
04885             {
04886             case '-': optset = &unset; break;
04887 
04888             case 'J':    /* Record that it changed in the external options */
04889             *optset |= PCRE_DUPNAMES;
04890             cd->external_flags |= PCRE_JCHANGED;
04891             break;
04892 
04893             case 'i': *optset |= PCRE_CASELESS; break;
04894             case 'm': *optset |= PCRE_MULTILINE; break;
04895             case 's': *optset |= PCRE_DOTALL; break;
04896             case 'x': *optset |= PCRE_EXTENDED; break;
04897             case 'U': *optset |= PCRE_UNGREEDY; break;
04898             case 'X': *optset |= PCRE_EXTRA; break;
04899 
04900             default:  *errorcodeptr = ERR12;
04901                       ptr--;    /* Correct the offset */
04902                       goto FAILED;
04903             }
04904           }
04905 
04906         /* Set up the changed option bits, but don't change anything yet. */
04907 
04908         newoptions = (options | set) & (~unset);
04909 
04910         /* If the options ended with ')' this is not the start of a nested
04911         group with option changes, so the options change at this level. If this
04912         item is right at the start of the pattern, the options can be
04913         abstracted and made external in the pre-compile phase, and ignored in
04914         the compile phase. This can be helpful when matching -- for instance in
04915         caseless checking of required bytes.
04916 
04917         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
04918         definitely *not* at the start of the pattern because something has been
04919         compiled. In the pre-compile phase, however, the code pointer can have
04920         that value after the start, because it gets reset as code is discarded
04921         during the pre-compile. However, this can happen only at top level - if
04922         we are within parentheses, the starting BRA will still be present. At
04923         any parenthesis level, the length value can be used to test if anything
04924         has been compiled at that level. Thus, a test for both these conditions
04925         is necessary to ensure we correctly detect the start of the pattern in
04926         both phases.
04927 
04928         If we are not at the pattern start, compile code to change the ims
04929         options if this setting actually changes any of them, and reset the
04930         greedy defaults and the case value for firstbyte and reqbyte. */
04931 
04932         if (*ptr == ')')
04933           {
04934           if (code == cd->start_code + 1 + LINK_SIZE &&
04935                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
04936             {
04937             cd->external_options = newoptions;
04938             }
04939          else
04940             {
04941             if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
04942               {
04943               *code++ = OP_OPT;
04944               *code++ = newoptions & PCRE_IMS;
04945               }
04946             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
04947             greedy_non_default = greedy_default ^ 1;
04948             req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
04949             }
04950 
04951           /* Change options at this level, and pass them back for use
04952           in subsequent branches. When not at the start of the pattern, this
04953           information is also necessary so that a resetting item can be
04954           compiled at the end of a group (if we are in a group). */
04955 
04956           *optionsptr = options = newoptions;
04957           previous = NULL;       /* This item can't be repeated */
04958           continue;              /* It is complete */
04959           }
04960 
04961         /* If the options ended with ':' we are heading into a nested group
04962         with possible change of options. Such groups are non-capturing and are
04963         not assertions of any kind. All we need to do is skip over the ':';
04964         the newoptions value is handled below. */
04965 
04966         bravalue = OP_BRA;
04967         ptr++;
04968         }     /* End of switch for character following (? */
04969       }       /* End of (? handling */
04970 
04971     /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
04972     all unadorned brackets become non-capturing and behave like (?:...)
04973     brackets. */
04974 
04975     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
04976       {
04977       bravalue = OP_BRA;
04978       }
04979 
04980     /* Else we have a capturing group. */
04981 
04982     else
04983       {
04984       NUMBERED_GROUP:
04985       cd->bracount += 1;
04986       PUT2(code, 1+LINK_SIZE, cd->bracount);
04987       skipbytes = 2;
04988       }
04989 
04990     /* Process nested bracketed regex. Assertions may not be repeated, but
04991     other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
04992     non-register variable in order to be able to pass its address because some
04993     compilers complain otherwise. Pass in a new setting for the ims options if
04994     they have changed. */
04995 
04996     previous = (bravalue >= OP_ONCE)? code : NULL;
04997     *code = bravalue;
04998     tempcode = code;
04999     tempreqvary = cd->req_varyopt;     /* Save value before bracket */
05000     length_prevgroup = 0;              /* Initialize for pre-compile phase */
05001 
05002     if (!compile_regex(
05003          newoptions,                   /* The complete new option state */
05004          options & PCRE_IMS,           /* The previous ims option state */
05005          &tempcode,                    /* Where to put code (updated) */
05006          &ptr,                         /* Input pointer (updated) */
05007          errorcodeptr,                 /* Where to put an error message */
05008          (bravalue == OP_ASSERTBACK ||
05009           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
05010          reset_bracount,               /* True if (?| group */
05011          skipbytes,                    /* Skip over bracket number */
05012          &subfirstbyte,                /* For possible first char */
05013          &subreqbyte,                  /* For possible last char */
05014          bcptr,                        /* Current branch chain */
05015          cd,                           /* Tables block */
05016          (lengthptr == NULL)? NULL :   /* Actual compile phase */
05017            &length_prevgroup           /* Pre-compile phase */
05018          ))
05019       goto FAILED;
05020 
05021     /* At the end of compiling, code is still pointing to the start of the
05022     group, while tempcode has been updated to point past the end of the group
05023     and any option resetting that may follow it. The pattern pointer (ptr)
05024     is on the bracket. */
05025 
05026     /* If this is a conditional bracket, check that there are no more than
05027     two branches in the group, or just one if it's a DEFINE group. We do this
05028     in the real compile phase, not in the pre-pass, where the whole group may
05029     not be available. */
05030 
05031     if (bravalue == OP_COND && lengthptr == NULL)
05032       {
05033       uschar *tc = code;
05034       int condcount = 0;
05035 
05036       do {
05037          condcount++;
05038          tc += GET(tc,1);
05039          }
05040       while (*tc != OP_KET);
05041 
05042       /* A DEFINE group is never obeyed inline (the "condition" is always
05043       false). It must have only one branch. */
05044 
05045       if (code[LINK_SIZE+1] == OP_DEF)
05046         {
05047         if (condcount > 1)
05048           {
05049           *errorcodeptr = ERR54;
05050           goto FAILED;
05051           }
05052         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
05053         }
05054 
05055       /* A "normal" conditional group. If there is just one branch, we must not
05056       make use of its firstbyte or reqbyte, because this is equivalent to an
05057       empty second branch. */
05058 
05059       else
05060         {
05061         if (condcount > 2)
05062           {
05063           *errorcodeptr = ERR27;
05064           goto FAILED;
05065           }
05066         if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
05067         }
05068       }
05069 
05070     /* Error if hit end of pattern */
05071 
05072     if (*ptr != ')')
05073       {
05074       *errorcodeptr = ERR14;
05075       goto FAILED;
05076       }
05077 
05078     /* In the pre-compile phase, update the length by the length of the group,
05079     less the brackets at either end. Then reduce the compiled code to just a
05080     set of non-capturing brackets so that it doesn't use much memory if it is
05081     duplicated by a quantifier. */
05082 
05083     if (lengthptr != NULL)
05084       {
05085       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
05086         {
05087         *errorcodeptr = ERR20;
05088         goto FAILED;
05089         }
05090       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
05091       *code++ = OP_BRA;
05092       PUTINC(code, 0, 1 + LINK_SIZE);
05093       *code++ = OP_KET;
05094       PUTINC(code, 0, 1 + LINK_SIZE);
05095       break;    /* No need to waste time with special character handling */
05096       }
05097 
05098     /* Otherwise update the main code pointer to the end of the group. */
05099 
05100     code = tempcode;
05101 
05102     /* For a DEFINE group, required and first character settings are not
05103     relevant. */
05104 
05105     if (bravalue == OP_DEF) break;
05106 
05107     /* Handle updating of the required and first characters for other types of
05108     group. Update for normal brackets of all kinds, and conditions with two
05109     branches (see code above). If the bracket is followed by a quantifier with
05110     zero repeat, we have to back off. Hence the definition of zeroreqbyte and
05111     zerofirstbyte outside the main loop so that they can be accessed for the
05112     back off. */
05113 
05114     zeroreqbyte = reqbyte;
05115     zerofirstbyte = firstbyte;
05116     groupsetfirstbyte = FALSE;
05117 
05118     if (bravalue >= OP_ONCE)
05119       {
05120       /* If we have not yet set a firstbyte in this branch, take it from the
05121       subpattern, remembering that it was set here so that a repeat of more
05122       than one can replicate it as reqbyte if necessary. If the subpattern has
05123       no firstbyte, set "none" for the whole branch. In both cases, a zero
05124       repeat forces firstbyte to "none". */
05125 
05126       if (firstbyte == REQ_UNSET)
05127         {
05128         if (subfirstbyte >= 0)
05129           {
05130           firstbyte = subfirstbyte;
05131           groupsetfirstbyte = TRUE;
05132           }
05133         else firstbyte = REQ_NONE;
05134         zerofirstbyte = REQ_NONE;
05135         }
05136 
05137       /* If firstbyte was previously set, convert the subpattern's firstbyte
05138       into reqbyte if there wasn't one, using the vary flag that was in
05139       existence beforehand. */
05140 
05141       else if (subfirstbyte >= 0 && subreqbyte < 0)
05142         subreqbyte = subfirstbyte | tempreqvary;
05143 
05144       /* If the subpattern set a required byte (or set a first byte that isn't
05145       really the first byte - see above), set it. */
05146 
05147       if (subreqbyte >= 0) reqbyte = subreqbyte;
05148       }
05149 
05150     /* For a forward assertion, we take the reqbyte, if set. This can be
05151     helpful if the pattern that follows the assertion doesn't set a different
05152     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
05153     for an assertion, however because it leads to incorrect effect for patterns
05154     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
05155     of a firstbyte. This is overcome by a scan at the end if there's no
05156     firstbyte, looking for an asserted first char. */
05157 
05158     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
05159     break;     /* End of processing '(' */
05160 
05161 
05162     /* ===================================================================*/
05163     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
05164     are arranged to be the negation of the corresponding OP_values. For the
05165     back references, the values are ESC_REF plus the reference number. Only
05166     back references and those types that consume a character may be repeated.
05167     We can test for values between ESC_b and ESC_Z for the latter; this may
05168     have to change if any new ones are ever created. */
05169 
05170     case '\\':
05171     tempptr = ptr;
05172     c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
05173     if (*errorcodeptr != 0) goto FAILED;
05174 
05175     if (c < 0)
05176       {
05177       if (-c == ESC_Q)            /* Handle start of quoted string */
05178         {
05179         if (ptr[1] == '\\' && ptr[2] == 'E') ptr += 2; /* avoid empty string */
05180           else inescq = TRUE;
05181         continue;
05182         }
05183 
05184       if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
05185 
05186       /* For metasequences that actually match a character, we disable the
05187       setting of a first character if it hasn't already been set. */
05188 
05189       if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
05190         firstbyte = REQ_NONE;
05191 
05192       /* Set values to reset to if this is followed by a zero repeat. */
05193 
05194       zerofirstbyte = firstbyte;
05195       zeroreqbyte = reqbyte;
05196 
05197       /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
05198       is a subroutine call by number (Oniguruma syntax). In fact, the value
05199       -ESC_g is returned only for these cases. So we don't need to check for <
05200       or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
05201       -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
05202       that is a synonym for a named back reference). */
05203 
05204       if (-c == ESC_g)
05205         {
05206         const uschar *p;
05207         save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
05208         terminator = (*(++ptr) == '<')? '>' : '\'';
05209 
05210         /* These two statements stop the compiler from warning about possibly
05211         unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
05212         fact, because we actually check for a number below, the paths that
05213         would actually be in error are never taken. */
05214 
05215         skipbytes = 0;
05216         reset_bracount = FALSE;
05217 
05218         /* Test for a name */
05219 
05220         if (ptr[1] != '+' && ptr[1] != '-')
05221           {
05222           BOOL isnumber = TRUE;
05223           for (p = ptr + 1; *p != 0 && *p != terminator; p++)
05224             {
05225             if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
05226             if ((cd->ctypes[*p] & ctype_word) == 0) break;
05227             }
05228           if (*p != terminator)
05229             {
05230             *errorcodeptr = ERR57;
05231             break;
05232             }
05233           if (isnumber)
05234             {
05235             ptr++;
05236             goto HANDLE_NUMERICAL_RECURSION;
05237             }
05238           is_recurse = TRUE;
05239           goto NAMED_REF_OR_RECURSE;
05240           }
05241 
05242         /* Test a signed number in angle brackets or quotes. */
05243 
05244         p = ptr + 2;
05245         while ((digitab[*p] & ctype_digit) != 0) p++;
05246         if (*p != terminator)
05247           {
05248           *errorcodeptr = ERR57;
05249           break;
05250           }
05251         ptr++;
05252         goto HANDLE_NUMERICAL_RECURSION;
05253         }
05254 
05255       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
05256       We also support \k{name} (.NET syntax) */
05257 
05258       if (-c == ESC_k && (ptr[1] == '<' || ptr[1] == '\'' || ptr[1] == '{'))
05259         {
05260         is_recurse = FALSE;
05261         terminator = (*(++ptr) == '<')? '>' : (*ptr == '\'')? '\'' : '}';
05262         goto NAMED_REF_OR_RECURSE;
05263         }
05264 
05265       /* Back references are handled specially; must disable firstbyte if
05266       not set to cope with cases like (?=(\w+))\1: which would otherwise set
05267       ':' later. */
05268 
05269       if (-c >= ESC_REF)
05270         {
05271         recno = -c - ESC_REF;
05272 
05273         HANDLE_REFERENCE:    /* Come here from named backref handling */
05274         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
05275         previous = code;
05276         *code++ = OP_REF;
05277         PUT2INC(code, 0, recno);
05278         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
05279         if (recno > cd->top_backref) cd->top_backref = recno;
05280         }
05281 
05282       /* So are Unicode property matches, if supported. */
05283 
05284 #ifdef SUPPORT_UCP
05285       else if (-c == ESC_P || -c == ESC_p)
05286         {
05287         BOOL negated;
05288         int pdata;
05289         int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
05290         if (ptype < 0) goto FAILED;
05291         previous = code;
05292         *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
05293         *code++ = ptype;
05294         *code++ = pdata;
05295         }
05296 #else
05297 
05298       /* If Unicode properties are not supported, \X, \P, and \p are not
05299       allowed. */
05300 
05301       else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
05302         {
05303         *errorcodeptr = ERR45;
05304         goto FAILED;
05305         }
05306 #endif
05307 
05308       /* For the rest (including \X when Unicode properties are supported), we
05309       can obtain the OP value by negating the escape value. */
05310 
05311       else
05312         {
05313         previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
05314         *code++ = -c;
05315         }
05316       continue;
05317       }
05318 
05319     /* We have a data character whose value is in c. In UTF-8 mode it may have
05320     a value > 127. We set its representation in the length/buffer, and then
05321     handle it as a data character. */
05322 
05323 #ifdef SUPPORT_UTF8
05324     if (utf8 && c > 127)
05325       mclength = _pcre_ord2utf8(c, mcbuffer);
05326     else
05327 #endif
05328 
05329      {
05330      mcbuffer[0] = c;
05331      mclength = 1;
05332      }
05333     goto ONE_CHAR;
05334 
05335 
05336     /* ===================================================================*/
05337     /* Handle a literal character. It is guaranteed not to be whitespace or #
05338     when the extended flag is set. If we are in UTF-8 mode, it may be a
05339     multi-byte literal character. */
05340 
05341     default:
05342     NORMAL_CHAR:
05343     mclength = 1;
05344     mcbuffer[0] = c;
05345 
05346 #ifdef SUPPORT_UTF8
05347     if (utf8 && c >= 0xc0)
05348       {
05349       while ((ptr[1] & 0xc0) == 0x80)
05350         mcbuffer[mclength++] = *(++ptr);
05351       }
05352 #endif
05353 
05354     /* At this point we have the character's bytes in mcbuffer, and the length
05355     in mclength. When not in UTF-8 mode, the length is always 1. */
05356 
05357     ONE_CHAR:
05358     previous = code;
05359     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
05360     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
05361 
05362     /* Remember if \r or \n were seen */
05363 
05364     if (mcbuffer[0] == '\r' || mcbuffer[0] == '\n')
05365       cd->external_flags |= PCRE_HASCRORLF;
05366 
05367     /* Set the first and required bytes appropriately. If no previous first
05368     byte, set it from this character, but revert to none on a zero repeat.
05369     Otherwise, leave the firstbyte value alone, and don't change it on a zero
05370     repeat. */
05371 
05372     if (firstbyte == REQ_UNSET)
05373       {
05374       zerofirstbyte = REQ_NONE;
05375       zeroreqbyte = reqbyte;
05376 
05377       /* If the character is more than one byte long, we can set firstbyte
05378       only if it is not to be matched caselessly. */
05379 
05380       if (mclength == 1 || req_caseopt == 0)
05381         {
05382         firstbyte = mcbuffer[0] | req_caseopt;
05383         if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
05384         }
05385       else firstbyte = reqbyte = REQ_NONE;
05386       }
05387 
05388     /* firstbyte was previously set; we can set reqbyte only if the length is
05389     1 or the matching is caseful. */
05390 
05391     else
05392       {
05393       zerofirstbyte = firstbyte;
05394       zeroreqbyte = reqbyte;
05395       if (mclength == 1 || req_caseopt == 0)
05396         reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
05397       }
05398 
05399     break;            /* End of literal character handling */
05400     }
05401   }                   /* end of big loop */
05402 
05403 
05404 /* Control never reaches here by falling through, only by a goto for all the
05405 error states. Pass back the position in the pattern so that it can be displayed
05406 to the user for diagnosing the error. */
05407 
05408 FAILED:
05409 *ptrptr = ptr;
05410 return FALSE;
05411 }
05412 
05413 
05414 
05415 
05416 /*************************************************
05417 *     Compile sequence of alternatives           *
05418 *************************************************/
05419 
05420 /* On entry, ptr is pointing past the bracket character. On a successful
05421 return it points to whatever ended the final branch: the closing bracket or
05422 the end of the pattern, never a vertical bar. The code variable is pointing at
05423 the byte into which the BRA operator has been stored. Whenever the ims options
05424 differ from oldims at the start of a branch (for a (?ims: group, or after a
05425 change during an earlier branch), an OP_OPT item is inserted there to ensure
05426 they get set correctly at run time; &options is passed to every branch compile.
05427 
05428 This function is used during the pre-compile phase when we are trying to find
05429 out the amount of memory needed, as well as during the real compile phase. The
05430 value of lengthptr distinguishes the two phases.
05431 
05432 Arguments:
05433   options        option bits, including any changes for this subpattern
05434   oldims         previous settings of ims option bits
05435   codeptr        -> the address of the current code pointer (updated on success)
05436   ptrptr         -> the address of the current pattern pointer (always updated)
05437   errorcodeptr   -> pointer to error code variable
05438   lookbehind     TRUE if this is a lookbehind assertion
05439   reset_bracount TRUE to reset the count for each branch
05440   skipbytes      skip this many bytes at start (for brackets and OP_COND)
05441   firstbyteptr   place to put the first required character, or a negative number
05442   reqbyteptr     place to put the last required character, or a negative number
05443   bcptr          pointer to the chain of currently open branches
05444   cd             points to the data block with tables pointers etc.
05445   lengthptr      NULL during the real compile phase
05446                  points to length accumulator during pre-compile phase
05447 
05448 Returns:         TRUE on success; FALSE on error, with *ptrptr set to the
05449                  pattern position reached when the error was detected */
05450 
05451 static BOOL
05452 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
05453   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
05454   int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
05455   int *lengthptr)
05456 {
05457 const uschar *ptr = *ptrptr;
05458 uschar *code = *codeptr;
05459 uschar *last_branch = code;      /* BRA or ALT that starts the latest branch */
05460 uschar *start_bracket = code;    /* the BRA; the KET offset is measured from it */
05461 uschar *reverse_count = NULL;    /* operand of the latest OP_REVERSE, if any */
05462 int firstbyte, reqbyte;              /* settings for the group as a whole */
05463 int branchfirstbyte, branchreqbyte;  /* settings from the latest branch */
05464 int length;                          /* local length total for pre-compile */
05465 int orig_bracount;                   /* cd->bracount on entry */
05466 int max_bracount;                    /* highest cd->bracount after any branch */
05467 branch_chain bc;
05468 
05469 bc.outer = bcptr;        /* Add this group to the chain of open branches */
05470 bc.current = code;
05471 
05472 firstbyte = reqbyte = REQ_UNSET;
05473 
05474 /* Accumulate the length for use in the pre-compile phase. Start with the
05475 length of the BRA and KET and any extra bytes that are required at the
05476 beginning. We accumulate in a local variable to save frequent testing of
05477 lengthptr for NULL. We cannot do this by looking at the value of code at the
05478 start and end of each alternative, because compiled items are discarded during
05479 the pre-compile phase so that the work space is not exceeded. */
05480 
05481 length = 2 + 2*LINK_SIZE + skipbytes;
05482 
05483 /* WARNING: If the above line is changed for any reason, you must also change
05484 the code that abstracts option settings at the start of the pattern and makes
05485 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
05486 pre-compile phase to find out whether anything has yet been compiled or not. */
05487 
05488 /* Offset is set zero to mark that this bracket is still open */
05489 
05490 PUT(code, 1, 0);
05491 code += 1 + LINK_SIZE + skipbytes;
05492 
05493 /* Loop for each alternative branch */
05494 
05495 orig_bracount = max_bracount = cd->bracount;
05496 for (;;)
05497   {
05498   /* For a (?| group, reset the capturing bracket count so that each branch
05499   uses the same numbers. */
05500 
05501   if (reset_bracount) cd->bracount = orig_bracount;
05502 
05503   /* Handle a change of ims options at the start of the branch */
05504 
05505   if ((options & PCRE_IMS) != oldims)
05506     {
05507     *code++ = OP_OPT;
05508     *code++ = options & PCRE_IMS;
05509     length += 2;
05510     }
05511 
05512   /* Set up dummy OP_REVERSE if lookbehind assertion */
05513 
05514   if (lookbehind)
05515     {
05516     *code++ = OP_REVERSE;
05517     reverse_count = code;      /* Remember where the real length must go */
05518     PUTINC(code, 0, 0);
05519     length += 1 + LINK_SIZE;
05520     }
05521 
05522   /* Now compile the branch; in the pre-compile phase its length gets added
05523   into the length. */
05524 
05525   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
05526         &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
05527     {
05528     *ptrptr = ptr;
05529     return FALSE;
05530     }
05531 
05532   /* Keep the highest bracket count in case (?| was used and some branch
05533   has fewer than the rest. */
05534 
05535   if (cd->bracount > max_bracount) max_bracount = cd->bracount;
05536 
05537   /* In the real compile phase, there is some post-processing to be done. */
05538 
05539   if (lengthptr == NULL)
05540     {
05541     /* If this is the first branch, the firstbyte and reqbyte values for the
05542     branch become the values for the regex. */
05543 
05544     if (*last_branch != OP_ALT)
05545       {
05546       firstbyte = branchfirstbyte;
05547       reqbyte = branchreqbyte;
05548       }
05549 
05550     /* If this is not the first branch, the first char and reqbyte have to
05551     match the values from all the previous branches, except that if the
05552     previous value for reqbyte didn't have REQ_VARY set, it can still match,
05553     and we set REQ_VARY for the regex. */
05554 
05555     else
05556       {
05557       /* If we previously had a firstbyte, but it doesn't match the new branch,
05558       we have to abandon the firstbyte for the regex, but if there was
05559       previously no reqbyte, it takes on the value of the old firstbyte. */
05560 
05561       if (firstbyte >= 0 && firstbyte != branchfirstbyte)
05562         {
05563         if (reqbyte < 0) reqbyte = firstbyte;
05564         firstbyte = REQ_NONE;
05565         }
05566 
05567       /* If we (now or from before) have no firstbyte, a firstbyte from the
05568       branch becomes a reqbyte if there isn't a branch reqbyte. */
05569 
05570       if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
05571           branchreqbyte = branchfirstbyte;
05572 
05573       /* Now ensure that the reqbytes match */
05574 
05575       if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
05576         reqbyte = REQ_NONE;
05577       else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
05578       }
05579 
05580     /* If lookbehind, check that this branch matches a fixed-length string, and
05581     put the length into the OP_REVERSE item. Temporarily mark the end of the
05582     branch with OP_END. */
05583 
05584     if (lookbehind)
05585       {
05586       int fixed_length;
05587       *code = OP_END;
05588       fixed_length = find_fixedlength(last_branch, options);
05589       DPRINTF(("fixed length = %d\n", fixed_length));
05590       if (fixed_length < 0)
05591         {
05592         *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
05593         *ptrptr = ptr;
05594         return FALSE;
05595         }
05596       PUT(reverse_count, 0, fixed_length);   /* Replace the dummy zero */
05597       }
05598     }
05599 
05600   /* Reached end of expression, either ')' or end of pattern. In the real
05601   compile phase, go back through the alternative branches and reverse the chain
05602   of offsets, with the field in the BRA item now becoming an offset to the
05603   first alternative. If there are no alternatives, it points to the end of the
05604   group. The length in the terminating ket is always the length of the whole
05605   bracketed item. If any of the ims options were changed inside the group,
05606   compile a resetting op-code following, except at the very end of the pattern.
05607   Return leaving the pointer at the terminating char. */
05608 
05609   if (*ptr != '|')
05610     {
05611     if (lengthptr == NULL)
05612       {
05613       int branch_length = code - last_branch;
05614       do
05615         {
05616         int prev_length = GET(last_branch, 1);   /* Link back to previous branch */
05617         PUT(last_branch, 1, branch_length);      /* Now a forward offset */
05618         branch_length = prev_length;
05619         last_branch -= branch_length;
05620         }
05621       while (branch_length > 0);   /* The BRA's link was zero, ending the walk */
05622       }
05623 
05624     /* Fill in the ket */
05625 
05626     *code = OP_KET;
05627     PUT(code, 1, code - start_bracket);
05628     code += 1 + LINK_SIZE;
05629 
05630     /* Resetting option if needed */
05631 
05632     if ((options & PCRE_IMS) != oldims && *ptr == ')')
05633       {
05634       *code++ = OP_OPT;
05635       *code++ = oldims;
05636       length += 2;
05637       }
05638 
05639     /* Retain the highest bracket number, in case resetting was used. */
05640 
05641     cd->bracount = max_bracount;
05642 
05643     /* Set values to pass back */
05644 
05645     *codeptr = code;
05646     *ptrptr = ptr;
05647     *firstbyteptr = firstbyte;
05648     *reqbyteptr = reqbyte;
05649     if (lengthptr != NULL)
05650       {
05651       if (OFLOW_MAX - *lengthptr < length)   /* Would the sum pass OFLOW_MAX? */
05652         {
05653         *errorcodeptr = ERR20;
05654         return FALSE;
05655         }
05656       *lengthptr += length;
05657       }
05658     return TRUE;
05659     }
05660 
05661   /* Another branch follows. In the pre-compile phase, we can move the code
05662   pointer back to where it was for the start of the first branch. (That is,
05663   pretend that each branch is the only one.)
05664 
05665   In the real compile phase, insert an ALT node. Its length field points back
05666   to the previous branch while the bracket remains open. At the end the chain
05667   is reversed. It's done like this so that the start of the bracket has a
05668   zero offset until it is closed, making it possible to detect recursion. */
05669 
05670   if (lengthptr != NULL)
05671     {
05672     code = *codeptr + 1 + LINK_SIZE + skipbytes;
05673     length += 1 + LINK_SIZE;     /* Still count the ALT that will be needed */
05674     }
05675   else
05676     {
05677     *code = OP_ALT;
05678     PUT(code, 1, code - last_branch);
05679     bc.current = last_branch = code;
05680     code += 1 + LINK_SIZE;
05681     }
05682 
05683   ptr++;                         /* Step past the vertical bar */
05684   }
05685 /* Control never reaches here */
05686 }
05687 
05688 
05689 
05690 
05691 /*************************************************
05692 *          Check for anchored expression         *
05693 *************************************************/
05694 
05695 /* Try to find out if this is an anchored regular expression. Consider each
05696 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
05697 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
05698 it's anchored. However, if this is a multiline pattern, then only OP_SOD
05699 counts, since OP_CIRC can match in the middle.
05700 
05701 We can also consider a regex to be anchored if OP_SOM starts all its branches.
05702 This is the code for \G, which means "match at start of match position, taking
05703 into account the match offset".
05704 
05705 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
05706 because that will try the rest of the pattern at all possible matching points,
05707 so there is no point trying again.... er ....
05708 
05709 .... except when the .* appears inside capturing parentheses, and there is a
05710 subsequent back reference to those parentheses. We haven't enough information
05711 to catch that case precisely.
05712 
05713 At first, the best we could do was to detect when .* was in capturing brackets
05714 and the highest back reference was greater than or equal to that level.
05715 However, by keeping a bitmap of the first 31 back references, we can catch some
05716 of the more common cases more precisely.
05717 
05718 Arguments:
05719   code           points to start of expression (the bracket)
05720   options        points to the options setting; PCRE_MULTILINE is tested here
05721   bracket_map    a bitmap of which brackets we are inside while testing; this
05722                   handles up to substring 31; higher-numbered brackets all
05723                   share bit 0, which is the less precise approach
05724   backref_map    the back reference bitmap
05725 
05726 Returns:     TRUE if every alternative is anchored, otherwise FALSE
05727 */
05728 
05729 static BOOL
05730 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
05731   unsigned int backref_map)
05732 {
05733 do {
05734    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
05735      options, PCRE_MULTILINE, FALSE);
05736    register int op = *scode;
05737 
05738    /* Non-capturing brackets */
05739 
05740    if (op == OP_BRA)
05741      {
05742      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
05743      }
05744 
05745    /* Capturing brackets. The bit is built with unsigned arithmetic because
05746    shifting a signed 1 into bit 31 is undefined behaviour in C. */
05747    else if (op == OP_CBRA)
05748      {
05749      int n = GET2(scode, 1+LINK_SIZE);
05750      unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1u);
05751      if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
05752      }
05753 
05754    /* Other brackets */
05755 
05756    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
05757      {
05758      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
05759      }
05760 
05761    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
05762    it isn't in brackets that are or may be referenced. */
05763 
05764    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
05765              op == OP_TYPEPOSSTAR))
05766      {
05767      if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
05768        return FALSE;
05769      }
05770 
05771    /* Check for explicit anchoring */
05772 
05773    else if (op != OP_SOD && op != OP_SOM &&
05774            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
05775      return FALSE;
05776    code += GET(code, 1);      /* Follow the link to the next ALT or the KET */
05777    }
05778 while (*code == OP_ALT);   /* Loop for each alternative */
05779 return TRUE;
05780 }
05781 
05782 
05783 
05784 /*************************************************
05785 *         Check for starting with ^ or .*        *
05786 *************************************************/
05787 
05788 /* This is called to find out if every branch starts with ^ or .* so that
05789 "first char" processing can be done to speed things up in multiline
05790 matching and for non-DOTALL patterns that start with .* (which must start at
05791 the beginning or after \n). As in the case of is_anchored() (see above), we
05792 have to take account of back references to capturing brackets that contain .*
05793 because in that case we can't make the assumption.
05794 
05795 Arguments:
05796   code           points to start of expression (the bracket)
05797   bracket_map    a bitmap of which brackets we are inside while testing; this
05798                   handles up to substring 31; higher-numbered brackets all
05799                   share bit 0, which is the less precise approach
05800   backref_map    the back reference bitmap
05801 
05802 Returns:         TRUE if every alternative starts with ^ or .*, else FALSE
05803 */
05804 
05805 static BOOL
05806 is_startline(const uschar *code, unsigned int bracket_map,
05807   unsigned int backref_map)
05808 {
05809 do {
05810    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
05811      NULL, 0, FALSE);
05812    register int op = *scode;
05813 
05814    /* Non-capturing brackets */
05815 
05816    if (op == OP_BRA)
05817      {
05818      if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
05819      }
05820 
05821    /* Capturing brackets. The bit is built with unsigned arithmetic because
05822    shifting a signed 1 into bit 31 is undefined behaviour in C. */
05823    else if (op == OP_CBRA)
05824      {
05825      int n = GET2(scode, 1+LINK_SIZE);
05826      unsigned int new_map = bracket_map | ((n < 32)? (1u << n) : 1u);
05827      if (!is_startline(scode, new_map, backref_map)) return FALSE;
05828      }
05829 
05830    /* Other brackets */
05831 
05832    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
05833      { if (!is_startline(scode, bracket_map, backref_map)) return FALSE; }
05834 
05835    /* .* means "start at start or after \n" if it isn't in brackets that
05836    may be referenced. */
05837 
05838    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
05839      {
05840      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
05841      }
05842 
05843    /* Check for explicit circumflex */
05844 
05845    else if (op != OP_CIRC) return FALSE;
05846 
05847    /* Move on to the next alternative */
05848 
05849    code += GET(code, 1);
05850    }
05851 while (*code == OP_ALT);  /* Loop for each alternative */
05852 return TRUE;
05853 }
05854 
05855 
05856 
05857 /*************************************************
05858 *       Check for asserted fixed first char      *
05859 *************************************************/
05860 
05861 /* During compilation, the "first char" settings from forward assertions are
05862 discarded, because they can cause conflicts with actual literals that follow.
05863 However, if we end up without a first char setting for an unanchored pattern,
05864 it is worth scanning the regex for an initial asserted first char. If all
05865 branches start with the same asserted char, or with a bracket all of whose
05866 alternatives do (recurse ad lib), we return that char, otherwise -1. Each scan
05867 starts after the complete bracket item, including a CBRA's group number.
05868 
05869 Arguments:
05870   code       points to start of expression (the bracket)
05871   options    pointer to the options (used to check casing changes)
05872   inassert   TRUE if in an assertion
05873 
05874 Returns:     -1 or the fixed first char (with REQ_CASELESS set if caseless)
05875 */
05876 
05877 static int
05878 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
05879 {
05880 register int c = -1;
05881 do {
05882    int d;                     /* Result from a nested bracket */
05883    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
05884      options, PCRE_CASELESS, TRUE);
05885    register int op = *scode;
05886 
05887    switch(op)
05888      {
05889      default:
05890      return -1;
05891 
05892      case OP_BRA:
05893      case OP_CBRA:
05894      case OP_ASSERT:
05895      case OP_ONCE:
05896      case OP_COND:
05897      if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
05898        return -1;
05899      if (c < 0) c = d; else if (c != d) return -1;
05900      break;
05901 
05902      case OP_EXACT:       /* Step 2 bytes on (presumably the repeat count) */
05903      scode += 2;
05904      /* Fall through */
05905      case OP_CHAR:
05906      case OP_CHARNC:
05907      case OP_PLUS:
05908      case OP_MINPLUS:
05909      case OP_POSPLUS:
05910      if (!inassert) return -1;     /* Only asserted characters count */
05911      if (c < 0)
05912        {
05913        c = scode[1];
05914        if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
05915        }
05916      else if (c != scode[1]) return -1;
05917      break;
05918      }
05919 
05920    code += GET(code, 1);      /* Follow the link to the next ALT or the KET */
05921    }
05922 while (*code == OP_ALT);
05923 return c;
05924 }
05925 
05926 
05927 
05928 /*************************************************
05929 *        Compile a Regular Expression            *
05930 *************************************************/
05931 
05932 /* This function takes a string and returns a pointer to a block of store
05933 holding a compiled version of the expression. The original API for this
05934 function had no error code return variable; it is retained for backwards
05935 compatibility as pcre_compile(), which simply calls pcre_compile2() with NULL.
05936 
05937 Arguments:
05938   pattern       the regular expression
05939   options       various option bits
05940   errorcodeptr  pointer to error code variable (pcre_compile2() only)
05941                   can be NULL if you don't want a code value
05942   errorptr      pointer to pointer to error text
05943   erroroffset   ptr offset in pattern where error was detected
05944   tables        pointer to character tables or NULL
05945 
05946 Returns:        pointer to compiled data block, or NULL on error,
05947                 with errorptr and erroroffset set
05948 */
05949 
05950 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
05951 pcre_compile(const char *pattern, int options, const char **errorptr,
05952   int *erroroffset, const unsigned char *tables)
05953 {
05954 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);  /* NULL: no error code wanted */
05955 }
05956 
05957 
05958 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
05959 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
05960   const char **errorptr, int *erroroffset, const unsigned char *tables)
05961 {
05962 real_pcre *re;
05963 int length = 1;  /* For final END opcode */
05964 int firstbyte, reqbyte, newline;
05965 int errorcode = 0;
05966 int skipatstart = 0;
05967 #ifdef SUPPORT_UTF8
05968 BOOL utf8;
05969 #endif
05970 size_t size;
05971 uschar *code;
05972 const uschar *codestart;
05973 const uschar *ptr;
05974 compile_data compile_block;
05975 compile_data *cd = &compile_block;
05976 
05977 /* This space is used for "compiling" into during the first phase, when we are
05978 computing the amount of memory that is needed. Compiled items are thrown away
05979 as soon as possible, so that a fairly large buffer should be sufficient for
05980 this purpose. The same space is used in the second phase for remembering where
05981 to fill in forward references to subpatterns. */
05982 
05983 uschar cworkspace[COMPILE_WORK_SIZE];
05984 
05985 /* Set this early so that early errors get offset 0. */
05986 
05987 ptr = (const uschar *)pattern;
05988 
05989 /* We can't pass back an error message if errorptr is NULL; I guess the best we
05990 can do is just return NULL, but we can set a code value if there is a code
05991 pointer. */
05992 
05993 if (errorptr == NULL)
05994   {
05995   if (errorcodeptr != NULL) *errorcodeptr = 99;
05996   return NULL;
05997   }
05998 
05999 *errorptr = NULL;
06000 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
06001 
06002 /* However, we can give a message for this error */
06003 
06004 if (erroroffset == NULL)
06005   {
06006   errorcode = ERR16;
06007   goto PCRE_EARLY_ERROR_RETURN2;
06008   }
06009 
06010 *erroroffset = 0;
06011 
06012 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
06013 
06014 #ifdef SUPPORT_UTF8
06015 utf8 = (options & PCRE_UTF8) != 0;
06016 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
06017      (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
06018   {
06019   errorcode = ERR44;
06020   goto PCRE_EARLY_ERROR_RETURN2;
06021   }
06022 #else
06023 if ((options & PCRE_UTF8) != 0)
06024   {
06025   errorcode = ERR32;
06026   goto PCRE_EARLY_ERROR_RETURN;
06027   }
06028 #endif
06029 
06030 if ((options & ~PUBLIC_OPTIONS) != 0)
06031   {
06032   errorcode = ERR17;
06033   goto PCRE_EARLY_ERROR_RETURN;
06034   }
06035 
06036 /* Set up pointers to the individual character tables */
06037 
06038 if (tables == NULL) tables = _pcre_default_tables;
06039 cd->lcc = tables + lcc_offset;
06040 cd->fcc = tables + fcc_offset;
06041 cd->cbits = tables + cbits_offset;
06042 cd->ctypes = tables + ctypes_offset;
06043 
06044 /* Check for global one-time settings at the start of the pattern, and remember
06045 the offset for later. */
06046 
06047 while (ptr[skipatstart] == '(' && ptr[skipatstart+1] == '*')
06048   {
06049   int newnl = 0;
06050   int newbsr = 0;
06051 
06052   if (strncmp((char *)(ptr+skipatstart+2), "CR)", 3) == 0)
06053     { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
06054   else if (strncmp((char *)(ptr+skipatstart+2), "LF)", 3)  == 0)
06055     { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
06056   else if (strncmp((char *)(ptr+skipatstart+2), "CRLF)", 5)  == 0)
06057     { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
06058   else if (strncmp((char *)(ptr+skipatstart+2), "ANY)", 4) == 0)
06059     { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
06060   else if (strncmp((char *)(ptr+skipatstart+2), "ANYCRLF)", 8)  == 0)
06061     { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
06062 
06063   else if (strncmp((char *)(ptr+skipatstart+2), "BSR_ANYCRLF)", 12) == 0)
06064     { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
06065   else if (strncmp((char *)(ptr+skipatstart+2), "BSR_UNICODE)", 12) == 0)
06066     { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
06067 
06068   if (newnl != 0)
06069     options = (options & ~PCRE_NEWLINE_BITS) | newnl;
06070   else if (newbsr != 0)
06071     options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
06072   else break;
06073   }
06074 
06075 /* Check validity of \R options. */
06076 
06077 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
06078   {
06079   case 0:
06080   case PCRE_BSR_ANYCRLF:
06081   case PCRE_BSR_UNICODE:
06082   break;
06083   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
06084   }
06085 
06086 /* Handle different types of newline. The three bits give seven cases. The
06087 current code allows for fixed one- or two-byte sequences, plus "any" and
06088 "anycrlf". */
06089 
06090 switch (options & PCRE_NEWLINE_BITS)
06091   {
06092   case 0: newline = NEWLINE; break;   /* Build-time default */
06093   case PCRE_NEWLINE_CR: newline = '\r'; break;
06094   case PCRE_NEWLINE_LF: newline = '\n'; break;
06095   case PCRE_NEWLINE_CR+
06096        PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
06097   case PCRE_NEWLINE_ANY: newline = -1; break;
06098   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
06099   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
06100   }
06101 
06102 if (newline == -2)
06103   {
06104   cd->nltype = NLTYPE_ANYCRLF;
06105   }
06106 else if (newline < 0)
06107   {
06108   cd->nltype = NLTYPE_ANY;
06109   }
06110 else
06111   {
06112   cd->nltype = NLTYPE_FIXED;
06113   if (newline > 255)
06114     {
06115     cd->nllen = 2;
06116     cd->nl[0] = (newline >> 8) & 255;
06117     cd->nl[1] = newline & 255;
06118     }
06119   else
06120     {
06121     cd->nllen = 1;
06122     cd->nl[0] = newline;
06123     }
06124   }
06125 
06126 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
06127 references to help in deciding whether (.*) can be treated as anchored or not.
06128 */
06129 
06130 cd->top_backref = 0;
06131 cd->backref_map = 0;
06132 
06133 /* Reflect pattern for debugging output */
06134 
06135 DPRINTF(("------------------------------------------------------------------\n"));
06136 DPRINTF(("%s\n", pattern));
06137 
06138 /* Pretend to compile the pattern while actually just accumulating the length
06139 of memory required. This behaviour is triggered by passing a non-NULL final
06140 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
06141 to compile parts of the pattern into; the compiled code is discarded when it is
06142 no longer needed, so hopefully this workspace will never overflow, though there
06143 is a test for its doing so. */
06144 
06145 cd->bracount = cd->final_bracount = 0;
06146 cd->names_found = 0;
06147 cd->name_entry_size = 0;
06148 cd->name_table = NULL;
06149 cd->start_workspace = cworkspace;
06150 cd->start_code = cworkspace;
06151 cd->hwm = cworkspace;
06152 cd->start_pattern = (const uschar *)pattern;
06153 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
06154 cd->req_varyopt = 0;
06155 cd->external_options = options;
06156 cd->external_flags = 0;
06157 
06158 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
06159 don't need to look at the result of the function here. The initial options have
06160 been put into the cd block so that they can be changed if an option setting is
06161 found within the regex right at the beginning. Bringing initial option settings
06162 outside can help speed up starting point checks. */
06163 
06164 ptr += skipatstart;
06165 code = cworkspace;
06166 *code = OP_BRA;
06167 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
06168   &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
06169   &length);
06170 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
06171 
06172 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
06173   cd->hwm - cworkspace));
06174 
06175 if (length > MAX_PATTERN_SIZE)
06176   {
06177   errorcode = ERR20;
06178   goto PCRE_EARLY_ERROR_RETURN;
06179   }
06180 
06181 /* Compute the size of data block needed and get it, either from malloc or
06182 externally provided function. Integer overflow should no longer be possible
06183 because nowadays we limit the maximum value of cd->names_found and
06184 cd->name_entry_size. */
06185 
06186 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
06187 re = (real_pcre *)(pcre_malloc)(size);
06188 
06189 if (re == NULL)
06190   {
06191   errorcode = ERR21;
06192   goto PCRE_EARLY_ERROR_RETURN;
06193   }
06194 
06195 /* Put in the magic number, and save the sizes, initial options, internal
06196 flags, and character table pointer. NULL is used for the default character
06197 tables. The nullpad field is at the end; it's there to help in the case when a
06198 regex compiled on a system with 4-byte pointers is run on another with 8-byte
06199 pointers. */
06200 
06201 re->magic_number = MAGIC_NUMBER;
06202 re->size = size;
06203 re->options = cd->external_options;
06204 re->flags = cd->external_flags;
06205 re->dummy1 = 0;
06206 re->first_byte = 0;
06207 re->req_byte = 0;
06208 re->name_table_offset = sizeof(real_pcre);
06209 re->name_entry_size = cd->name_entry_size;
06210 re->name_count = cd->names_found;
06211 re->ref_count = 0;
06212 re->tables = (tables == _pcre_default_tables)? NULL : tables;
06213 re->nullpad = NULL;
06214 
06215 /* The starting points of the name/number translation table and of the code are
06216 passed around in the compile data block. The start/end pattern and initial
06217 options are already set from the pre-compile phase, as is the name_entry_size
06218 field. Reset the bracket count and the names_found field. Also reset the hwm
06219 field; this time it's used for remembering forward references to subpatterns.
06220 */
06221 
06222 cd->final_bracount = cd->bracount;  /* Save for checking forward references */
06223 cd->bracount = 0;
06224 cd->names_found = 0;
06225 cd->name_table = (uschar *)re + re->name_table_offset;
06226 codestart = cd->name_table + re->name_entry_size * re->name_count;
06227 cd->start_code = codestart;
06228 cd->hwm = cworkspace;
06229 cd->req_varyopt = 0;
06230 cd->had_accept = FALSE;
06231 
06232 /* Set up a starting, non-extracting bracket, then compile the expression. On
06233 error, errorcode will be set non-zero, so we don't need to look at the result
06234 of the function here. */
06235 
06236 ptr = (const uschar *)pattern + skipatstart;
06237 code = (uschar *)codestart;
06238 *code = OP_BRA;
06239 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
06240   &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
06241 re->top_bracket = cd->bracount;
06242 re->top_backref = cd->top_backref;
06243 re->flags = cd->external_flags;
06244 
06245 if (cd->had_accept) reqbyte = -1;   /* Must disable after (*ACCEPT) */
06246 
06247 /* If not reached end of pattern on success, there's an excess bracket. */
06248 
06249 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
06250 
06251 /* Fill in the terminating state and check for disastrous overflow, but
06252 if debugging, leave the test till after things are printed out. */
06253 
06254 *code++ = OP_END;
06255 
06256 #ifndef DEBUG
06257 if (code - codestart > length) errorcode = ERR23;
06258 #endif
06259 
06260 /* Fill in any forward references that are required. */
06261 
06262 while (errorcode == 0 && cd->hwm > cworkspace)
06263   {
06264   int offset, recno;
06265   const uschar *groupptr;
06266   cd->hwm -= LINK_SIZE;
06267   offset = GET(cd->hwm, 0);
06268   recno = GET(codestart, offset);
06269   groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
06270   if (groupptr == NULL) errorcode = ERR53;
06271     else PUT(((uschar *)codestart), offset, groupptr - codestart);
06272   }
06273 
06274 /* Give an error if there's back reference to a non-existent capturing
06275 subpattern. */
06276 
06277 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
06278 
06279 /* Failed to compile, or error while post-processing */
06280 
06281 if (errorcode != 0)
06282   {
06283   (pcre_free)(re);
06284   PCRE_EARLY_ERROR_RETURN:
06285   *erroroffset = ptr - (const uschar *)pattern;
06286   PCRE_EARLY_ERROR_RETURN2:
06287   *errorptr = find_error_text(errorcode);
06288   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
06289   return NULL;
06290   }
06291 
06292 /* If the anchored option was not passed, set the flag if we can determine that
06293 the pattern is anchored by virtue of ^ characters or \A or anything else (such
06294 as starting with .* when DOTALL is set).
06295 
06296 Otherwise, if we know what the first byte has to be, save it, because that
06297 speeds up unanchored matches no end. If not, see if we can set the
06298 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
06299 start with ^. and also when all branches start with .* for non-DOTALL matches.
06300 */
06301 
06302 if ((re->options & PCRE_ANCHORED) == 0)
06303   {
06304   int temp_options = re->options;   /* May get changed during these scans */
06305   if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
06306     re->options |= PCRE_ANCHORED;
06307   else
06308     {
06309     if (firstbyte < 0)
06310       firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
06311     if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
06312       {
06313       int ch = firstbyte & 255;
06314       re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
06315          cd->fcc[ch] == ch)? ch : firstbyte;
06316       re->flags |= PCRE_FIRSTSET;
06317       }
06318     else if (is_startline(codestart, 0, cd->backref_map))
06319       re->flags |= PCRE_STARTLINE;
06320     }
06321   }
06322 
06323 /* For an anchored pattern, we use the "required byte" only if it follows a
06324 variable length item in the regex. Remove the caseless flag for non-caseable
06325 bytes. */
06326 
06327 if (reqbyte >= 0 &&
06328      ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
06329   {
06330   int ch = reqbyte & 255;
06331   re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
06332     cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
06333   re->flags |= PCRE_REQCHSET;
06334   }
06335 
06336 /* Print out the compiled data if debugging is enabled. This is never the
06337 case when building a production library. */
06338 
06339 #ifdef DEBUG
06340 
06341 printf("Length = %d top_bracket = %d top_backref = %d\n",
06342   length, re->top_bracket, re->top_backref);
06343 
06344 printf("Options=%08x\n", re->options);
06345 
06346 if ((re->flags & PCRE_FIRSTSET) != 0)
06347   {
06348   int ch = re->first_byte & 255;
06349   const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
06350     "" : " (caseless)";
06351   if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
06352     else printf("First char = \\x%02x%s\n", ch, caseless);
06353   }
06354 
06355 if ((re->flags & PCRE_REQCHSET) != 0)
06356   {
06357   int ch = re->req_byte & 255;
06358   const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
06359     "" : " (caseless)";
06360   if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
06361     else printf("Req char = \\x%02x%s\n", ch, caseless);
06362   }
06363 
06364 pcre_printint(re, stdout, TRUE);
06365 
06366 /* This check is done here in the debugging case so that the code that
06367 was compiled can be seen. */
06368 
06369 if (code - codestart > length)
06370   {
06371   (pcre_free)(re);
06372   *errorptr = find_error_text(ERR23);
06373   *erroroffset = ptr - (uschar *)pattern;
06374   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
06375   return NULL;
06376   }
06377 #endif   /* DEBUG */
06378 
06379 return (pcre *)re;
06380 }
06381 
06382 /* End of pcre_compile.c */

Generated on Tue Jul 5 14:11:57 2011 for ROOT_528-00b_version by  doxygen 1.5.1