00001 /************************************************* 00002 * Perl-Compatible Regular Expressions * 00003 *************************************************/ 00004 00005 /* PCRE is a library of functions to support regular expressions whose syntax 00006 and semantics are as close as possible to those of the Perl 5 language. 00007 00008 Written by Philip Hazel 00009 Copyright (c) 1997-2008 University of Cambridge 00010 00011 ----------------------------------------------------------------------------- 00012 Redistribution and use in source and binary forms, with or without 00013 modification, are permitted provided that the following conditions are met: 00014 00015 * Redistributions of source code must retain the above copyright notice, 00016 this list of conditions and the following disclaimer. 00017 00018 * Redistributions in binary form must reproduce the above copyright 00019 notice, this list of conditions and the following disclaimer in the 00020 documentation and/or other materials provided with the distribution. 00021 00022 * Neither the name of the University of Cambridge nor the names of its 00023 contributors may be used to endorse or promote products derived from 00024 this software without specific prior written permission. 00025 00026 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 00027 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 00028 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 00029 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 00030 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 00031 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 00032 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 00033 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 00034 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 00035 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00036 POSSIBILITY OF SUCH DAMAGE. 00037 ----------------------------------------------------------------------------- 00038 */ 00039 00040 00041 /* This module contains an internal function for validating UTF-8 character 00042 strings. */ 00043 00044 00045 #ifdef HAVE_CONFIG_H 00046 #include "config.h" 00047 #endif 00048 00049 #include "pcre_internal.h" 00050 00051 00052 /************************************************* 00053 * Validate a UTF-8 string * 00054 *************************************************/ 00055 00056 /* This function is called (optionally) at the start of compile or match, to 00057 validate that a supposed UTF-8 string is actually valid. The early check means 00058 that subsequent code can assume it is dealing with a valid string. The check 00059 can be turned off for maximum performance, but the consequences of supplying 00060 an invalid string are then undefined. 00061 00062 Originally, this function checked according to RFC 2279, allowing for values in 00063 the range 0 to 0x7fffffff, up to 6 bytes long, but ensuring that they were in 00064 the canonical format. Once somebody had pointed out RFC 3629 to me (it 00065 obsoletes 2279), additional restrictions were applied. The values are now 00066 limited to be between 0 and 0x0010ffff, no more than 4 bytes long, and the 00067 subrange 0xd000 to 0xdfff is excluded. 00068 00069 Arguments: 00070 string points to the string 00071 length length of string, or -1 if the string is zero-terminated 00072 00073 Returns: < 0 if the string is a valid UTF-8 string 00074 >= 0 otherwise; the value is the offset of the bad byte 00075 */ 00076 00077 int 00078 _pcre_valid_utf8(const uschar *string, int length) 00079 { 00080 #ifdef SUPPORT_UTF8 00081 register const uschar *p; 00082 00083 if (length < 0) 00084 { 00085 for (p = string; *p != 0; p++); 00086 length = p - string; 00087 } 00088 00089 for (p = string; length-- > 0; p++) 00090 { 00091 register int ab; 00092 register int c = *p; 00093 if (c < 128) continue; 00094 if (c < 0xc0) return p - string; 00095 ab = _pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ 00096 if (length < ab || ab > 3) return p - string; 00097 length -= ab; 00098 00099 /* Check top bits in the second byte */ 00100 if ((*(++p) & 0xc0) != 0x80) return p - string; 00101 00102 /* Check for overlong sequences for each different length, and for the 00103 excluded range 0xd000 to 0xdfff. */ 00104 00105 switch (ab) 00106 { 00107 /* Check for xx00 000x (overlong sequence) */ 00108 00109 case 1: 00110 if ((c & 0x3e) == 0) return p - string; 00111 continue; /* We know there aren't any more bytes to check */ 00112 00113 /* Check for 1110 0000, xx0x xxxx (overlong sequence) or 00114 1110 1101, 1010 xxxx (0xd000 - 0xdfff) */ 00115 00116 case 2: 00117 if ((c == 0xe0 && (*p & 0x20) == 0) || 00118 (c == 0xed && *p >= 0xa0)) 00119 return p - string; 00120 break; 00121 00122 /* Check for 1111 0000, xx00 xxxx (overlong sequence) or 00123 greater than 0x0010ffff (f4 8f bf bf) */ 00124 00125 case 3: 00126 if ((c == 0xf0 && (*p & 0x30) == 0) || 00127 (c > 0xf4 ) || 00128 (c == 0xf4 && *p > 0x8f)) 00129 return p - string; 00130 break; 00131 00132 #if 0 00133 /* These cases can no longer occur, as we restrict to a maximum of four 00134 bytes nowadays. Leave the code here in case we ever want to add an option 00135 for longer sequences. */ 00136 00137 /* Check for 1111 1000, xx00 0xxx */ 00138 case 4: 00139 if (c == 0xf8 && (*p & 0x38) == 0) return p - string; 00140 break; 00141 00142 /* Check for leading 0xfe or 0xff, and then for 1111 1100, xx00 00xx */ 00143 case 5: 00144 if (c == 0xfe || c == 0xff || 00145 (c == 0xfc && (*p & 0x3c) == 0)) return p - string; 00146 break; 00147 #endif 00148 00149 } 00150 00151 /* Check for valid bytes after the 2nd, if any; all must start 10 */ 00152 while (--ab > 0) 00153 { 00154 if ((*(++p) & 0xc0) != 0x80) return p - string; 00155 } 00156 } 00157 #else 00158 (void)(string); /* Keep picky compilers happy */ 00159 (void)(length); 00160 #endif 00161 00162 return -1; 00163 } 00164 00165 /* End of pcre_valid_utf8.c */