// Copyright (c) 2005, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author: Sanjay Ghemawat
//
// Regular-expression based scanner for parsing an input stream.
00033 // 00034 // Example 1: parse a sequence of "var = number" entries from input: 00035 // 00036 // Scanner scanner(input); 00037 // string var; 00038 // int number; 00039 // scanner.SetSkipExpression("\\s+"); // Skip any white space we encounter 00040 // while (scanner.Consume("(\\w+) = (\\d+)", &var, &number)) { 00041 // ...; 00042 // } 00043 00044 #ifndef _PCRE_SCANNER_H 00045 #define _PCRE_SCANNER_H 00046 00047 #include <assert.h> 00048 #include <string> 00049 #include <vector> 00050 00051 #include <pcrecpp.h> 00052 #include <pcre_stringpiece.h> 00053 00054 namespace pcrecpp { 00055 00056 class PCRECPP_EXP_DEFN Scanner { 00057 public: 00058 Scanner(); 00059 explicit Scanner(const std::string& input); 00060 ~Scanner(); 00061 00062 // Return current line number. The returned line-number is 00063 // one-based. I.e. it returns 1 + the number of consumed newlines. 00064 // 00065 // Note: this method may be slow. It may take time proportional to 00066 // the size of the input. 00067 int LineNumber() const; 00068 00069 // Return the byte-offset that the scanner is looking in the 00070 // input data; 00071 int Offset() const; 00072 00073 // Return true iff the start of the remaining input matches "re" 00074 bool LookingAt(const RE& re) const; 00075 00076 // Return true iff all of the following are true 00077 // a. the start of the remaining input matches "re", 00078 // b. if any arguments are supplied, matched sub-patterns can be 00079 // parsed and stored into the arguments. 00080 // If it returns true, it skips over the matched input and any 00081 // following input that matches the "skip" regular expression. 00082 bool Consume(const RE& re, 00083 const Arg& arg0 = RE::no_arg, 00084 const Arg& arg1 = RE::no_arg, 00085 const Arg& arg2 = RE::no_arg 00086 // TODO: Allow more arguments? 00087 ); 00088 00089 // Set the "skip" regular expression. If after consuming some data, 00090 // a prefix of the input matches this RE, it is automatically 00091 // skipped. 
For example, a programming language scanner would use 00092 // a skip RE that matches white space and comments. 00093 // 00094 // scanner.SetSkipExpression("\\s+|//.*|/[*](.|\n)*?[*]/"); 00095 // 00096 // Skipping repeats as long as it succeeds. We used to let people do 00097 // this by writing "(...)*" in the regular expression, but that added 00098 // up to lots of recursive calls within the pcre library, so now we 00099 // control repetition explicitly via the function call API. 00100 // 00101 // You can pass NULL for "re" if you do not want any data to be skipped. 00102 void Skip(const char* re); // DEPRECATED; does *not* repeat 00103 void SetSkipExpression(const char* re); 00104 00105 // Temporarily pause "skip"ing. This 00106 // Skip("Foo"); code ; DisableSkip(); code; EnableSkip() 00107 // is similar to 00108 // Skip("Foo"); code ; Skip(NULL); code ; Skip("Foo"); 00109 // but avoids creating/deleting new RE objects. 00110 void DisableSkip(); 00111 00112 // Reenable previously paused skipping. Any prefix of the input 00113 // that matches the skip pattern is immediately dropped. 00114 void EnableSkip(); 00115 00116 /***** Special wrappers around SetSkip() for some common idioms *****/ 00117 00118 // Arranges to skip whitespace, C comments, C++ comments. 00119 // The overall RE is a disjunction of the following REs: 00120 // \\s whitespace 00121 // //.*\n C++ comment 00122 // /[*](.|\n)*?[*]/ C comment (x*? means minimal repetitions of x) 00123 // We get repetition via the semantics of SetSkipExpression, not by using * 00124 void SkipCXXComments() { 00125 SetSkipExpression("\\s|//.*\n|/[*](?:\n|.)*?[*]/"); 00126 } 00127 00128 void set_save_comments(bool comments) { 00129 save_comments_ = comments; 00130 } 00131 00132 bool save_comments() { 00133 return save_comments_; 00134 } 00135 00136 // Append to vector ranges the comments found in the 00137 // byte range [start,end] (inclusive) of the input data. 
00138 // Only comments that were extracted entirely within that 00139 // range are returned: no range splitting of atomically-extracted 00140 // comments is performed. 00141 void GetComments(int start, int end, std::vector<StringPiece> *ranges); 00142 00143 // Append to vector ranges the comments added 00144 // since the last time this was called. This 00145 // functionality is provided for efficiency when 00146 // interleaving scanning with parsing. 00147 void GetNextComments(std::vector<StringPiece> *ranges); 00148 00149 private: 00150 std::string data_; // All the input data 00151 StringPiece input_; // Unprocessed input 00152 RE* skip_; // If non-NULL, RE for skipping input 00153 bool should_skip_; // If true, use skip_ 00154 bool skip_repeat_; // If true, repeat skip_ as long as it works 00155 bool save_comments_; // If true, aggregate the skip expression 00156 00157 // the skipped comments 00158 // TODO: later consider requiring that the StringPieces be added 00159 // in order by their start position 00160 std::vector<StringPiece> *comments_; 00161 00162 // the offset into comments_ that has been returned by GetNextComments 00163 int comments_offset_; 00164 00165 // helper function to consume *skip_ and honour 00166 // save_comments_ 00167 void ConsumeSkip(); 00168 }; 00169 00170 } // namespace pcrecpp 00171 00172 #endif /* _PCRE_SCANNER_H */