00001 #include "Riostream.h" 00002 #include "TString.h" 00003 #include "TPRegexp.h" 00004 #include "TClonesArray.h" 00005 #include "TObjString.h" 00006 00007 //------------------------------------------------------------------------------------------- 00008 // 00009 // A regular expression, often called a pattern, is an expression that describes a set of 00010 // strings. They are usually used to give a concise description of a set, without having to 00011 // list all elements. 00012 // The Unix utilities like sed and grep make extensive use of regular expressions. Scripting 00013 // languages like Perl have regular expression engines built directly into their syntax . 00014 // 00015 // Extensive documentation about Regular expressions in Perl can be 00016 // found at : 00017 // http://perldoc.perl.org/perlre.html 00018 // 00019 // ROOT has this capability through the use of the P(erl) C(ompatible) R(egular) E(xpression) 00020 // - library, PCRE, see http://www.pcre.org 00021 // 00022 // Its functionality can be accessed through the TPRegexp and TString class . 00023 // Note that in patterns taken from Perl all backslash character have to be replaced in the 00024 // C/C++ strings by two backslashes . 00025 // 00026 // This macro shows several ways how to use the Match/Substitute capabilities of the 00027 // the TPRegexp class . It can be run as follows : 00028 // .x regexp.C 00029 // 00030 // Author: Eddy Offermann 00031 // 00032 //------------------------------------------------------------------------------------------- 00033 00034 void regexp() 00035 { 00036 // Substitute example : 00037 // Find a word that starts with "peper" and ends with "koek" . 00038 00039 TString s1("lekkere pepernotenkoek"); 00040 TPRegexp r1("\\bpeper(\\w+)koek\\b"); 00041 00042 // Note that the TString class gives access to some of the simpler TPRegexp functionality . 00043 // The following command returns the fully matched string . 00044 cout << s1(r1) << endl; 00045 00046 // In the "Substitute" command, keep the middle part (indicated in the regexp by "(\\w+)" 00047 // and the substitute string by "$1") and sandwich it between "wal" and "boom" . 00048 r1.Substitute(s1,"wal$1boom"); 00049 cout << s1 << endl; 00050 00051 // Substitute example : 00052 // Swap first two words in a string 00053 00054 TString s2("one two three"); 00055 TPRegexp("^([^ ]+) +([^ ]+)").Substitute(s2,"$2 $1"); 00056 cout << s2 << endl; 00057 00058 // Substitute example : 00059 // $1, $2, and so on, in the substitute string are equivalent to whatever the corresponding set 00060 // of parentheses match in the regexp string, counting opening parentheses from left to right . 00061 // In the following example, we are trying to catch a date MMDDYYYY in a string and rearrange 00062 // it to DDMMYYY . "(\\d{1,2}) matches only 1 or 2 digits etc . 00063 00064 TString s3("on 09/24/1959 the world stood still"); 00065 TPRegexp("\\b(\\d{1,2})/(\\d{1,2})/(\\d{4})\\b").Substitute(s3,"$2-$1-$3"); 00066 cout << s3 << endl; 00067 00068 // Match Example : 00069 // The following example shows how to extract a protocol and port number from an URL string . 00070 // Note again the parentheses in the regexp string : "(\\w+)" requires a non-empty 00071 // alphanumeric string while "(\\d+)" wants a pure digital string . 00072 // The matched substrings together with the full matched string are returned in a 00073 // TObjArray . The first entry is the full string while next entries are the substrings 00074 // in the order as listed in the regexp string . 00075 // 00076 // Note that there is also a Match(..) command that returns the positions of the 00077 // substrings in the input string . 00078 00079 TString s4("http://fink.sourceforge.net:8080/index/readme.html"); 00080 TObjArray *subStrL = TPRegexp("^(\\w+)://[^/]+:(\\d+)/$").MatchS(s4); 00081 const Int_t nrSubStr = subStrL->GetLast()+1; 00082 if (nrSubStr > 2) { 00083 const TString proto = ((TObjString *)subStrL->At(1))->GetString(); 00084 const TString port = ((TObjString *)subStrL->At(2))->GetString(); 00085 cout << "protocol: " << proto << " port: " << port << endl; 00086 } 00087 00088 // Match Example : 00089 // This example returns kTRUE if the email address is valid . For that it has to fulfill the following 00090 // criteria: 00091 // 1) It should be of the form string1@string2 . The "^" and "$" ensure that we compare the complete 00092 // email string 00093 // 2) ([\\w-\\.]+) : 00094 // string1 is only allowed to be composed out of the alphanumeric characters, "-" and "." . 00095 // The "+" ensures that string1 can not be empty . 00096 // 3) string2 is matched against three different parts : 00097 // a. ((\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.)|(([\\w-]+\\.)+)) : 00098 // This regular expression ensures that EITHER the string starts with "[" followed by three groups 00099 // of numbers, separated by "." , where each group has 1 to 3 numbers, OR alphanumeric strings, 00100 // possibly containing "-" characters, seperated by "." . 00101 // b. ([a-zA-Z]{2,4}|[0-9]{1,3}) : 00102 // This part contains EITHER 2 to 4 alpha characters OR 1 to 3 numbers 00103 // c. (\\]?) : 00104 // At most one "]" character . 00105 00106 TString s5("fons.rademakers@cern.ch"); 00107 TPRegexp r5("^([\\w-\\.]+)@((\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.)|(([\\w-]+\\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\\]?)$"); 00108 cout << "Check if the email address \"" << s5 << "\" is valid: " << (r5.MatchB(s5) ? "TRUE" : "FALSE") << endl; 00109 00110 // Substitute Example with pattern modifier : 00111 // Like in Perl, Substitute/Match commands accept modifier arguments . For instance a "g" modifier causes to 00112 // match the regexp globally . In the example below, all words starting and ending with the character "n" 00113 // are replaced by the word neutrino . 00114 00115 TString s6("neutron proton electron neutron"); 00116 TPRegexp("(n\\w+n)").Substitute(s6,"neutrino","g"); 00117 cout << s6 << endl; 00118 }