regexp.C

Go to the documentation of this file.
00001 #include "Riostream.h"
00002 #include "TString.h"
00003 #include "TPRegexp.h"
00004 #include "TClonesArray.h"
00005 #include "TObjString.h"
00006 
00007 //-------------------------------------------------------------------------------------------
00008 //
00009 // A regular expression, often called a pattern, is an expression that describes a set of
00010 // strings. They are usually used to give a concise description of a set, without having to
00011 // list all elements.
00012 // The Unix utilities like sed and grep make extensive use of regular expressions. Scripting
00013 // languages like Perl have regular expression engines built directly into their syntax .
00014 //
00015 // Extensive documentation about Regular expressions in Perl can be
00016 // found at :
00017 //              http://perldoc.perl.org/perlre.html
00018 //
00019 // ROOT has this capability through the use of the P(erl) C(ompatible) R(egular) E(xpression)
00020 //  - library, PCRE, see http://www.pcre.org
00021 //
00022 // Its functionality can be accessed through the TPRegexp and TString classes .
00023 // Note that in patterns taken from Perl all backslash characters have to be replaced in the
00024 // C/C++ strings by two backslashes .
00025 //
00026 // This macro shows several ways how to use the Match/Substitute capabilities of
00027 // the TPRegexp class . It can be run as follows :
00028 //     .x regexp.C
00029 //
00030 // Author: Eddy Offermann
00031 //
00032 //-------------------------------------------------------------------------------------------
00033 
00034 void regexp()
00035 {
00036    // Substitute example :
00037    // Find a word that starts with "peper" and ends with "koek" .
00038  
00039    TString s1("lekkere pepernotenkoek");
00040    TPRegexp r1("\\bpeper(\\w+)koek\\b");
00041 
00042    // Note that the TString class gives access to some of the simpler TPRegexp functionality .
00043    // The following command returns the fully matched string .
00044    cout << s1(r1) << endl;
00045 
00046    // In the "Substitute" command, keep the middle part (indicated in the regexp by "(\\w+)"
00047    // and the substitute string by "$1") and sandwich it between "wal" and "boom" .
00048    r1.Substitute(s1,"wal$1boom");
00049    cout << s1 << endl;
00050 
00051    // Substitute example :
00052    // Swap first two words in a string
00053 
00054    TString s2("one two three");
00055    TPRegexp("^([^ ]+) +([^ ]+)").Substitute(s2,"$2 $1");
00056    cout << s2 << endl;
00057 
00058    // Substitute example :
00059    // $1, $2, and so on, in the substitute string are equivalent to whatever the corresponding set
00060    // of parentheses match in the regexp string, counting opening parentheses from left to right .
00061    // In the following example, we are trying to catch a date MMDDYYYY in a string and rearrange
00062    // it to DDMMYYY . "(\\d{1,2}) matches only 1 or 2 digits etc .
00063  
00064    TString s3("on 09/24/1959 the world stood still");
00065    TPRegexp("\\b(\\d{1,2})/(\\d{1,2})/(\\d{4})\\b").Substitute(s3,"$2-$1-$3");
00066    cout << s3 << endl;
00067 
00068    // Match Example :
00069    // The following example shows how to extract a protocol and port number from an URL string . 
00070    // Note again the parentheses in the regexp string : "(\\w+)" requires a non-empty
00071    // alphanumeric string while "(\\d+)" wants a pure digital string .
00072    // The matched substrings together with the full matched string are returned in a
00073    // TObjArray . The first entry is the full string while next entries are the substrings
00074    // in the order as listed in the regexp string .
00075    //
00076    // Note that there is also a Match(..) command that returns the positions of the
00077    // substrings in the input string .
00078 
00079    TString s4("http://fink.sourceforge.net:8080/index/readme.html");
00080    TObjArray *subStrL = TPRegexp("^(\\w+)://[^/]+:(\\d+)/$").MatchS(s4);
00081    const Int_t nrSubStr = subStrL->GetLast()+1;
00082    if (nrSubStr > 2) {
00083      const TString proto = ((TObjString *)subStrL->At(1))->GetString();
00084      const TString port  = ((TObjString *)subStrL->At(2))->GetString();
00085      cout << "protocol: " << proto << "  port: " << port << endl;
00086    }
00087 
00088    // Match Example :
00089    // This example returns kTRUE if the email address is valid . For that it has to fulfill the following
00090    // criteria:
00091    // 1) It should be of the form string1@string2 . The "^" and "$" ensure that we compare the complete
00092    //    email string
00093    // 2) ([\\w-\\.]+)  : 
00094    //    string1 is only allowed to be composed out of the alphanumeric characters, "-" and "." .
00095    //    The "+" ensures that string1 can not be empty .
00096    // 3) string2 is matched against three different parts :
00097    //    a. ((\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.)|(([\\w-]+\\.)+))  :
00098    //       This regular expression ensures that EITHER the string starts with "[" followed by three groups
00099    //       of numbers, separated by "." , where each group has 1 to 3 numbers, OR alphanumeric strings,
00100    //       possibly containing "-" characters, seperated by "." .
00101    //    b. ([a-zA-Z]{2,4}|[0-9]{1,3})  :
00102    //       This part contains EITHER 2 to 4 alpha characters OR 1 to 3 numbers
00103    //    c. (\\]?)  :
00104    //       At most one "]" character .
00105 
00106    TString s5("fons.rademakers@cern.ch");
00107    TPRegexp r5("^([\\w-\\.]+)@((\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.)|(([\\w-]+\\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\\]?)$");
00108    cout << "Check if the email address \"" << s5 << "\" is valid: " << (r5.MatchB(s5) ? "TRUE" : "FALSE") << endl;
00109 
00110    // Substitute Example with pattern modifier :
00111    // Like in Perl, Substitute/Match commands accept modifier arguments . For instance a "g" modifier causes to
00112    // match the regexp globally . In the example below, all words starting and ending with the character "n"
00113    // are replaced by the word neutrino .
00114 
00115    TString s6("neutron proton electron neutron");
00116    TPRegexp("(n\\w+n)").Substitute(s6,"neutrino","g");
00117    cout << s6 << endl;
00118 }

Generated on Tue Jul 5 15:44:53 2011 for ROOT_528-00b_version by  doxygen 1.5.1