pcredemo.c

Go to the documentation of this file.
00001 /*************************************************
00002 *           PCRE DEMONSTRATION PROGRAM           *
00003 *************************************************/
00004 
00005 /* This is a demonstration program to illustrate the most straightforward ways
00006 of calling the PCRE regular expression library from a C program. See the
00007 pcresample documentation for a short discussion ("man pcresample" if you have
00008 the PCRE man pages installed).
00009 
00010 In Unix-like environments, compile this program thuswise:
00011 
00012   gcc -Wall pcredemo.c -I/usr/local/include -L/usr/local/lib \
00013     -R/usr/local/lib -lpcre
00014 
00015 Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
00016 library files for PCRE are installed on your system. You don't need -I and -L
00017 if PCRE is installed in the standard system libraries. Only some operating
00018 systems (e.g. Solaris) use the -R option.
00019 
00020 Building under Windows:
00021 
00022 If you want to statically link this program against a non-dll .a file, you must
00023 define PCRE_STATIC before including pcre.h, otherwise the pcre_malloc() and
00024 pcre_free() exported functions will be declared __declspec(dllimport), with
00025 unwanted results. So in this environment, uncomment the following line. */
00026 
00027 /* #define PCRE_STATIC */
00028 
00029 #include <stdio.h>
00030 #include <string.h>
00031 #include <pcre.h>
00032 
00033 #define OVECCOUNT 30    /* should be a multiple of 3 */
00034 
00035 
00036 int main(int argc, char **argv)
00037 {
00038 pcre *re;
00039 const char *error;
00040 char *pattern;
00041 char *subject;
00042 unsigned char *name_table;
00043 int erroffset;
00044 int find_all;
00045 int namecount;
00046 int name_entry_size;
00047 int ovector[OVECCOUNT];
00048 int subject_length;
00049 int rc, i;
00050 
00051 
00052 /**************************************************************************
00053 * First, sort out the command line. There is only one possible option at  *
00054 * the moment, "-g" to request repeated matching to find all occurrences,  *
00055 * like Perl's /g option. We set the variable find_all to a non-zero value *
00056 * if the -g option is present. Apart from that, there must be exactly two *
00057 * arguments.                                                              *
00058 **************************************************************************/
00059 
00060 find_all = 0;
00061 for (i = 1; i < argc; i++)
00062   {
00063   if (strcmp(argv[i], "-g") == 0) find_all = 1;
00064     else break;
00065   }
00066 
00067 /* After the options, we require exactly two arguments, which are the pattern,
00068 and the subject string. */
00069 
00070 if (argc - i != 2)
00071   {
00072   printf("Two arguments required: a regex and a subject string\n");
00073   return 1;
00074   }
00075 
00076 pattern = argv[i];
00077 subject = argv[i+1];
00078 subject_length = (int)strlen(subject);
00079 
00080 
00081 /*************************************************************************
00082 * Now we are going to compile the regular expression pattern, and handle *
00083 * and errors that are detected.                                          *
00084 *************************************************************************/
00085 
00086 re = pcre_compile(
00087   pattern,              /* the pattern */
00088   0,                    /* default options */
00089   &error,               /* for error message */
00090   &erroffset,           /* for error offset */
00091   NULL);                /* use default character tables */
00092 
00093 /* Compilation failed: print the error message and exit */
00094 
00095 if (re == NULL)
00096   {
00097   printf("PCRE compilation failed at offset %d: %s\n", erroffset, error);
00098   return 1;
00099   }
00100 
00101 
00102 /*************************************************************************
00103 * If the compilation succeeded, we call PCRE again, in order to do a     *
00104 * pattern match against the subject string. This does just ONE match. If *
00105 * further matching is needed, it will be done below.                     *
00106 *************************************************************************/
00107 
00108 rc = pcre_exec(
00109   re,                   /* the compiled pattern */
00110   NULL,                 /* no extra data - we didn't study the pattern */
00111   subject,              /* the subject string */
00112   subject_length,       /* the length of the subject */
00113   0,                    /* start at offset 0 in the subject */
00114   0,                    /* default options */
00115   ovector,              /* output vector for substring information */
00116   OVECCOUNT);           /* number of elements in the output vector */
00117 
00118 /* Matching failed: handle error cases */
00119 
00120 if (rc < 0)
00121   {
00122   switch(rc)
00123     {
00124     case PCRE_ERROR_NOMATCH: printf("No match\n"); break;
00125     /*
00126     Handle other special cases if you like
00127     */
00128     default: printf("Matching error %d\n", rc); break;
00129     }
00130   pcre_free(re);     /* Release memory used for the compiled pattern */
00131   return 1;
00132   }
00133 
00134 /* Match succeded */
00135 
00136 printf("\nMatch succeeded at offset %d\n", ovector[0]);
00137 
00138 
00139 /*************************************************************************
00140 * We have found the first match within the subject string. If the output *
00141 * vector wasn't big enough, say so. Then output any substrings that were *
00142 * captured.                                                              *
00143 *************************************************************************/
00144 
00145 /* The output vector wasn't big enough */
00146 
00147 if (rc == 0)
00148   {
00149   rc = OVECCOUNT/3;
00150   printf("ovector only has room for %d captured substrings\n", rc - 1);
00151   }
00152 
00153 /* Show substrings stored in the output vector by number. Obviously, in a real
00154 application you might want to do things other than print them. */
00155 
00156 for (i = 0; i < rc; i++)
00157   {
00158   char *substring_start = subject + ovector[2*i];
00159   int substring_length = ovector[2*i+1] - ovector[2*i];
00160   printf("%2d: %.*s\n", i, substring_length, substring_start);
00161   }
00162 
00163 
00164 /**************************************************************************
00165 * That concludes the basic part of this demonstration program. We have    *
00166 * compiled a pattern, and performed a single match. The code that follows *
00167 * shows first how to access named substrings, and then how to code for    *
00168 * repeated matches on the same subject.                                   *
00169 **************************************************************************/
00170 
00171 /* See if there are any named substrings, and if so, show them by name. First
00172 we have to extract the count of named parentheses from the pattern. */
00173 
00174 (void)pcre_fullinfo(
00175   re,                   /* the compiled pattern */
00176   NULL,                 /* no extra data - we didn't study the pattern */
00177   PCRE_INFO_NAMECOUNT,  /* number of named substrings */
00178   &namecount);          /* where to put the answer */
00179 
00180 if (namecount <= 0) printf("No named substrings\n"); else
00181   {
00182   unsigned char *tabptr;
00183   printf("Named substrings\n");
00184 
00185   /* Before we can access the substrings, we must extract the table for
00186   translating names to numbers, and the size of each entry in the table. */
00187 
00188   (void)pcre_fullinfo(
00189     re,                       /* the compiled pattern */
00190     NULL,                     /* no extra data - we didn't study the pattern */
00191     PCRE_INFO_NAMETABLE,      /* address of the table */
00192     &name_table);             /* where to put the answer */
00193 
00194   (void)pcre_fullinfo(
00195     re,                       /* the compiled pattern */
00196     NULL,                     /* no extra data - we didn't study the pattern */
00197     PCRE_INFO_NAMEENTRYSIZE,  /* size of each entry in the table */
00198     &name_entry_size);        /* where to put the answer */
00199 
00200   /* Now we can scan the table and, for each entry, print the number, the name,
00201   and the substring itself. */
00202 
00203   tabptr = name_table;
00204   for (i = 0; i < namecount; i++)
00205     {
00206     int n = (tabptr[0] << 8) | tabptr[1];
00207     printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
00208       ovector[2*n+1] - ovector[2*n], subject + ovector[2*n]);
00209     tabptr += name_entry_size;
00210     }
00211   }
00212 
00213 
00214 /*************************************************************************
00215 * If the "-g" option was given on the command line, we want to continue  *
00216 * to search for additional matches in the subject string, in a similar   *
00217 * way to the /g option in Perl. This turns out to be trickier than you   *
00218 * might think because of the possibility of matching an empty string.    *
00219 * What happens is as follows:                                            *
00220 *                                                                        *
00221 * If the previous match was NOT for an empty string, we can just start   *
00222 * the next match at the end of the previous one.                         *
00223 *                                                                        *
00224 * If the previous match WAS for an empty string, we can't do that, as it *
00225 * would lead to an infinite loop. Instead, a special call of pcre_exec() *
00226 * is made with the PCRE_NOTEMPTY and PCRE_ANCHORED flags set. The first  *
00227 * of these tells PCRE that an empty string is not a valid match; other   *
00228 * possibilities must be tried. The second flag restricts PCRE to one     *
00229 * match attempt at the initial string position. If this match succeeds,  *
00230 * an alternative to the empty string match has been found, and we can    *
00231 * proceed round the loop.                                                *
00232 *************************************************************************/
00233 
00234 if (!find_all)
00235   {
00236   pcre_free(re);   /* Release the memory used for the compiled pattern */
00237   return 0;        /* Finish unless -g was given */
00238   }
00239 
00240 /* Loop for second and subsequent matches */
00241 
00242 for (;;)
00243   {
00244   int options = 0;                 /* Normally no options */
00245   int start_offset = ovector[1];   /* Start at end of previous match */
00246 
00247   /* If the previous match was for an empty string, we are finished if we are
00248   at the end of the subject. Otherwise, arrange to run another match at the
00249   same point to see if a non-empty match can be found. */
00250 
00251   if (ovector[0] == ovector[1])
00252     {
00253     if (ovector[0] == subject_length) break;
00254     options = PCRE_NOTEMPTY | PCRE_ANCHORED;
00255     }
00256 
00257   /* Run the next matching operation */
00258 
00259   rc = pcre_exec(
00260     re,                   /* the compiled pattern */
00261     NULL,                 /* no extra data - we didn't study the pattern */
00262     subject,              /* the subject string */
00263     subject_length,       /* the length of the subject */
00264     start_offset,         /* starting offset in the subject */
00265     options,              /* options */
00266     ovector,              /* output vector for substring information */
00267     OVECCOUNT);           /* number of elements in the output vector */
00268 
00269   /* This time, a result of NOMATCH isn't an error. If the value in "options"
00270   is zero, it just means we have found all possible matches, so the loop ends.
00271   Otherwise, it means we have failed to find a non-empty-string match at a
00272   point where there was a previous empty-string match. In this case, we do what
00273   Perl does: advance the matching position by one, and continue. We do this by
00274   setting the "end of previous match" offset, because that is picked up at the
00275   top of the loop as the point at which to start again. */
00276 
00277   if (rc == PCRE_ERROR_NOMATCH)
00278     {
00279     if (options == 0) break;
00280     ovector[1] = start_offset + 1;
00281     continue;    /* Go round the loop again */
00282     }
00283 
00284   /* Other matching errors are not recoverable. */
00285 
00286   if (rc < 0)
00287     {
00288     printf("Matching error %d\n", rc);
00289     pcre_free(re);    /* Release memory used for the compiled pattern */
00290     return 1;
00291     }
00292 
00293   /* Match succeded */
00294 
00295   printf("\nMatch succeeded again at offset %d\n", ovector[0]);
00296 
00297   /* The match succeeded, but the output vector wasn't big enough. */
00298 
00299   if (rc == 0)
00300     {
00301     rc = OVECCOUNT/3;
00302     printf("ovector only has room for %d captured substrings\n", rc - 1);
00303     }
00304 
00305   /* As before, show substrings stored in the output vector by number, and then
00306   also any named substrings. */
00307 
00308   for (i = 0; i < rc; i++)
00309     {
00310     char *substring_start = subject + ovector[2*i];
00311     int substring_length = ovector[2*i+1] - ovector[2*i];
00312     printf("%2d: %.*s\n", i, substring_length, substring_start);
00313     }
00314 
00315   if (namecount <= 0) printf("No named substrings\n"); else
00316     {
00317     unsigned char *tabptr = name_table;
00318     printf("Named substrings\n");
00319     for (i = 0; i < namecount; i++)
00320       {
00321       int n = (tabptr[0] << 8) | tabptr[1];
00322       printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
00323         ovector[2*n+1] - ovector[2*n], subject + ovector[2*n]);
00324       tabptr += name_entry_size;
00325       }
00326     }
00327   }      /* End of loop to find second and subsequent matches */
00328 
00329 printf("\n");
00330 pcre_free(re);       /* Release memory used for the compiled pattern */
00331 return 0;
00332 }
00333 
00334 /* End of pcredemo.c */

Generated on Tue Jul 5 14:11:58 2011 for ROOT_528-00b_version by  doxygen 1.5.1