1 /*
2   tre-parse.c - Regexp parser
3 
4   This software is released under a BSD-style license.
5   See the file LICENSE for details and copyright.
6 
7 */
8 
9 /*
10   This parser is just a simple recursive descent parser for POSIX.2
11   regexps.  The parser supports both the obsolete default syntax and
12   the "extended" syntax, and some nonstandard extensions.
13 */
14 
15 
16 #ifdef HAVE_CONFIG_H
17 #include <config.h>
18 #endif /* HAVE_CONFIG_H */
19 #include <string.h>
20 #include <assert.h>
21 #include <limits.h>
22 
23 #include "xmalloc.h"
24 #include "tre-mem.h"
25 #include "tre-ast.h"
26 #include "tre-stack.h"
27 #include "tre-parse.h"
28 
29 
30 /* Characters with special meanings in regexp syntax. */
31 #define CHAR_PIPE      L'|'
32 #define CHAR_LPAREN    L'('
33 #define CHAR_RPAREN    L')'
34 #define CHAR_LBRACE    L'{'
35 #define CHAR_RBRACE    L'}'
36 #define CHAR_LBRACKET            L'['
37 #define CHAR_RBRACKET            L']'
38 #define CHAR_MINUS     L'-'
39 #define CHAR_STAR      L'*'
40 #define CHAR_QUESTIONMARK  L'?'
41 #define CHAR_PLUS      L'+'
42 #define CHAR_PERIOD    L'.'
43 #define CHAR_COLON     L':'
44 #define CHAR_EQUAL     L'='
45 #define CHAR_COMMA     L','
46 #define CHAR_CARET     L'^'
47 #define CHAR_DOLLAR    L'$'
48 #define CHAR_BACKSLASH           L'\\'
49 #define CHAR_HASH      L'#'
50 #define CHAR_TILDE     L'~'
51 
52 
53 /* Some macros for expanding \w, \s, etc. */
54 static const struct tre_macro_struct {
55   const char c;
56   const char *expansion;
57 } tre_macros[] =
58   { {'t', "\t"},       {'n', "\n"},                  {'r', "\r"},
59     {'f', "\f"},       {'a', "\a"},                  {'e', "\033"},
60     {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
61     {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"},   {'D', "[^[:digit:]]"},
62     { 0, NULL }
63   };
64 
65 
66 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
67    must have at least `len' items.  Sets buf[0] to zero if the there
68    is no match in `tre_macros'. */
69 static void
tre_expand_macro(const tre_char_t * regex,const tre_char_t * regex_end,tre_char_t * buf,size_t buf_len)70 tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end,
71                      tre_char_t *buf, size_t buf_len)
72 {
73   int i;
74 
75   buf[0] = 0;
76   if (regex >= regex_end)
77     return;
78 
79   for (i = 0; tre_macros[i].expansion; i++)
80     {
81       if (tre_macros[i].c == *regex)
82           {
83             unsigned int j;
84             DPRINT(("Expanding macro '%c' => '%s'\n",
85                       tre_macros[i].c, tre_macros[i].expansion));
86             for (j = 0; tre_macros[i].expansion[j] && j < buf_len; j++)
87               buf[j] = tre_macros[i].expansion[j];
88             buf[j] = 0;
89             break;
90           }
91     }
92 }
93 
94 static reg_errcode_t
tre_new_item(tre_mem_t mem,int min,int max,int * i,int * max_i,tre_ast_node_t *** items)95 tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
96            tre_ast_node_t ***items)
97 {
98   reg_errcode_t status;
99   tre_ast_node_t **array = *items;
100   /* Allocate more space if necessary. */
101   if (*i >= *max_i)
102     {
103       tre_ast_node_t **new_items;
104       DPRINT(("out of array space, i = %d\n", *i));
105       /* If the array is already 1024 items large, give up -- there's
106            probably an error in the regexp (e.g. not a '\0' terminated
107            string and missing ']') */
108       if (*max_i > 1024)
109           return REG_ESPACE;
110       *max_i *= 2;
111       new_items = xrealloc(array, sizeof(*array) * *max_i);
112       if (new_items == NULL)
113           return REG_ESPACE;
114       *items = array = new_items;
115     }
116   array[*i] = tre_ast_new_literal(mem, min, max, -1);
117   status = array[*i] == NULL ? REG_ESPACE : REG_OK;
118   (*i)++;
119   return status;
120 }
121 
122 
123 /* Expands a character class to character ranges. */
124 static reg_errcode_t
tre_expand_ctype(tre_mem_t mem,tre_ctype_t class,tre_ast_node_t *** items,int * i,int * max_i,int cflags)125 tre_expand_ctype(tre_mem_t mem, tre_ctype_t class, tre_ast_node_t ***items,
126                      int *i, int *max_i, int cflags)
127 {
128   reg_errcode_t status = REG_OK;
129   tre_cint_t c;
130   int j, min = -1, max = 0;
131   assert(TRE_MB_CUR_MAX == 1);
132 
133   DPRINT(("  expanding class to character ranges\n"));
134   for (j = 0; (j < 256) && (status == REG_OK); j++)
135     {
136       c = (tre_cint_t) j;
137       if (tre_isctype(c, class)
138             || ((cflags & REG_ICASE)
139                 && (tre_isctype(tre_tolower(c), class)
140                       || tre_isctype(tre_toupper(c), class))))
141 {
142             if (min < 0)
143               min = c;
144             max = c;
145           }
146       else if (min >= 0)
147           {
148             DPRINT(("  range %c (%d) to %c (%d)\n", min, min, max, max));
149             status = tre_new_item(mem, min, max, i, max_i, items);
150             min = -1;
151           }
152     }
153   if (min >= 0 && status == REG_OK)
154     status = tre_new_item(mem, min, max, i, max_i, items);
155   return status;
156 }
157 
158 
159 static int
tre_compare_items(const void * a,const void * b)160 tre_compare_items(const void *a, const void *b)
161 {
162   const tre_ast_node_t *node_a = *(tre_ast_node_t * const *)a;
163   const tre_ast_node_t *node_b = *(tre_ast_node_t * const *)b;
164   tre_literal_t *l_a = node_a->obj, *l_b = node_b->obj;
165   int a_min = l_a->code_min, b_min = l_b->code_min;
166 
167   if (a_min < b_min)
168     return -1;
169   else if (a_min > b_min)
170     return 1;
171   else
172     return 0;
173 }
174 
175 #ifndef TRE_USE_SYSTEM_WCTYPE
176 
177 int tre_isalnum_func(tre_cint_t);
178 int tre_isalpha_func(tre_cint_t);
179 int tre_isascii_func(tre_cint_t);
180 int tre_isblank_func(tre_cint_t);
181 int tre_iscntrl_func(tre_cint_t);
182 int tre_isdigit_func(tre_cint_t);
183 int tre_isgraph_func(tre_cint_t);
184 int tre_islower_func(tre_cint_t);
185 int tre_isprint_func(tre_cint_t);
186 int tre_ispunct_func(tre_cint_t);
187 int tre_isspace_func(tre_cint_t);
188 int tre_isupper_func(tre_cint_t);
189 int tre_isxdigit_func(tre_cint_t);
190 
191 /* isalnum() and the rest may be macros, so wrap them to functions. */
tre_isalnum_func(tre_cint_t c)192 int tre_isalnum_func(tre_cint_t c) { return tre_isalnum(c); }
tre_isalpha_func(tre_cint_t c)193 int tre_isalpha_func(tre_cint_t c) { return tre_isalpha(c); }
194 
195 #ifdef tre_isascii
tre_isascii_func(tre_cint_t c)196 int tre_isascii_func(tre_cint_t c) { return tre_isascii(c); }
197 #else /* !tre_isascii */
tre_isascii_func(tre_cint_t c)198 int tre_isascii_func(tre_cint_t c) { return !(c >> 7); }
199 #endif /* !tre_isascii */
200 
201 #ifdef tre_isblank
tre_isblank_func(tre_cint_t c)202 int tre_isblank_func(tre_cint_t c) { return tre_isblank(c); }
203 #else /* !tre_isblank */
tre_isblank_func(tre_cint_t c)204 int tre_isblank_func(tre_cint_t c) { return ((c == ' ') || (c == '\t')); }
205 #endif /* !tre_isblank */
206 
tre_iscntrl_func(tre_cint_t c)207 int tre_iscntrl_func(tre_cint_t c) { return tre_iscntrl(c); }
tre_isdigit_func(tre_cint_t c)208 int tre_isdigit_func(tre_cint_t c) { return tre_isdigit(c); }
tre_isgraph_func(tre_cint_t c)209 int tre_isgraph_func(tre_cint_t c) { return tre_isgraph(c); }
tre_islower_func(tre_cint_t c)210 int tre_islower_func(tre_cint_t c) { return tre_islower(c); }
tre_isprint_func(tre_cint_t c)211 int tre_isprint_func(tre_cint_t c) { return tre_isprint(c); }
tre_ispunct_func(tre_cint_t c)212 int tre_ispunct_func(tre_cint_t c) { return tre_ispunct(c); }
tre_isspace_func(tre_cint_t c)213 int tre_isspace_func(tre_cint_t c) { return tre_isspace(c); }
tre_isupper_func(tre_cint_t c)214 int tre_isupper_func(tre_cint_t c) { return tre_isupper(c); }
tre_isxdigit_func(tre_cint_t c)215 int tre_isxdigit_func(tre_cint_t c) { return tre_isxdigit(c); }
216 
217 struct {
218   const char *name;
219   int (*func)(tre_cint_t);
220 } tre_ctype_map[] = {
221   { "alnum", &tre_isalnum_func },
222   { "alpha", &tre_isalpha_func },
223 #ifdef tre_isascii
224   { "ascii", &tre_isascii_func },
225 #endif /* tre_isascii */
226 #ifdef tre_isblank
227   { "blank", &tre_isblank_func },
228 #endif /* tre_isblank */
229   { "cntrl", &tre_iscntrl_func },
230   { "digit", &tre_isdigit_func },
231   { "graph", &tre_isgraph_func },
232   { "lower", &tre_islower_func },
233   { "print", &tre_isprint_func },
234   { "punct", &tre_ispunct_func },
235   { "space", &tre_isspace_func },
236   { "upper", &tre_isupper_func },
237   { "xdigit", &tre_isxdigit_func },
238   { NULL, NULL}
239 };
240 
tre_ctype(const char * name)241 tre_ctype_t tre_ctype(const char *name)
242 {
243   int i;
244   for (i = 0; tre_ctype_map[i].name != NULL; i++)
245     {
246       if (strcmp(name, tre_ctype_map[i].name) == 0)
247           return tre_ctype_map[i].func;
248     }
249   return (tre_ctype_t)0;
250 }
251 #endif /* !TRE_USE_SYSTEM_WCTYPE */
252 
253 /* Maximum number of character classes that can occur in a negated bracket
254    expression.      */
255 #define MAX_NEG_CLASSES 64
256 
257 /* Maximum length of character class names. */
258 #define MAX_CLASS_NAME
259 
260 #define REST(re) (int)(ctx->re_end - (re)), (re)
261 
262 static reg_errcode_t
tre_parse_bracket_items(tre_parse_ctx_t * ctx,int negate,tre_ctype_t neg_classes[],int * num_neg_classes,tre_ast_node_t *** items,int * num_items,int * items_size)263 tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
264                               tre_ctype_t neg_classes[], int *num_neg_classes,
265                               tre_ast_node_t ***items, int *num_items,
266                               int *items_size)
267 {
268   const tre_char_t *re = ctx->re;
269   reg_errcode_t status = REG_OK;
270   tre_ctype_t class = (tre_ctype_t)0;
271   int i = *num_items;
272   int max_i = *items_size;
273   int skip;
274 
275   /* Build an array of the items in the bracket expression. */
276   while (status == REG_OK)
277     {
278       skip = 0;
279       if (re == ctx->re_end)
280           {
281             status = REG_EBRACK;
282           }
283       else if (*re == CHAR_RBRACKET && re > ctx->re)
284           {
285             DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", REST(re)));
286             re++;
287             break;
288           }
289       else
290           {
291             tre_cint_t min = 0, max = 0;
292 
293             class = (tre_ctype_t)0;
294             if (re + 2 < ctx->re_end
295                 && *(re + 1) == CHAR_MINUS && *(re + 2) != CHAR_RBRACKET)
296               {
297                 DPRINT(("tre_parse_bracket:  range: '%.*" STRF "'\n", REST(re)));
298                 min = *re;
299                 max = *(re + 2);
300                 re += 3;
301                 /* XXX - Should use collation order instead of encoding values
302                      in character ranges. */
303                 if (min > max)
304                     status = REG_ERANGE;
305               }
306             else if (re + 1 < ctx->re_end
307                        && *re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
308               status = REG_ECOLLATE;
309             else if (re + 1 < ctx->re_end
310                        && *re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
311               status = REG_ECOLLATE;
312             else if (re + 1 < ctx->re_end
313                        && *re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
314               {
315                 char tmp_str[64];
316                 const tre_char_t *endptr = re + 2;
317                 size_t len;
318                 DPRINT(("tre_parse_bracket:  class: '%.*" STRF "'\n", REST(re)));
319                 while (endptr < ctx->re_end && *endptr != CHAR_COLON)
320                     endptr++;
321                 if (endptr != ctx->re_end)
322                     {
323                       len = MIN(endptr - re - 2, 63);
324 #ifdef TRE_WCHAR
325                       {
326                         tre_char_t tmp_wcs[64];
327                         wcsncpy(tmp_wcs, re + 2, (size_t)len);
328                         tmp_wcs[len] = L'\0';
329 #if defined HAVE_WCSRTOMBS
330                         {
331                           mbstate_t state;
332                           const tre_char_t *src = tmp_wcs;
333                           memset(&state, '\0', sizeof(state));
334                           len = wcsrtombs(tmp_str, &src, sizeof(tmp_str), &state);
335                         }
336 #elif defined HAVE_WCSTOMBS
337                         len = wcstombs(tmp_str, tmp_wcs, 63);
338 #endif /* defined HAVE_WCSTOMBS */
339                       }
340 #else /* !TRE_WCHAR */
341                       /* LINTED */strncpy(tmp_str, (const char*)re + 2, len);
342 #endif /* !TRE_WCHAR */
343                       tmp_str[len] = '\0';
344                       DPRINT(("  class name: %s\n", tmp_str));
345                       class = tre_ctype(tmp_str);
346                       if (!class)
347                         status = REG_ECTYPE;
348                       /* Optimize character classes for 8 bit character sets. */
349                       if (status == REG_OK && ctx->cur_max == 1)
350                         {
351                           status = tre_expand_ctype(ctx->mem, class, items,
352                                                             &i, &max_i, ctx->cflags);
353                           class = (tre_ctype_t)0;
354                           skip = 1;
355                         }
356                       re = endptr + 2;
357                     }
358                 else
359                     status = REG_ECTYPE;
360                 min = 0;
361                 max = TRE_CHAR_MAX;
362               }
363             else
364               {
365                 DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n", REST(re)));
366                 if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
367                       && ctx->re != re)
368                     /* Two ranges are not allowed to share and endpoint. */
369                     status = REG_ERANGE;
370                 min = max = *re++;
371               }
372 
373             if (status != REG_OK)
374               break;
375 
376             if (class && negate)
377               if (*num_neg_classes >= MAX_NEG_CLASSES)
378                 status = REG_ESPACE;
379               else
380                 neg_classes[(*num_neg_classes)++] = class;
381             else if (!skip)
382               {
383                 status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
384                 if (status != REG_OK)
385                     break;
386                 ((tre_literal_t*)((*items)[i-1])->obj)->u.class = class;
387               }
388 
389             /* Add opposite-case counterpoints if REG_ICASE is present.
390                This is broken if there are more than two "same" characters. */
391             if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip)
392               {
393                 tre_cint_t cmin, ccurr;
394 
395                 DPRINT(("adding opposite-case counterpoints\n"));
396                 while (min <= max)
397                     {
398                       if (tre_islower(min))
399                         {
400                           cmin = ccurr = tre_toupper(min++);
401                           while (tre_islower(min) && tre_toupper(min) == ccurr + 1
402                                    && min <= max)
403                               ccurr = tre_toupper(min++);
404                           status = tre_new_item(ctx->mem, cmin, ccurr,
405                                                       &i, &max_i, items);
406                         }
407                       else if (tre_isupper(min))
408                         {
409                           cmin = ccurr = tre_tolower(min++);
410                           while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
411                                    && min <= max)
412                               ccurr = tre_tolower(min++);
413                           status = tre_new_item(ctx->mem, cmin, ccurr,
414                                                       &i, &max_i, items);
415                         }
416                       else min++;
417                       if (status != REG_OK)
418                         break;
419                     }
420                 if (status != REG_OK)
421                     break;
422               }
423           }
424     }
425   *num_items = i;
426   *items_size = max_i;
427   ctx->re = re;
428   return status;
429 }
430 
431 static reg_errcode_t
tre_parse_bracket(tre_parse_ctx_t * ctx,tre_ast_node_t ** result)432 tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
433 {
434   tre_ast_node_t *node = NULL;
435   int negate = 0;
436   reg_errcode_t status = REG_OK;
437   tre_ast_node_t **items, *u, *n;
438   int i = 0, j, max_i = 32, curr_max, curr_min;
439   tre_ctype_t neg_classes[MAX_NEG_CLASSES];
440   int num_neg_classes = 0;
441 
442   /* Start off with an array of `max_i' elements. */
443   items = xmalloc(sizeof(*items) * max_i);
444   if (items == NULL)
445     return REG_ESPACE;
446 
447   if (*ctx->re == CHAR_CARET)
448     {
449       DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n", REST(ctx->re)));
450       negate = 1;
451       ctx->re++;
452     }
453 
454   status = tre_parse_bracket_items(ctx, negate, neg_classes, &num_neg_classes,
455                                            &items, &i, &max_i);
456 
457   if (status != REG_OK)
458     goto parse_bracket_done;
459 
460   /* Sort the array if we need to negate it. */
461   if (negate)
462     qsort(items, (unsigned)i, sizeof(*items), tre_compare_items);
463 
464   curr_max = curr_min = 0;
465   /* Build a union of the items in the array, negated if necessary. */
466   for (j = 0; j < i && status == REG_OK; j++)
467     {
468       int min, max;
469       tre_literal_t *l = items[j]->obj;
470       min = l->code_min;
471       max = l->code_max;
472 
473       DPRINT(("item: %d - %d, class %ld, curr_max = %d\n",
474                 (int)l->code_min, (int)l->code_max, (long)l->u.class, curr_max));
475 
476       if (negate)
477           {
478             if (min < curr_max)
479               {
480                 /* Overlap. */
481                 curr_max = MAX(max + 1, curr_max);
482                 DPRINT(("overlap, curr_max = %d\n", curr_max));
483                 l = NULL;
484               }
485             else
486               {
487                 /* No overlap. */
488                 curr_max = min - 1;
489                 if (curr_max >= curr_min)
490                     {
491                       DPRINT(("no overlap\n"));
492                       l->code_min = curr_min;
493                       l->code_max = curr_max;
494                     }
495                 else
496                     {
497                       DPRINT(("no overlap, zero room\n"));
498                       l = NULL;
499                     }
500                 curr_min = curr_max = max + 1;
501               }
502           }
503 
504       if (l != NULL)
505           {
506             int k;
507             DPRINT(("creating %d - %d\n", (int)l->code_min, (int)l->code_max));
508             l->position = ctx->position;
509             if (num_neg_classes > 0)
510               {
511                 l->neg_classes = tre_mem_alloc(ctx->mem,
512                                                        (sizeof(*l->neg_classes)
513                                                         * (num_neg_classes + 1)));
514                 if (l->neg_classes == NULL)
515                     {
516                       status = REG_ESPACE;
517                       break;
518                     }
519                 for (k = 0; k < num_neg_classes; k++)
520                     l->neg_classes[k] = neg_classes[k];
521                 l->neg_classes[k] = (tre_ctype_t)0;
522               }
523             else
524               l->neg_classes = NULL;
525             if (node == NULL)
526               node = items[j];
527             else
528               {
529                 u = tre_ast_new_union(ctx->mem, node, items[j]);
530                 if (u == NULL)
531                     status = REG_ESPACE;
532                 node = u;
533               }
534           }
535     }
536 
537   if (status != REG_OK)
538     goto parse_bracket_done;
539 
540   if (negate)
541     {
542       int k;
543       DPRINT(("final: creating %d - %d\n", curr_min, (int)TRE_CHAR_MAX));
544       n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
545       if (n == NULL)
546           status = REG_ESPACE;
547       else
548           {
549             tre_literal_t *l = n->obj;
550             if (num_neg_classes > 0)
551               {
552                 l->neg_classes = tre_mem_alloc(ctx->mem,
553                                                        (sizeof(*l->neg_classes)
554                                                         * (num_neg_classes + 1)));
555                 if (l->neg_classes == NULL)
556                     {
557                       status = REG_ESPACE;
558                       goto parse_bracket_done;
559                     }
560                 for (k = 0; k < num_neg_classes; k++)
561                     l->neg_classes[k] = neg_classes[k];
562                 l->neg_classes[k] = (tre_ctype_t)0;
563               }
564             else
565               l->neg_classes = NULL;
566             if (node == NULL)
567               node = n;
568             else
569               {
570                 u = tre_ast_new_union(ctx->mem, node, n);
571                 if (u == NULL)
572                     status = REG_ESPACE;
573                 node = u;
574               }
575           }
576     }
577 
578   if (status != REG_OK)
579     goto parse_bracket_done;
580 
581 #ifdef TRE_DEBUG
582   tre_ast_print(node);
583 #endif /* TRE_DEBUG */
584 
585  parse_bracket_done:
586   xfree(items);
587   ctx->position++;
588   *result = node;
589   return status;
590 }
591 
592 
593 /* Parses a positive decimal integer.  Returns -1 if the string does not
594    contain a valid number. */
595 static int
tre_parse_int(const tre_char_t ** regex,const tre_char_t * regex_end)596 tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end)
597 {
598   int num = -1;
599   const tre_char_t *r = *regex;
600   while (r < regex_end && *r >= L'0' && *r <= L'9')
601     {
602       if (num < 0)
603           num = 0;
604       num = num * 10 + *r - L'0';
605       r++;
606     }
607   *regex = r;
608   return num;
609 }
610 
611 
612 static reg_errcode_t
tre_parse_bound(tre_parse_ctx_t * ctx,tre_ast_node_t ** result)613 tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
614 {
615   int min, max, i;
616   int cost_ins, cost_del, cost_subst, cost_max;
617   int limit_ins, limit_del, limit_subst, limit_err;
618   const tre_char_t *r = ctx->re;
619   const tre_char_t *start;
620   int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
621   int approx = 0;
622   int costs_set = 0;
623   int counts_set = 0;
624 
625   cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET;
626   limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET;
627 
628   /* Parse number (minimum repetition count). */
629   min = -1;
630   if (r < ctx->re_end && *r >= L'0' && *r <= L'9') {
631     DPRINT(("tre_parse:         min count: '%.*" STRF "'\n", REST(r)));
632     min = tre_parse_int(&r, ctx->re_end);
633   }
634 
635   /* Parse comma and second number (maximum repetition count). */
636   max = min;
637   if (r < ctx->re_end && *r == CHAR_COMMA)
638     {
639       r++;
640       DPRINT(("tre_parse:   max count: '%.*" STRF "'\n", REST(r)));
641       max = tre_parse_int(&r, ctx->re_end);
642     }
643 
644   /* Check that the repeat counts are sane. */
645   if ((max >= 0 && min > max) || max > RE_DUP_MAX)
646     return REG_BADBR;
647 
648 
649   /*
650    '{'
651      optionally followed immediately by a number == minimum repcount
652      optionally followed by , then a number == maximum repcount
653       + then a number == maximum insertion count
654       - then a number == maximum deletion count
655       # then a number == maximum substitution count
656       ~ then a number == maximum number of errors
657       Any of +, -, # or ~ without followed by a number means that
658       the maximum count/number of errors is infinite.
659 
660       An equation of the form
661           Xi + Yd + Zs < C
662       can be specified to set costs and the cost limit to a value
663       different from the default value:
664           - X is the cost of an insertion
665           - Y is the cost of a deletion
666           - Z is the cost of a substitution
667           - C is the maximum cost
668 
669       If no count limit or cost is set for an operation, the operation
670       is not allowed at all.
671   */
672 
673 
674   do {
675     int done;
676     start = r;
677 
678     /* Parse count limit settings */
679     done = 0;
680     if (!counts_set)
681       while (r + 1 < ctx->re_end && !done)
682           {
683             switch (*r)
684               {
685               case CHAR_PLUS:  /* Insert limit */
686                 DPRINT(("tre_parse:   ins limit: '%.*" STRF "'\n", REST(r)));
687                 r++;
688                 limit_ins = tre_parse_int(&r, ctx->re_end);
689                 if (limit_ins < 0)
690                     limit_ins = INT_MAX;
691                 counts_set = 1;
692                 break;
693               case CHAR_MINUS: /* Delete limit */
694                 DPRINT(("tre_parse:   del limit: '%.*" STRF "'\n", REST(r)));
695                 r++;
696                 limit_del = tre_parse_int(&r, ctx->re_end);
697                 if (limit_del < 0)
698                     limit_del = INT_MAX;
699                 counts_set = 1;
700                 break;
701               case CHAR_HASH:  /* Substitute limit */
702                 DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", REST(r)));
703                 r++;
704                 limit_subst = tre_parse_int(&r, ctx->re_end);
705                 if (limit_subst < 0)
706                     limit_subst = INT_MAX;
707                 counts_set = 1;
708                 break;
709               case CHAR_TILDE: /* Maximum number of changes */
710                 DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", REST(r)));
711                 r++;
712                 limit_err = tre_parse_int(&r, ctx->re_end);
713                 if (limit_err < 0)
714                     limit_err = INT_MAX;
715                 approx = 1;
716                 break;
717               case CHAR_COMMA:
718                 r++;
719                 break;
720               case L' ':
721                 r++;
722                 break;
723               case L'}':
724                 done = 1;
725                 break;
726               default:
727                 done = 1;
728                 break;
729               }
730           }
731 
732     /* Parse cost restriction equation. */
733     done = 0;
734     if (!costs_set)
735       while (r + 1 < ctx->re_end && !done)
736           {
737             switch (*r)
738               {
739               case CHAR_PLUS:
740               case L' ':
741                 r++;
742                 break;
743               case L'<':
744                 DPRINT(("tre_parse:    max cost: '%.*" STRF "'\n", REST(r)));
745                 r++;
746                 while (*r == L' ')
747                     r++;
748                 cost_max = tre_parse_int(&r, ctx->re_end);
749                 if (cost_max < 0)
750                     cost_max = INT_MAX;
751                 else
752                     cost_max--;
753                 approx = 1;
754                 break;
755               case CHAR_COMMA:
756                 r++;
757                 done = 1;
758                 break;
759               default:
760                 if (*r >= L'0' && *r <= L'9')
761                     {
762 #ifdef TRE_DEBUG
763                       const tre_char_t *sr = r;
764 #endif /* TRE_DEBUG */
765                       int cost = tre_parse_int(&r, ctx->re_end);
766                       /* XXX - make sure r is not past end. */
767                       switch (*r)
768                         {
769                         case L'i':      /* Insert cost */
770                           DPRINT(("tre_parse:    ins cost: '%.*" STRF "'\n",
771                                     REST(sr)));
772                           r++;
773                           cost_ins = cost;
774                           costs_set = 1;
775                           break;
776                         case L'd':      /* Delete cost */
777                           DPRINT(("tre_parse:    del cost: '%.*" STRF "'\n",
778                                     REST(sr)));
779                           r++;
780                           cost_del = cost;
781                           costs_set = 1;
782                           break;
783                         case L's':      /* Substitute cost */
784                           DPRINT(("tre_parse:  subst cost: '%.*" STRF "'\n",
785                                     REST(sr)));
786                           r++;
787                           cost_subst = cost;
788                           costs_set = 1;
789                           break;
790                         default:
791                           return REG_BADBR;
792                         }
793                     }
794                 else
795                     {
796                       done = 1;
797                       break;
798                     }
799               }
800           }
801   } while (start != r);
802 
803   /* Missing }. */
804   if (r >= ctx->re_end)
805     return REG_EBRACE;
806 
807   /* Empty contents of {}. */
808   if (r == ctx->re)
809     return REG_BADBR;
810 
811   /* Parse the ending '}' or '\}'.*/
812   if (ctx->cflags & REG_EXTENDED)
813     {
814       if (r >= ctx->re_end || *r != CHAR_RBRACE)
815           return REG_BADBR;
816       r++;
817     }
818   else
819     {
820       if (r + 1 >= ctx->re_end
821             || *r != CHAR_BACKSLASH
822             || *(r + 1) != CHAR_RBRACE)
823           return REG_BADBR;
824       r += 2;
825     }
826 
827 
828   /* Parse trailing '?' marking minimal repetition. */
829   if (r < ctx->re_end)
830     {
831       if (*r == CHAR_QUESTIONMARK)
832           {
833             minimal = !(ctx->cflags & REG_UNGREEDY);
834             r++;
835           }
836       else if (*r == CHAR_STAR || *r == CHAR_PLUS)
837           {
838             /* These are reserved for future extensions. */
839             return REG_BADRPT;
840           }
841     }
842 
843   /* Create the AST node(s). */
844   if (min == 0 && max == 0)
845     {
846       *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
847       if (*result == NULL)
848           return REG_ESPACE;
849     }
850   else
851     {
852       if (min < 0 && max < 0)
853           /* Only approximate parameters set, no repetitions. */
854           min = max = 1;
855 
856       *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
857       if (!*result)
858           return REG_ESPACE;
859 
860       /* If approximate matching parameters are set, add them to the
861            iteration node. */
862       if (approx || costs_set || counts_set)
863           {
864             int *params;
865             tre_iteration_t *iter = (*result)->obj;
866 
867             if (costs_set || counts_set)
868               {
869                 if (limit_ins == TRE_PARAM_UNSET)
870                     {
871                       if (cost_ins == TRE_PARAM_UNSET)
872                         limit_ins = 0;
873                       else
874                         limit_ins = INT_MAX;
875                     }
876 
877                 if (limit_del == TRE_PARAM_UNSET)
878                     {
879                       if (cost_del == TRE_PARAM_UNSET)
880                         limit_del = 0;
881                       else
882                         limit_del = INT_MAX;
883                     }
884 
885                 if (limit_subst == TRE_PARAM_UNSET)
886                     {
887                       if (cost_subst == TRE_PARAM_UNSET)
888                         limit_subst = 0;
889                       else
890                         limit_subst = INT_MAX;
891                     }
892               }
893 
894             if (cost_max == TRE_PARAM_UNSET)
895               cost_max = INT_MAX;
896             if (limit_err == TRE_PARAM_UNSET)
897               limit_err = INT_MAX;
898 
899             ctx->have_approx = 1;
900             params = tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST);
901             if (!params)
902               return REG_ESPACE;
903             for (i = 0; i < TRE_PARAM_LAST; i++)
904               params[i] = TRE_PARAM_UNSET;
905             params[TRE_PARAM_COST_INS] = cost_ins;
906             params[TRE_PARAM_COST_DEL] = cost_del;
907             params[TRE_PARAM_COST_SUBST] = cost_subst;
908             params[TRE_PARAM_COST_MAX] = cost_max;
909             params[TRE_PARAM_MAX_INS] = limit_ins;
910             params[TRE_PARAM_MAX_DEL] = limit_del;
911             params[TRE_PARAM_MAX_SUBST] = limit_subst;
912             params[TRE_PARAM_MAX_ERR] = limit_err;
913             iter->params = params;
914           }
915     }
916 
917   DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], "
918             "limits [%d,%d,%d, total %d]\n",
919             min, max, cost_ins, cost_del, cost_subst, cost_max,
920             limit_ins, limit_del, limit_subst, limit_err));
921 
922 
923   ctx->re = r;
924   return REG_OK;
925 }
926 
927 typedef enum {
928   PARSE_RE = 0,
929   PARSE_ATOM,
930   PARSE_MARK_FOR_SUBMATCH,
931   PARSE_BRANCH,
932   PARSE_PIECE,
933   PARSE_CATENATION,
934   PARSE_POST_CATENATION,
935   PARSE_UNION,
936   PARSE_POST_UNION,
937   PARSE_POSTFIX,
938   PARSE_RESTORE_CFLAGS
939 } tre_parse_re_stack_symbol_t;
940 
941 
942 reg_errcode_t
tre_parse(tre_parse_ctx_t * ctx)943 tre_parse(tre_parse_ctx_t *ctx)
944 {
945   tre_ast_node_t *result = NULL;
946   tre_parse_re_stack_symbol_t symbol;
947   reg_errcode_t status = REG_OK;
948   tre_stack_t *stack = ctx->stack;
949   int bottom = tre_stack_num_objects(stack);
950   int depth = 0;
951   int temporary_cflags = 0;
952 
953   DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d\n",
954             ctx->len, ctx->re, ctx->len));
955 
956   if (!ctx->nofirstsub)
957     {
958       STACK_PUSH(stack, long, ctx->submatch_id);
959       STACK_PUSH(stack, long, PARSE_MARK_FOR_SUBMATCH);
960       ctx->submatch_id++;
961     }
962   STACK_PUSH(stack, long, PARSE_RE);
963   ctx->re_start = ctx->re;
964   ctx->re_end = ctx->re + ctx->len;
965 
966 
967   /* The following is basically just a recursive descent parser.  I use
968      an explicit stack instead of recursive functions mostly because of
969      two reasons: compatibility with systems which have an overflowable
970      call stack, and efficiency (both in lines of code and speed).  */
971   while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
972     {
973       if (status != REG_OK)
974           break;
975       symbol = tre_stack_pop_int(stack);
976       switch (symbol)
977           {
978           case PARSE_RE:
979             /* Parse a full regexp.  A regexp is one or more branches,
980                separated by the union operator `|'. */
981 #ifdef REG_LITERAL
982             if (!(ctx->cflags & REG_LITERAL)
983                 && ctx->cflags & REG_EXTENDED)
984 #endif /* REG_LITERAL */
985               STACK_PUSHX(stack, long, PARSE_UNION);
986             STACK_PUSHX(stack, long, PARSE_BRANCH);
987             break;
988 
989           case PARSE_BRANCH:
990             /* Parse a branch.  A branch is one or more pieces, concatenated.
991                A piece is an atom possibly followed by a postfix operator. */
992             STACK_PUSHX(stack, long, PARSE_CATENATION);
993             STACK_PUSHX(stack, long, PARSE_PIECE);
994             break;
995 
996           case PARSE_PIECE:
997             /* Parse a piece.  A piece is an atom possibly followed by one
998                or more postfix operators. */
999 #ifdef REG_LITERAL
1000             if (!(ctx->cflags & REG_LITERAL))
1001 #endif /* REG_LITERAL */
1002               STACK_PUSHX(stack, long, PARSE_POSTFIX);
1003             STACK_PUSHX(stack, long, PARSE_ATOM);
1004             break;
1005 
1006           case PARSE_CATENATION:
1007             /* If the expression has not ended, parse another piece. */
1008             {
1009               tre_char_t c;
1010               if (ctx->re >= ctx->re_end)
1011                 break;
1012               c = *ctx->re;
1013 #ifdef REG_LITERAL
1014               if (!(ctx->cflags & REG_LITERAL))
1015                 {
1016 #endif /* REG_LITERAL */
1017                     if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE)
1018                       break;
1019                     if ((ctx->cflags & REG_EXTENDED
1020                          && c == CHAR_RPAREN && depth > 0)
1021                         || (!(ctx->cflags & REG_EXTENDED)
1022                               && (c == CHAR_BACKSLASH
1023                                   && *(ctx->re + 1) == CHAR_RPAREN)))
1024                       {
1025                         if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1026                           status = REG_EPAREN;
1027                         DPRINT(("tre_parse:         group end: '%.*" STRF "'\n",
1028                                   REST(ctx->re)));
1029                         depth--;
1030                         if (!(ctx->cflags & REG_EXTENDED))
1031                           ctx->re += 2;
1032                         break;
1033                       }
1034 #ifdef REG_LITERAL
1035                 }
1036 #endif /* REG_LITERAL */
1037 
1038 #ifdef REG_RIGHT_ASSOC
1039               if (ctx->cflags & REG_RIGHT_ASSOC)
1040                 {
1041                     /* Right associative concatenation. */
1042                     STACK_PUSHX(stack, voidptr, result);
1043                     STACK_PUSHX(stack, long, PARSE_POST_CATENATION);
1044                     STACK_PUSHX(stack, long, PARSE_CATENATION);
1045                     STACK_PUSHX(stack, long, PARSE_PIECE);
1046                 }
1047               else
1048 #endif /* REG_RIGHT_ASSOC */
1049                 {
1050                     /* Default case, left associative concatenation. */
1051                     STACK_PUSHX(stack, long, PARSE_CATENATION);
1052                     STACK_PUSHX(stack, voidptr, result);
1053                     STACK_PUSHX(stack, long, PARSE_POST_CATENATION);
1054                     STACK_PUSHX(stack, long, PARSE_PIECE);
1055                 }
1056               break;
1057             }
1058 
1059           case PARSE_POST_CATENATION:
1060             {
1061               tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1062               tre_ast_node_t *tmp_node;
1063               tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1064               if (!tmp_node)
1065                 return REG_ESPACE;
1066               result = tmp_node;
1067               break;
1068             }
1069 
1070           case PARSE_UNION:
1071             if (ctx->re >= ctx->re_end)
1072               break;
1073 #ifdef REG_LITERAL
1074             if (ctx->cflags & REG_LITERAL)
1075               break;
1076 #endif /* REG_LITERAL */
1077             switch (*ctx->re)
1078               {
1079               case CHAR_PIPE:
1080                 DPRINT(("tre_parse:     union: '%.*" STRF "'\n",
1081                           REST(ctx->re)));
1082                 STACK_PUSHX(stack, long, PARSE_UNION);
1083                 STACK_PUSHX(stack, voidptr, result);
1084                 STACK_PUSHX(stack, long, PARSE_POST_UNION);
1085                 STACK_PUSHX(stack, long, PARSE_BRANCH);
1086                 ctx->re++;
1087                 break;
1088 
1089               case CHAR_RPAREN:
1090                 ctx->re++;
1091                 break;
1092 
1093               default:
1094                 break;
1095               }
1096             break;
1097 
1098           case PARSE_POST_UNION:
1099             {
1100               tre_ast_node_t *tmp_node;
1101               tre_ast_node_t *tree = tre_stack_pop_voidptr(stack);
1102               tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1103               if (!tmp_node)
1104                 return REG_ESPACE;
1105               result = tmp_node;
1106               break;
1107             }
1108 
1109           case PARSE_POSTFIX:
1110             /* Parse postfix operators. */
1111             if (ctx->re >= ctx->re_end)
1112               break;
1113 #ifdef REG_LITERAL
1114             if (ctx->cflags & REG_LITERAL)
1115               break;
1116 #endif /* REG_LITERAL */
1117             switch (*ctx->re)
1118               {
1119               case CHAR_PLUS:
1120               case CHAR_QUESTIONMARK:
1121                 if (!(ctx->cflags & REG_EXTENDED))
1122                     break;
1123                     /*FALLTHROUGH*/
1124               case CHAR_STAR:
1125                 {
1126                     tre_ast_node_t *tmp_node;
1127                     int minimal = (ctx->cflags & REG_UNGREEDY) ? 1 : 0;
1128                     int rep_min = 0;
1129                     int rep_max = -1;
1130 #ifdef TRE_DEBUG
1131                     const tre_char_t *tmp_re;
1132 #endif
1133 
1134                     if (*ctx->re == CHAR_PLUS)
1135                       rep_min = 1;
1136                     if (*ctx->re == CHAR_QUESTIONMARK)
1137                       rep_max = 1;
1138 #ifdef TRE_DEBUG
1139                     tmp_re = ctx->re;
1140 #endif
1141 
1142                     if (ctx->re + 1 < ctx->re_end)
1143                       {
1144                         if (*(ctx->re + 1) == CHAR_QUESTIONMARK)
1145                           {
1146                               minimal = !(ctx->cflags & REG_UNGREEDY);
1147                               ctx->re++;
1148                           }
1149                         else if (*(ctx->re + 1) == CHAR_STAR
1150                                    || *(ctx->re + 1) == CHAR_PLUS)
1151                           {
1152                               /* These are reserved for future extensions. */
1153                               return REG_BADRPT;
1154                           }
1155                       }
1156 
1157                     DPRINT(("tre_parse: %s star: '%.*" STRF "'\n",
1158                               minimal ? "  minimal" : "greedy", REST(tmp_re)));
1159                     ctx->re++;
1160                     tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1161                                                       minimal);
1162                     if (tmp_node == NULL)
1163                       return REG_ESPACE;
1164                     result = tmp_node;
1165                     STACK_PUSHX(stack, long, PARSE_POSTFIX);
1166                 }
1167                 break;
1168 
1169               case CHAR_BACKSLASH:
1170                 /* "\{" is special without REG_EXTENDED */
1171                 if (!(ctx->cflags & REG_EXTENDED)
1172                       && ctx->re + 1 < ctx->re_end
1173                       && *(ctx->re + 1) == CHAR_LBRACE)
1174                     {
1175                       ctx->re++;
1176                       goto parse_brace;
1177                     }
1178                 else
1179                     break;
1180 
1181               case CHAR_LBRACE:
1182                 /* "{" is literal without REG_EXTENDED */
1183                 if (!(ctx->cflags & REG_EXTENDED))
1184                     break;
1185 
1186               parse_brace:
1187                 DPRINT(("tre_parse:     bound: '%.*" STRF "'\n",
1188                           REST(ctx->re)));
1189                 ctx->re++;
1190 
1191                 status = tre_parse_bound(ctx, &result);
1192                 if (status != REG_OK)
1193                     return status;
1194                 STACK_PUSHX(stack, long, PARSE_POSTFIX);
1195                 break;
1196               }
1197             break;
1198 
1199           case PARSE_ATOM:
1200             /* Parse an atom.  An atom is a regular expression enclosed in `()',
1201                an empty set of `()', a bracket expression, `.', `^', `$',
1202                a `\' followed by a character, or a single character. */
1203 
1204             /* End of regexp? (empty string). */
1205             if (ctx->re >= ctx->re_end)
1206               goto parse_literal;
1207 
1208 #ifdef REG_LITERAL
1209             if (ctx->cflags & REG_LITERAL)
1210               goto parse_literal;
1211 #endif /* REG_LITERAL */
1212 
1213             switch (*ctx->re)
1214               {
1215               case CHAR_LPAREN:  /* parenthesized subexpression */
1216 
1217                 /* Handle "(?...)" extensions.  They work in a way similar
1218                      to Perls corresponding extensions. */
1219                 if (ctx->cflags & REG_EXTENDED
1220                       && *(ctx->re + 1) == CHAR_QUESTIONMARK)
1221                     {
1222                       int new_cflags = ctx->cflags;
1223                       int bit = 1;
1224                       DPRINT(("tre_parse:         extension: '%.*" STRF "\n",
1225                                 REST(ctx->re)));
1226                       ctx->re += 2;
1227                       while (/*CONSTCOND*/(void)1,1)
1228                         {
1229                           if (*ctx->re == L'i')
1230                               {
1231                                 DPRINT(("tre_parse:             icase: '%.*" STRF "\n",
1232                                           REST(ctx->re)));
1233                                 if (bit)
1234                                   new_cflags |= REG_ICASE;
1235                                 else
1236                                   new_cflags &= ~REG_ICASE;
1237                                 ctx->re++;
1238                               }
1239                           else if (*ctx->re == L'n')
1240                               {
1241                                 DPRINT(("tre_parse:           newline: '%.*" STRF "\n",
1242                                           REST(ctx->re)));
1243                                 if (bit)
1244                                   new_cflags |= REG_NEWLINE;
1245                                 else
1246                                   new_cflags &= ~REG_NEWLINE;
1247                                 ctx->re++;
1248                               }
1249 #ifdef REG_RIGHT_ASSOC
1250                           else if (*ctx->re == L'r')
1251                               {
1252                                 DPRINT(("tre_parse: right assoc: '%.*" STRF "\n",
1253                                           REST(ctx->re)));
1254                                 if (bit)
1255                                   new_cflags |= REG_RIGHT_ASSOC;
1256                                 else
1257                                   new_cflags &= ~REG_RIGHT_ASSOC;
1258                                 ctx->re++;
1259                               }
1260 #endif /* REG_RIGHT_ASSOC */
1261 #ifdef REG_UNGREEDY
1262                           else if (*ctx->re == L'U')
1263                               {
1264                                 DPRINT(("tre_parse:    ungreedy: '%.*" STRF "\n",
1265                                           REST(ctx->re)));
1266                                 if (bit)
1267                                   new_cflags |= REG_UNGREEDY;
1268                                 else
1269                                   new_cflags &= ~REG_UNGREEDY;
1270                                 ctx->re++;
1271                               }
1272 #endif /* REG_UNGREEDY */
1273                           else if (*ctx->re == CHAR_MINUS)
1274                               {
1275                                 DPRINT(("tre_parse:          turn off: '%.*" STRF "\n",
1276                                           REST(ctx->re)));
1277                                 ctx->re++;
1278                                 bit = 0;
1279                               }
1280                           else if (*ctx->re == CHAR_COLON)
1281                               {
1282                                 DPRINT(("tre_parse:          no group: '%.*" STRF "\n",
1283                                           REST(ctx->re)));
1284                                 ctx->re++;
1285                                 depth++;
1286                                 break;
1287                               }
1288                           else if (*ctx->re == CHAR_HASH)
1289                               {
1290                                 DPRINT(("tre_parse:    comment: '%.*" STRF "\n",
1291                                           REST(ctx->re)));
1292                                 /* A comment can contain any character except a
1293                                    right parenthesis */
1294                                 while (*ctx->re != CHAR_RPAREN
1295                                          && ctx->re < ctx->re_end)
1296                                   ctx->re++;
1297                                 if (*ctx->re == CHAR_RPAREN && ctx->re < ctx->re_end)
1298                                   {
1299                                     ctx->re++;
1300                                     break;
1301                                   }
1302                                 else
1303                                   return REG_BADPAT;
1304                               }
1305                           else if (*ctx->re == CHAR_RPAREN)
1306                               {
1307                                 ctx->re++;
1308                                 break;
1309                               }
1310                           else
1311                               return REG_BADPAT;
1312                         }
1313 
1314                       /* Turn on the cflags changes for the rest of the
1315                          enclosing group. */
1316                       STACK_PUSHX(stack, long, ctx->cflags);
1317                       STACK_PUSHX(stack, long, PARSE_RESTORE_CFLAGS);
1318                       STACK_PUSHX(stack, long, PARSE_RE);
1319                       ctx->cflags = new_cflags;
1320                       break;
1321                     }
1322 
1323                 if (ctx->cflags & REG_EXTENDED
1324                       || (ctx->re > ctx->re_start
1325                           && *(ctx->re - 1) == CHAR_BACKSLASH))
1326                     {
1327                       depth++;
1328                       if (ctx->re + 2 < ctx->re_end
1329                           && *(ctx->re + 1) == CHAR_QUESTIONMARK
1330                           && *(ctx->re + 2) == CHAR_COLON)
1331                         {
1332                           DPRINT(("tre_parse: group begin: '%.*" STRF
1333                                     "', no submatch\n", REST(ctx->re)));
1334                           /* Don't mark for submatching. */
1335                           ctx->re += 3;
1336                           STACK_PUSHX(stack, long, PARSE_RE);
1337                         }
1338                       else
1339                         {
1340                           DPRINT(("tre_parse: group begin: '%.*" STRF
1341                                     "', submatch %d\n", REST(ctx->re),
1342                                     ctx->submatch_id));
1343                           ctx->re++;
1344                           /* First parse a whole RE, then mark the resulting tree
1345                                for submatching. */
1346                           STACK_PUSHX(stack, long, ctx->submatch_id);
1347                           STACK_PUSHX(stack, long, PARSE_MARK_FOR_SUBMATCH);
1348                           STACK_PUSHX(stack, long, PARSE_RE);
1349                           ctx->submatch_id++;
1350                         }
1351                     }
1352                 else
1353                     goto parse_literal;
1354                 break;
1355 
1356               case CHAR_RPAREN:  /* end of current subexpression */
1357                 if ((ctx->cflags & REG_EXTENDED && depth > 0)
1358                       || (ctx->re > ctx->re_start
1359                           && *(ctx->re - 1) == CHAR_BACKSLASH))
1360                     {
1361                       DPRINT(("tre_parse:             empty: '%.*" STRF "'\n",
1362                                 REST(ctx->re)));
1363                       /* We were expecting an atom, but instead the current
1364                          subexpression was closed.          POSIX leaves the meaning of
1365                          this to be implementation-defined.  We interpret this as
1366                          an empty expression (which matches an empty string).  */
1367                       result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1368                       if (result == NULL)
1369                         return REG_ESPACE;
1370                       if (!(ctx->cflags & REG_EXTENDED))
1371                         ctx->re--;
1372                     }
1373                 else
1374                     goto parse_literal;
1375                 break;
1376 
1377               case CHAR_LBRACKET: /* bracket expression */
1378                 DPRINT(("tre_parse:     bracket: '%.*" STRF "'\n",
1379                           REST(ctx->re)));
1380                 ctx->re++;
1381                 status = tre_parse_bracket(ctx, &result);
1382                 if (status != REG_OK)
1383                     return status;
1384                 break;
1385 
1386               case CHAR_BACKSLASH:
1387                 /* If this is "\(" or "\)" chew off the backslash and
1388                      try again. */
1389                 if (!(ctx->cflags & REG_EXTENDED)
1390                       && ctx->re + 1 < ctx->re_end
1391                       && (*(ctx->re + 1) == CHAR_LPAREN
1392                           || *(ctx->re + 1) == CHAR_RPAREN))
1393                     {
1394                       ctx->re++;
1395                       STACK_PUSHX(stack, long, PARSE_ATOM);
1396                       break;
1397                     }
1398 
1399                 /* If a macro is used, parse the expanded macro recursively. */
1400                 {
1401                     tre_char_t buf[64];
1402                     tre_expand_macro(ctx->re + 1, ctx->re_end,
1403                                          buf, elementsof(buf));
1404                     if (buf[0] != 0)
1405                       {
1406                         tre_parse_ctx_t subctx;
1407                         memcpy(&subctx, ctx, sizeof(subctx));
1408                         subctx.re = buf;
1409                         subctx.len = tre_strlen(buf);
1410                         subctx.nofirstsub = 1;
1411                         status = tre_parse(&subctx);
1412                         if (status != REG_OK)
1413                           return status;
1414                         ctx->re += 2;
1415                         ctx->position = subctx.position;
1416                         result = subctx.result;
1417                         break;
1418                       }
1419                 }
1420 
1421                 if (ctx->re + 1 >= ctx->re_end)
1422                     /* Trailing backslash. */
1423                     return REG_EESCAPE;
1424 
1425 #ifdef REG_LITERAL
1426                 if (*(ctx->re + 1) == L'Q')
1427                     {
1428                       DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n",
1429                                 REST(ctx->re)));
1430                       ctx->cflags |= REG_LITERAL;
1431                       temporary_cflags |= REG_LITERAL;
1432                       ctx->re += 2;
1433                       STACK_PUSHX(stack, long, PARSE_ATOM);
1434                       break;
1435                     }
1436 #endif /* REG_LITERAL */
1437 
1438                 DPRINT(("tre_parse:  bleep: '%.*" STRF "'\n", REST(ctx->re)));
1439                 ctx->re++;
1440                 switch (*ctx->re)
1441                     {
1442                     case L'b':
1443                       result = tre_ast_new_literal(ctx->mem, ASSERTION,
1444                                                          ASSERT_AT_WB, -1);
1445                       ctx->re++;
1446                       break;
1447                     case L'B':
1448                       result = tre_ast_new_literal(ctx->mem, ASSERTION,
1449                                                          ASSERT_AT_WB_NEG, -1);
1450                       ctx->re++;
1451                       break;
1452                     case L'<':
1453                       result = tre_ast_new_literal(ctx->mem, ASSERTION,
1454                                                          ASSERT_AT_BOW, -1);
1455                       ctx->re++;
1456                       break;
1457                     case L'>':
1458                       result = tre_ast_new_literal(ctx->mem, ASSERTION,
1459                                                          ASSERT_AT_EOW, -1);
1460                       ctx->re++;
1461                       break;
1462                     case L'x':
1463                       ctx->re++;
1464                       if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end)
1465                         {
1466                           /* 8 bit hex char. */
1467                           char tmp[3] = {0, 0, 0};
1468                           long val;
1469                           DPRINT(("tre_parse:  8 bit hex: '%.*" STRF "'\n",
1470                                     REST(ctx->re - 2)));
1471 
1472                           if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end)
1473                               {
1474                                 tmp[0] = (char)ctx->re[0];
1475                                 ctx->re++;
1476                               }
1477                           if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end)
1478                               {
1479                                 tmp[1] = (char)ctx->re[0];
1480                                 ctx->re++;
1481                               }
1482                           val = strtol(tmp, NULL, 16);
1483                           result = tre_ast_new_literal(ctx->mem, (int)val,
1484                                                                (int)val, ctx->position);
1485                           ctx->position++;
1486                           break;
1487                         }
1488                       else if (ctx->re < ctx->re_end)
1489                         {
1490                           /* Wide char. */
1491                           char tmp[32];
1492                           long val;
1493                           int i = 0;
1494                           ctx->re++;
1495                           while (ctx->re_end - ctx->re >= 0)
1496                               {
1497                                 if (ctx->re[0] == CHAR_RBRACE)
1498                                   break;
1499                                 if (tre_isxdigit(ctx->re[0]))
1500                                   {
1501                                     tmp[i] = (char)ctx->re[0];
1502                                     i++;
1503                                     ctx->re++;
1504                                     continue;
1505                                   }
1506                                 return REG_EBRACE;
1507                               }
1508                           ctx->re++;
1509                           tmp[i] = 0;
1510                           val = strtol(tmp, NULL, 16);
1511                           result = tre_ast_new_literal(ctx->mem, (int)val, (int)val,
1512                                                                ctx->position);
1513                           ctx->position++;
1514                           break;
1515                         }
1516                       /*FALLTHROUGH*/
1517 
1518                     default:
1519                       if (tre_isdigit(*ctx->re))
1520                         {
1521                           /* Back reference. */
1522                           int val = *ctx->re - L'0';
1523                           DPRINT(("tre_parse:     backref: '%.*" STRF "'\n",
1524                                     REST(ctx->re - 1)));
1525                           result = tre_ast_new_literal(ctx->mem, BACKREF, val,
1526                                                                ctx->position);
1527                           if (result == NULL)
1528                               return REG_ESPACE;
1529                           ctx->position++;
1530                           ctx->max_backref = MAX(val, ctx->max_backref);
1531                           ctx->re++;
1532                         }
1533                       else
1534                         {
1535                           /* Escaped character. */
1536                           DPRINT(("tre_parse:     escaped: '%.*" STRF "'\n",
1537                                     REST(ctx->re - 1)));
1538                           result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1539                                                                ctx->position);
1540                           ctx->position++;
1541                           ctx->re++;
1542                         }
1543                       break;
1544                     }
1545                 if (result == NULL)
1546                     return REG_ESPACE;
1547                 break;
1548 
1549               case CHAR_PERIOD:          /* the any-symbol */
1550                 DPRINT(("tre_parse:       any: '%.*" STRF "'\n",
1551                           REST(ctx->re)));
1552                 if (ctx->cflags & REG_NEWLINE)
1553                     {
1554                       tre_ast_node_t *tmp1;
1555                       tre_ast_node_t *tmp2;
1556                       tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1,
1557                                                        ctx->position);
1558                       if (!tmp1)
1559                         return REG_ESPACE;
1560                       tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX,
1561                                                        ctx->position + 1);
1562                       if (!tmp2)
1563                         return REG_ESPACE;
1564                       result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1565                       if (!result)
1566                         return REG_ESPACE;
1567                       ctx->position += 2;
1568                     }
1569                 else
1570                     {
1571                       result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
1572                                                          ctx->position);
1573                       if (!result)
1574                         return REG_ESPACE;
1575                       ctx->position++;
1576                     }
1577                 ctx->re++;
1578                 break;
1579 
1580               case CHAR_CARET:           /* beginning of line assertion */
1581                 /* '^' has a special meaning everywhere in EREs, and in the
1582                      beginning of the RE and after \( is BREs. */
1583                 if (ctx->cflags & REG_EXTENDED
1584                       || (ctx->re - 2 >= ctx->re_start
1585                           && *(ctx->re - 2) == CHAR_BACKSLASH
1586                           && *(ctx->re - 1) == CHAR_LPAREN)
1587                       || ctx->re == ctx->re_start)
1588                     {
1589                       DPRINT(("tre_parse:               BOL: '%.*" STRF "'\n",
1590                                 REST(ctx->re)));
1591                       result = tre_ast_new_literal(ctx->mem, ASSERTION,
1592                                                          ASSERT_AT_BOL, -1);
1593                       if (result == NULL)
1594                         return REG_ESPACE;
1595                       ctx->re++;
1596                     }
1597                 else
1598                     goto parse_literal;
1599                 break;
1600 
1601               case CHAR_DOLLAR:          /* end of line assertion. */
1602                 /* '$' is special everywhere in EREs, and in the end of the
1603                      string and before \) is BREs. */
1604                 if (ctx->cflags & REG_EXTENDED
1605                       || (ctx->re + 2 < ctx->re_end
1606                           && *(ctx->re + 1) == CHAR_BACKSLASH
1607                           && *(ctx->re + 2) == CHAR_RPAREN)
1608                       || ctx->re + 1 == ctx->re_end)
1609                     {
1610                       DPRINT(("tre_parse:               EOL: '%.*" STRF "'\n",
1611                                 REST(ctx->re)));
1612                       result = tre_ast_new_literal(ctx->mem, ASSERTION,
1613                                                          ASSERT_AT_EOL, -1);
1614                       if (result == NULL)
1615                         return REG_ESPACE;
1616                       ctx->re++;
1617                     }
1618                 else
1619                     goto parse_literal;
1620                 break;
1621 
1622               default:
1623               parse_literal:
1624 
1625                 if (temporary_cflags && ctx->re + 1 < ctx->re_end
1626                       && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E')
1627                     {
1628                       DPRINT(("tre_parse:          end tmps: '%.*" STRF "'\n",
1629                                 REST(ctx->re)));
1630                       ctx->cflags &= ~temporary_cflags;
1631                       temporary_cflags = 0;
1632                       ctx->re += 2;
1633                       STACK_PUSHX(stack, long, PARSE_PIECE);
1634                       break;
1635                     }
1636 
1637 
1638                 /* We are expecting an atom.  If the subexpression (or the whole
1639                      regexp ends here, we interpret it as an empty expression
1640                      (which matches an empty string).  */
1641                 if (
1642 #ifdef REG_LITERAL
1643                       !(ctx->cflags & REG_LITERAL) &&
1644 #endif /* REG_LITERAL */
1645                       (ctx->re >= ctx->re_end
1646                        || *ctx->re == CHAR_STAR
1647                        || (ctx->cflags & REG_EXTENDED
1648                            && (*ctx->re == CHAR_PIPE
1649                                  || *ctx->re == CHAR_LBRACE
1650                                  || *ctx->re == CHAR_PLUS
1651                                  || *ctx->re == CHAR_QUESTIONMARK))
1652                        /* Test for "\)" in BRE mode. */
1653                        || (!(ctx->cflags & REG_EXTENDED)
1654                            && ctx->re + 1 < ctx->re_end
1655                            && *ctx->re == CHAR_BACKSLASH
1656                            && *(ctx->re + 1) == CHAR_LBRACE)))
1657                     {
1658                       DPRINT(("tre_parse:             empty: '%.*" STRF "'\n",
1659                                 REST(ctx->re)));
1660                       result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1661                       if (!result)
1662                         return REG_ESPACE;
1663                       break;
1664                     }
1665 
1666                 DPRINT(("tre_parse:     literal: '%.*" STRF "'\n",
1667                           REST(ctx->re)));
1668                 /* Note that we can't use an tre_isalpha() test here, since there
1669                      may be characters which are alphabetic but neither upper or
1670                      lower case. */
1671                 if (ctx->cflags & REG_ICASE
1672                       && (tre_isupper(*ctx->re) || tre_islower(*ctx->re)))
1673                     {
1674                       tre_ast_node_t *tmp1;
1675                       tre_ast_node_t *tmp2;
1676 
1677                       /* XXX - Can there be more than one opposite-case
1678                          counterpoints for some character in some locale?  Or
1679                          more than two characters which all should be regarded
1680                          the same character if case is ignored?  If yes, there
1681                          does not seem to be a portable way to detect it.  I guess
1682                          that at least for multi-character collating elements there
1683                          could be several opposite-case counterpoints, but they
1684                          cannot be supported portably anyway. */
1685                       tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(*ctx->re),
1686                                                        tre_toupper(*ctx->re),
1687                                                        ctx->position);
1688                       if (!tmp1)
1689                         return REG_ESPACE;
1690                       tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(*ctx->re),
1691                                                        tre_tolower(*ctx->re),
1692                                                        ctx->position);
1693                       if (!tmp2)
1694                         return REG_ESPACE;
1695                       result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1696                       if (!result)
1697                         return REG_ESPACE;
1698                     }
1699                 else
1700                     {
1701                       result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1702                                                          ctx->position);
1703                       if (!result)
1704                         return REG_ESPACE;
1705                     }
1706                 ctx->position++;
1707                 ctx->re++;
1708                 break;
1709               }
1710             break;
1711 
1712           case PARSE_MARK_FOR_SUBMATCH:
1713             {
1714               int submatch_id = tre_stack_pop_int(stack);
1715 
1716               if (result->submatch_id >= 0)
1717                 {
1718                     tre_ast_node_t *n, *tmp_node;
1719                     n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1720                     if (n == NULL)
1721                       return REG_ESPACE;
1722                     tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
1723                     if (tmp_node == NULL)
1724                       return REG_ESPACE;
1725                     tmp_node->num_submatches = result->num_submatches;
1726                     result = tmp_node;
1727                 }
1728               result->submatch_id = submatch_id;
1729               result->num_submatches++;
1730               break;
1731             }
1732 
1733           case PARSE_RESTORE_CFLAGS:
1734             ctx->cflags = tre_stack_pop_int(stack);
1735             break;
1736 
1737           default:
1738             assert(0);
1739             break;
1740           }
1741     }
1742 
1743   /* Check for missing closing parentheses. */
1744   if (depth > 0)
1745     return REG_EPAREN;
1746 
1747   if (status == REG_OK)
1748     ctx->result = result;
1749 
1750   return status;
1751 }
1752 
1753 /* EOF */
1754