1 /*        $NetBSD: regcomp.c,v 1.49 2025/01/01 18:19:50 christos Exp $          */
2 
3 /*-
4  * SPDX-License-Identifier: BSD-3-Clause
5  *
6  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
7  * Copyright (c) 1992, 1993, 1994
8  *        The Regents of the University of California.  All rights reserved.
9  *
10  * Copyright (c) 2011 The FreeBSD Foundation
11  * All rights reserved.
12  * Portions of this software were developed by David Chisnall
13  * under sponsorship from the FreeBSD Foundation.
14  *
15  * This code is derived from software contributed to Berkeley by
16  * Henry Spencer.
17  *
18  * Redistribution and use in source and binary forms, with or without
19  * modification, are permitted provided that the following conditions
20  * are met:
21  * 1. Redistributions of source code must retain the above copyright
22  *    notice, this list of conditions and the following disclaimer.
23  * 2. Redistributions in binary form must reproduce the above copyright
24  *    notice, this list of conditions and the following disclaimer in the
25  *    documentation and/or other materials provided with the distribution.
26  * 3. Neither the name of the University nor the names of its contributors
27  *    may be used to endorse or promote products derived from this software
28  *    without specific prior written permission.
29  *
30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40  * SUCH DAMAGE.
41  *
42  *        @(#)regcomp.c       8.5 (Berkeley) 3/20/94
43  */
44 
45 #if HAVE_NBTOOL_CONFIG_H
46 #include "nbtool_config.h"
47 #endif
48 
49 #include <sys/cdefs.h>
50 #if 0
51 static char sccsid[] = "@(#)regcomp.c   8.5 (Berkeley) 3/20/94";
52 __FBSDID("$FreeBSD: head/lib/libc/regex/regcomp.c 368359 2020-12-05 03:18:48Z kevans $");
53 #endif
54 __RCSID("$NetBSD: regcomp.c,v 1.49 2025/01/01 18:19:50 christos Exp $");
55 
56 #ifndef LIBHACK
57 #define REGEX_GNU_EXTENSIONS
58 
59 #include "namespace.h"
60 #endif
61 #include <sys/types.h>
62 #include <stdio.h>
63 #include <string.h>
64 #include <ctype.h>
65 #include <limits.h>
66 #include <stdlib.h>
67 #include <regex.h>
68 #include <stdbool.h>
69 
70 #if defined(__weak_alias) && !defined(LIBHACK)
71 __weak_alias(regcomp,_regcomp)
72 #endif
73 
74 #ifdef REGEX_LIBC_COLLATE
75 #include "collate.h"
76 #endif
77 
78 #include "utils.h"
79 #include "regex2.h"
80 
81 #include "cname.h"
82 
83 /*
84  * Branching context, used to keep track of branch state for all of the branch-
85  * aware functions. In addition to keeping track of branch positions for the
86  * p_branch_* functions, we use this to simplify some clumsiness in BREs for
87  * detection of whether ^ is acting as an anchor or being used erroneously and
88  * also for whether we're in a sub-expression or not.
89  */
90 struct branchc {
91           sopno start;
92           sopno back;
93           sopno fwd;
94 
95           int nbranch;
96           int nchain;
97           bool outer;
98           bool terminate;
99 };
100 
101 /*
102  * parse structure, passed up and down to avoid global variables and
103  * other clumsinesses
104  */
105 struct parse {
106           const char *next;   /* next character in RE */
107           const char *end;    /* end of string (-> NUL normally) */
108           int error;                    /* has an error been seen? */
109           int gnuext;
110           sop *strip;                   /* malloced strip */
111           sopno ssize;                  /* malloced strip size (allocated) */
112           sopno slen;                   /* malloced strip length (used) */
113           size_t ncsalloc;    /* number of csets allocated */
114           struct re_guts *g;
115 #         define    NPAREN    10        /* we need to remember () 1-9 for back refs */
116           sopno pbegin[NPAREN];         /* -> ( ([0] unused) */
117           sopno pend[NPAREN]; /* -> ) ([0] unused) */
118           bool allowbranch;   /* can this expression branch? */
119           bool bre;           /* convenience; is this a BRE? */
120           int pflags;                   /* other parsing flags -- legacy escapes? */
121           bool (*parse_expr)(struct parse *, struct branchc *);
122           void (*pre_parse)(struct parse *, struct branchc *);
123           void (*post_parse)(struct parse *, struct branchc *);
124 };
125 
126 #define PFLAG_LEGACY_ESC      0x00000001
127 
128 /* ========= begin header generated by ./mkh ========= */
129 #ifdef __cplusplus
130 extern "C" {
131 #endif
132 
133 /* === regcomp.c === */
134 static bool p_ere_exp(struct parse *p, struct branchc *bc);
135 static void p_str(struct parse *p);
136 static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
137 static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
138 static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
139 static bool p_branch_empty(struct parse *p, struct branchc *bc);
140 static bool p_branch_do(struct parse *p, struct branchc *bc);
141 static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
142 static void p_bre_post_parse(struct parse *p, struct branchc *bc);
143 static void p_re(struct parse *p, int end1, int end2);
144 static bool p_simp_re(struct parse *p, struct branchc *bc);
145 static int p_count(struct parse *p);
146 static void p_bracket(struct parse *p);
147 static int p_range_cmp(wchar_t c1, wchar_t c2);
148 static void p_b_term(struct parse *p, cset *cs);
149 #ifdef REGEX_GNU_EXTENSIONS
150 static int p_b_pseudoclass(struct parse *p, char c);
151 #endif
152 static void p_b_cclass(struct parse *p, cset *cs);
153 static void p_b_cclass_named(struct parse *p, cset *cs, const char[]);
154 static void p_b_eclass(struct parse *p, cset *cs);
155 static wint_t p_b_symbol(struct parse *p);
156 static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
157 static bool may_escape(struct parse *p, const wint_t ch);
158 static wint_t othercase(wint_t ch);
159 static void bothcases(struct parse *p, wint_t ch);
160 static void ordinary(struct parse *p, wint_t ch);
161 static void nonnewline(struct parse *p);
162 static void repeat(struct parse *p, sopno start, int from, int to);
163 static int seterr(struct parse *p, int e);
164 static cset *allocset(struct parse *p);
165 static void freeset(struct parse *p, cset *cs);
166 static void CHadd(struct parse *p, cset *cs, wint_t ch);
167 static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
168 static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
169 static wint_t singleton(cset *cs);
170 static sopno dupl(struct parse *p, sopno start, sopno finish);
171 static void doemit(struct parse *p, sop op, size_t opnd);
172 static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
173 static void dofwd(struct parse *p, sopno pos, sop value);
174 static int enlarge(struct parse *p, sopno size);
175 static void stripsnug(struct parse *p, struct re_guts *g);
176 static void findmust(struct parse *p, struct re_guts *g);
177 static int altoffset(sop *scan, int offset);
178 static void computejumps(struct parse *p, struct re_guts *g);
179 static void computematchjumps(struct parse *p, struct re_guts *g);
180 static sopno pluscount(struct parse *p, struct re_guts *g);
181 static wint_t wgetnext(struct parse *p);
182 
183 #ifdef __cplusplus
184 }
185 #endif
186 /* ========= end header generated by ./mkh ========= */
187 
188 static char nuls[10];                   /* place to point scanner in event of error */
189 
190 /*
191  * macros for use with parse structure
192  * BEWARE:  these know that the parse structure is named `p' !!!
193  */
194 #define   PEEK()    (*p->next)
195 #define   PEEK2()   (*(p->next+1))
196 #define   MORE()    (p->next < p->end)
197 #define   MORE2()   (p->next+1 < p->end)
198 #define   SEE(c)    (MORE() && PEEK() == (c))
199 #define   SEETWO(a, b)        (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
200 #define   SEESPEC(a)          (p->bre ? SEETWO('\\', a) : SEE(a))
201 #define   EAT(c)    ((SEE(c)) ? (NEXT(), 1) : 0)
202 #define   EATTWO(a, b)        ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
203 #define   EATSPEC(a)          (p->bre ? EATTWO('\\', a) : EAT(a))
204 #define   NEXT()    (p->next++)
205 #define   NEXT2()   (p->next += 2)
206 #define   NEXTn(n)  (p->next += (n))
207 #define   GETNEXT() (*p->next++)
208 #define   WGETNEXT()          wgetnext(p)
209 #define   SETERROR(e)         seterr(p, (e))
210 #define   REQUIRE(co, e)      ((co) || SETERROR(e))
211 #define   MUSTSEE(c, e)       (REQUIRE(MORE() && PEEK() == (c), e))
212 #define   MUSTEAT(c, e)       (REQUIRE(MORE() && GETNEXT() == (c), e))
213 #define   MUSTNOTSEE(c, e)    (REQUIRE(!MORE() || PEEK() != (c), e))
214 #define   EMIT(op, sopnd)     doemit(p, (op), (sopnd))
215 #define   INSERT(op, pos)     doinsert(p, (op), HERE()-(pos)+1, pos)
216 #define   AHEAD(pos)                    dofwd(p, pos, HERE()-(pos))
217 #define   ASTERN(sop, pos)    EMIT(sop, HERE()-pos)
218 #define   HERE()              (p->slen)
219 #define   THERE()             (p->slen - 1)
220 #define   THERETHERE()        (p->slen - 2)
221 #define   DROP(n)   (p->slen -= (n))
222 
223 /* Macro used by computejump()/computematchjump() */
224 #ifndef MIN
225 #define MIN(a,b)    ((a)<(b)?(a):(b))
226 #endif
227 
228 #ifndef NLS
229 static const struct {
230           const char *name;
231           int (*func)(int);
232 } wctypes[] = {
233 #define ADD(x) { .name = # x, .func = is ## x }
234           ADD(alnum),
235           ADD(alpha),
236           ADD(blank),
237           ADD(cntrl),
238           ADD(digit),
239           ADD(graph),
240           ADD(lower),
241           ADD(print),
242           ADD(punct),
243           ADD(space),
244           ADD(upper),
245           ADD(xdigit),
246 #undef ADD
247 };
248 
249 wctype_t
__regex_wctype(const char * str)250 __regex_wctype(const char *str)
251 {
252           for (size_t i = 0; i < __arraycount(wctypes); i++) {
253                     if (strcmp(wctypes[i].name, str) == 0)
254                               return (wctype_t)(i + 1);
255           }
256           return (wctype_t)0;
257 }
258 
259 int
__regex_iswctype(wint_t c,wctype_t ct)260 __regex_iswctype(wint_t c, wctype_t ct)
261 {
262           if (ct == 0)
263                     return 0;
264           return (*wctypes[ct - 1].func)(c);
265 }
266 #endif
267 
268 static int                                        /* 0 success, otherwise REG_something */
regcomp_internal(regex_t * __restrict preg,const char * __restrict pattern,int cflags,int pflags)269 regcomp_internal(regex_t * __restrict preg,
270           const char * __restrict pattern,
271           int cflags, int pflags)
272 {
273           struct parse pa;
274           struct re_guts *g;
275           struct parse *p = &pa;
276           int i;
277           size_t len;
278           size_t maxlen;
279 #ifdef REDEBUG
280 #         define    GOODFLAGS(f)        (f)
281 #else
282 #         define    GOODFLAGS(f)        ((f)&~REG_DUMP)
283 #endif
284 
285           _DIAGASSERT(preg != NULL);
286           _DIAGASSERT(pattern != NULL);
287 
288           cflags = GOODFLAGS(cflags);
289           if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
290                     return(REG_INVARG);
291 
292           if (cflags&REG_PEND) {
293                     if (preg->re_endp < pattern)
294                               return(REG_INVARG);
295                     len = preg->re_endp - pattern;
296           } else
297                     len = strlen(pattern);
298 
299           /* do the mallocs early so failure handling is easy */
300           g = malloc(sizeof(*g));
301           if (g == NULL)
302                     return(REG_ESPACE);
303           /*
304            * Limit the pattern space to avoid a 32-bit overflow on buffer
305            * extension.  Also avoid any signed overflow in case of conversion
306            * so make the real limit based on a 31-bit overflow.
307            *
308            * Likely not applicable on 64-bit systems but handle the case
309            * generically (who are we to stop people from using ~715MB+
310            * patterns?).
311            */
312           maxlen = ((size_t)-1 >> 1) / sizeof(*p->strip) * 2 / 3;
313           if (len >= maxlen) {
314                     free(g);
315                     return(REG_ESPACE);
316           }
317           p->ssize = (sopno)(len / 2 * 3 + 1);    /* ugh */
318           assert(p->ssize >= len);
319 
320           p->strip = calloc(p->ssize, sizeof(*p->strip));
321           p->slen = 0;
322           if (p->strip == NULL) {
323                     free(g);
324                     return(REG_ESPACE);
325           }
326 
327           /* set things up */
328           p->g = g;
329           p->next = pattern;  /* convenience; we do not modify it */
330           p->end = p->next + len;
331           p->error = 0;
332           p->ncsalloc = 0;
333           p->pflags = pflags;
334           for (i = 0; i < NPAREN; i++) {
335                     p->pbegin[i] = 0;
336                     p->pend[i] = 0;
337           }
338 #ifdef REGEX_GNU_EXTENSIONS
339           if ((cflags & REG_GNU) == 0) {
340                     p->gnuext = false;
341                     p->allowbranch = (cflags & REG_EXTENDED) != 0;
342           } else
343                     p->gnuext = p->allowbranch = true;
344 #else
345           p->gnuext = false;
346           p->allowbranch = (cflags & REG_EXTENDED) != 0;
347 #endif
348           if (cflags & REG_EXTENDED) {
349                     p->bre = false;
350                     p->parse_expr = p_ere_exp;
351                     p->pre_parse = NULL;
352                     p->post_parse = NULL;
353           } else {
354                     p->bre = true;
355                     p->parse_expr = p_simp_re;
356                     p->pre_parse = p_bre_pre_parse;
357                     p->post_parse = p_bre_post_parse;
358           }
359           g->sets = NULL;
360           g->ncsets = 0;
361           g->cflags = cflags;
362           g->iflags = 0;
363           g->nbol = 0;
364           g->neol = 0;
365           g->must = NULL;
366           g->moffset = -1;
367           g->charjump = NULL;
368           g->matchjump = NULL;
369           g->mlen = 0;
370           g->nsub = 0;
371           g->backrefs = 0;
372 
373           /* do it */
374           EMIT(OEND, 0);
375           g->firststate = THERE();
376           if (cflags & REG_NOSPEC)
377                     p_str(p);
378           else
379                     p_re(p, OUT, OUT);
380           EMIT(OEND, 0);
381           g->laststate = THERE();
382 
383           /* tidy up loose ends and fill things in */
384           stripsnug(p, g);
385           findmust(p, g);
386           /* only use Boyer-Moore algorithm if the pattern is bigger
387            * than three characters
388            */
389           if(g->mlen > 3) {
390                     computejumps(p, g);
391                     computematchjumps(p, g);
392                     if(g->matchjump == NULL && g->charjump != NULL) {
393                               free(g->charjump);
394                               g->charjump = NULL;
395                     }
396           }
397           g->nplus = pluscount(p, g);
398           g->magic = MAGIC2;
399           preg->re_nsub = g->nsub;
400           preg->re_g = g;
401           preg->re_magic = MAGIC1;
402 #ifndef REDEBUG
403           /* not debugging, so can't rely on the assert() in regexec() */
404           if (g->iflags&BAD)
405                     SETERROR(REG_ASSERT);
406 #endif
407 
408           /* win or lose, we're done */
409           if (p->error != 0)  /* lose */
410                     regfree(preg);
411           return(p->error);
412 }
413 
414 /*
415  - regcomp - interface for parser and compilation
416  = extern int regcomp(regex_t *, const char *, int);
417  = #define          REG_BASIC 0000
418  = #define          REG_EXTENDED        0001
419  = #define          REG_ICASE 0002
420  = #define          REG_NOSUB 0004
421  = #define          REG_NEWLINE         0010
422  = #define          REG_NOSPEC          0020
423  = #define          REG_PEND  0040
424  = #define          REG_DUMP  0200
425  */
426 int                                     /* 0 success, otherwise REG_something */
regcomp(regex_t * __restrict preg,const char * __restrict pattern,int cflags)427 regcomp(regex_t * __restrict preg,
428           const char * __restrict pattern,
429           int cflags)
430 {
431 
432           return (regcomp_internal(preg, pattern, cflags, 0));
433 }
434 
435 /*
436  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op,
437  - return whether we should terminate or not
438  == static bool p_ere_exp(struct parse *p);
439  */
440 static bool
p_ere_exp(struct parse * p,struct branchc * bc)441 p_ere_exp(struct parse *p, struct branchc *bc)
442 {
443           char c;
444           wint_t wc;
445           sopno pos;
446           int count;
447           int count2;
448 #ifdef REGEX_GNU_EXTENSIONS
449           size_t i;
450           int handled;
451 #endif
452           sopno subno;
453           int wascaret = 0;
454 
455           _DIAGASSERT(p != NULL);
456 
457           (void)bc;
458           assert(MORE());               /* caller should have ensured this */
459           c = GETNEXT();
460 
461 #ifdef REGEX_GNU_EXTENSIONS
462           handled = 0;
463 #endif
464           pos = HERE();
465           switch (c) {
466           case '(':
467                     (void)REQUIRE(MORE(), REG_EPAREN);
468                     p->g->nsub++;
469                     subno = (sopno)p->g->nsub;
470                     if (subno < NPAREN)
471                               p->pbegin[subno] = HERE();
472                     EMIT(OLPAREN, subno);
473                     if (!SEE(')'))
474                               p_re(p, ')', IGN);
475                     if (subno < NPAREN) {
476                               p->pend[subno] = HERE();
477                               assert(p->pend[subno] != 0);
478                     }
479                     EMIT(ORPAREN, subno);
480                     (void)MUSTEAT(')', REG_EPAREN);
481                     break;
482 #ifndef POSIX_MISTAKE
483           case ')':           /* happens only if no current unmatched ( */
484                     /*
485                      * You may ask, why the ifndef?  Because I didn't notice
486                      * this until slightly too late for 1003.2, and none of the
487                      * other 1003.2 regular-expression reviewers noticed it at
488                      * all.  So an unmatched ) is legal POSIX, at least until
489                      * we can get it fixed.
490                      */
491                     SETERROR(REG_EPAREN);
492                     break;
493 #endif
494           case '^':
495                     EMIT(OBOL, 0);
496                     p->g->iflags |= USEBOL;
497                     p->g->nbol++;
498                     wascaret = 1;
499                     break;
500           case '$':
501                     EMIT(OEOL, 0);
502                     p->g->iflags |= USEEOL;
503                     p->g->neol++;
504                     break;
505           case '|':
506                     SETERROR(REG_EMPTY);
507                     break;
508           case '*':
509           case '+':
510           case '?':
511           case '{':
512                     SETERROR(REG_BADRPT);
513                     break;
514           case '.':
515                     if (p->g->cflags&REG_NEWLINE)
516                               nonnewline(p);
517                     else
518                               EMIT(OANY, 0);
519                     break;
520           case '[':
521                     p_bracket(p);
522                     break;
523           case '\\':
524                     (void)REQUIRE(MORE(), REG_EESCAPE);
525                     wc = WGETNEXT();
526 #ifdef REGEX_GNU_EXTENSIONS
527                     if (p->gnuext) {
528                               handled = 1;
529                               switch (wc) {
530                               case '`':
531                                         EMIT(OBOS, 0);
532                                         break;
533                               case '\'':
534                                         EMIT(OEOS, 0);
535                                         break;
536                               case 'B':
537                                         EMIT(ONWBND, 0);
538                                         break;
539                               case 'b':
540                                         EMIT(OWBND, 0);
541                                         break;
542                               case 'W':
543                               case 'w':
544                               case 'S':
545                               case 's':
546                                         p_b_pseudoclass(p, wc);
547                                         break;
548                               case 'a':
549                                         ordinary(p, '\a');
550                                         break;
551                               case 'e':
552                                         ordinary(p, '\e');
553                                         break;
554                               case 'f':
555                                         ordinary(p, '\f');
556                                         break;
557                               case 'n':
558                                         ordinary(p, '\n');
559                                         break;
560                               case 'r':
561                                         ordinary(p, '\r');
562                                         break;
563                               case 't':
564                                         ordinary(p, '\t');
565                                         break;
566                               case 'v':
567                                         ordinary(p, '\v');
568                                         break;
569                               case '1':
570                               case '2':
571                               case '3':
572                               case '4':
573                               case '5':
574                               case '6':
575                               case '7':
576                               case '8':
577                               case '9':
578                                         i = wc - '0';
579                                         assert(i < NPAREN);
580                                         if (p->pend[i] != 0) {
581                                                   assert(i <= p->g->nsub);
582                                                   EMIT(OBACK_, i);
583                                                   assert(p->pbegin[i] != 0);
584                                                   assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
585                                                   assert(OP(p->strip[p->pend[i]]) == ORPAREN);
586                                                   (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
587                                                   EMIT(O_BACK, i);
588                                         } else
589                                                   SETERROR(REG_ESUBREG);
590                                         p->g->backrefs = 1;
591                                         break;
592                               default:
593                                         handled = 0;
594                               }
595                               /* Don't proceed to the POSIX bits if we've already handled it */
596                               if (handled)
597                                         break;
598                     }
599 #endif
600                     switch (wc) {
601                     case '<':
602                               EMIT(OBOW, 0);
603                               break;
604                     case '>':
605                               EMIT(OEOW, 0);
606                               break;
607                     default:
608                               if (may_escape(p, wc))
609                                         ordinary(p, wc);
610                               else
611                                         SETERROR(REG_EESCAPE);
612                               break;
613                     }
614                     break;
615           default:
616                     if (p->error != 0)
617                               return (false);
618                     p->next--;
619                     wc = WGETNEXT();
620                     ordinary(p, wc);
621                     break;
622           }
623 
624           if (!MORE())
625                     return (false);
626           c = PEEK();
627           /* we call { a repetition if followed by a digit */
628           if (!( c == '*' || c == '+' || c == '?' || c == '{'))
629                     return (false);               /* no repetition, we're done */
630           else if (c == '{')
631                     (void)REQUIRE(MORE2() && \
632                         (isdigit((uch)PEEK2()) || PEEK2() == ','), REG_BADRPT);
633           NEXT();
634 
635           (void)REQUIRE(!wascaret, REG_BADRPT);
636           switch (c) {
637           case '*': /* implemented as +? */
638                     /* this case does not require the (y|) trick, noKLUDGE */
639                     INSERT(OPLUS_, pos);
640                     ASTERN(O_PLUS, pos);
641                     INSERT(OQUEST_, pos);
642                     ASTERN(O_QUEST, pos);
643                     break;
644           case '+':
645                     INSERT(OPLUS_, pos);
646                     ASTERN(O_PLUS, pos);
647                     break;
648           case '?':
649                     /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
650                     INSERT(OCH_, pos);            /* offset slightly wrong */
651                     ASTERN(OOR1, pos);            /* this one's right */
652                     AHEAD(pos);                             /* fix the OCH_ */
653                     EMIT(OOR2, 0);                          /* offset very wrong... */
654                     AHEAD(THERE());                         /* ...so fix it */
655                     ASTERN(O_CH, THERETHERE());
656                     break;
657           case '{':
658                     count = p_count(p);
659                     if (EAT(',')) {
660                               if (isdigit((uch)PEEK())) {
661                                         count2 = p_count(p);
662                                         (void)REQUIRE(count <= count2, REG_BADBR);
663                               } else              /* single number with comma */
664                                         count2 = INFINITY;
665                     } else              /* just a single number */
666                               count2 = count;
667                     repeat(p, pos, count, count2);
668                     if (!EAT('}')) {    /* error heuristics */
669                               while (MORE() && PEEK() != '}')
670                                         NEXT();
671                               (void)REQUIRE(MORE(), REG_EBRACE);
672                               SETERROR(REG_BADBR);
673                     }
674                     break;
675           }
676 
677           if (!MORE())
678                     return (false);
679           c = PEEK();
680           if (!( c == '*' || c == '+' || c == '?' ||
681                                         (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
682                     return (false);
683           SETERROR(REG_BADRPT);
684           return (false);
685 }
686 
687 /*
688  - p_str - string (no metacharacters) "parser"
689  == static void p_str(struct parse *p);
690  */
691 static void
p_str(struct parse * p)692 p_str(struct parse *p)
693 {
694           (void)REQUIRE(MORE(), REG_EMPTY);
695           while (MORE())
696                     ordinary(p, WGETNEXT());
697 }
698 
699 /*
700  * Eat consecutive branch delimiters for the kind of expression that we are
701  * parsing, return the number of delimiters that we ate.
702  */
703 static int
p_branch_eat_delim(struct parse * p,struct branchc * bc)704 p_branch_eat_delim(struct parse *p, struct branchc *bc)
705 {
706           int nskip;
707 
708           (void)bc;
709           nskip = 0;
710           while (EATSPEC('|'))
711                     ++nskip;
712           return (nskip);
713 }
714 
715 /*
716  * Insert necessary branch book-keeping operations. This emits a
717  * bogus 'next' offset, since we still have more to parse
718  */
719 static void
p_branch_ins_offset(struct parse * p,struct branchc * bc)720 p_branch_ins_offset(struct parse *p, struct branchc *bc)
721 {
722 
723           if (bc->nbranch == 0) {
724                     INSERT(OCH_, bc->start);      /* offset is wrong */
725                     bc->fwd = bc->start;
726                     bc->back = bc->start;
727           }
728 
729           ASTERN(OOR1, bc->back);
730           bc->back = THERE();
731           AHEAD(bc->fwd);                         /* fix previous offset */
732           bc->fwd = HERE();
733           EMIT(OOR2, 0);                          /* offset is very wrong */
734           ++bc->nbranch;
735 }
736 
737 /*
738  * Fix the offset of the tail branch, if we actually had any branches.
739  * This is to correct the bogus placeholder offset that we use.
740  */
741 static void
p_branch_fix_tail(struct parse * p,struct branchc * bc)742 p_branch_fix_tail(struct parse *p, struct branchc *bc)
743 {
744 
745           /* Fix bogus offset at the tail if we actually have branches */
746           if (bc->nbranch > 0) {
747                     AHEAD(bc->fwd);
748                     ASTERN(O_CH, bc->back);
749           }
750 }
751 
752 /*
753  * Signal to the parser that an empty branch has been encountered; this will,
754  * in the future, be used to allow for more permissive behavior with empty
755  * branches. The return value should indicate whether parsing may continue
756  * or not.
757  */
758 static bool
p_branch_empty(struct parse * p,struct branchc * bc)759 p_branch_empty(struct parse *p, struct branchc *bc)
760 {
761 
762           (void)bc;
763           SETERROR(REG_EMPTY);
764           return (false);
765 }
766 
767 /*
768  * Take care of any branching requirements. This includes inserting the
769  * appropriate branching instructions as well as eating all of the branch
770  * delimiters until we either run out of pattern or need to parse more pattern.
771  */
772 static bool
p_branch_do(struct parse * p,struct branchc * bc)773 p_branch_do(struct parse *p, struct branchc *bc)
774 {
775           int ate = 0;
776 
777           ate = p_branch_eat_delim(p, bc);
778           if (ate == 0)
779                     return (false);
780           else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
781                     /*
782                      * Halt parsing only if we have an empty branch and p_branch_empty
783                      * indicates that we must not continue. In the future, this will not
784                      * necessarily be an error.
785                      */
786                     return (false);
787           p_branch_ins_offset(p, bc);
788 
789           return (true);
790 }
791 
792 static void
p_bre_pre_parse(struct parse * p,struct branchc * bc)793 p_bre_pre_parse(struct parse *p, struct branchc *bc)
794 {
795 
796           (void)bc;
797           /*
798            * Does not move cleanly into expression parser because of
799            * ordinary interpration of * at the beginning position of
800            * an expression.
801            */
802           if (EAT('^')) {
803                     EMIT(OBOL, 0);
804                     p->g->iflags |= USEBOL;
805                     p->g->nbol++;
806           }
807 }
808 
809 static void
p_bre_post_parse(struct parse * p,struct branchc * bc)810 p_bre_post_parse(struct parse *p, struct branchc *bc)
811 {
812 
813           /* Expression is terminating due to EOL token */
814           if (bc->terminate) {
815                     DROP(1);
816                     EMIT(OEOL, 0);
817                     p->g->iflags |= USEEOL;
818                     p->g->neol++;
819           }
820 }
821 
822 /*
823  - p_re - Top level parser, concatenation and BRE anchoring
824  == static void p_re(struct parse *p, int end1, int end2);
825  * Giving end1 as OUT essentially eliminates the end1/end2 check.
826  *
827  * This implementation is a bit of a kludge, in that a trailing $ is first
828  * taken as an ordinary character and then revised to be an anchor.
829  * The amount of lookahead needed to avoid this kludge is excessive.
830  */
831 static void
p_re(struct parse * p,int end1,int end2)832 p_re(struct parse *p,
833           int end1, /* first terminating character */
834           int end2) /* second terminating character; ignored for EREs */
835 {
836           struct branchc bc;
837 
838           bc.nbranch = 0;
839           if (end1 == OUT && end2 == OUT)
840                     bc.outer = true;
841           else
842                     bc.outer = false;
843 #define   SEEEND()  (!p->bre ? SEE(end1) : SEETWO(end1, end2))
844           for (;;) {
845                     bc.start = HERE();
846                     bc.nchain = 0;
847                     bc.terminate = false;
848                     if (p->pre_parse != NULL)
849                               p->pre_parse(p, &bc);
850                     while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) {
851                               bc.terminate = p->parse_expr(p, &bc);
852                               ++bc.nchain;
853                     }
854                     if (p->post_parse != NULL)
855                               p->post_parse(p, &bc);
856                     (void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY);
857 #ifdef REGEX_GNU_EXTENSIONS
858                     if (p->gnuext && HERE() == bc.start && !p_branch_empty(p, &bc))
859                               break;
860 #endif
861                     if (!p->allowbranch)
862                               break;
863                     /*
864                      * p_branch_do's return value indicates whether we should
865                      * continue parsing or not. This is both for correctness and
866                      * a slight optimization, because it will check if we've
867                      * encountered an empty branch or the end of the string
868                      * immediately following a branch delimiter.
869                      */
870                     if (!p_branch_do(p, &bc))
871                               break;
872           }
873 #undef SEE_END
874           if (p->allowbranch)
875                     p_branch_fix_tail(p, &bc);
876           assert(!MORE() || SEE(end1));
877 }
878 
879 /*
880  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
881  == static bool p_simp_re(struct parse *p, struct branchc *bc);
882  */
883 static bool                             /* was the simple RE an unbackslashed $? */
p_simp_re(struct parse * p,struct branchc * bc)884 p_simp_re(struct parse *p, struct branchc *bc)
885 {
886           int c;
887           int cc;                       /* convenient/control character */
888           int count;
889           int count2;
890           sopno pos;
891           bool handled;
892           size_t i;
893           wint_t wc;
894           sopno subno;
895 #         define    BACKSL    (1<<CHAR_BIT)
896 
897           pos = HERE();                 /* repetition op, if any, covers from here */
898           handled = false;
899 
900           assert(MORE());               /* caller should have ensured this */
901           c = (uch)GETNEXT();
902           if (c == '\\') {
903                     (void)REQUIRE(MORE(), REG_EESCAPE);
904                     cc = (uch)GETNEXT();
905                     c = BACKSL | cc;
906 #ifdef REGEX_GNU_EXTENSIONS
907                     if (p->gnuext) {
908                               handled = true;
909                               switch (c) {
910                               case BACKSL|'`':
911                                         EMIT(OBOS, 0);
912                                         break;
913                               case BACKSL|'\'':
914                                         EMIT(OEOS, 0);
915                                         break;
916                               case BACKSL|'B':
917                                         EMIT(ONWBND, 0);
918                                         break;
919                               case BACKSL|'b':
920                                         EMIT(OWBND, 0);
921                                         break;
922                               case BACKSL|'W':
923                               case BACKSL|'w':
924                               case BACKSL|'S':
925                               case BACKSL|'s':
926                                         p_b_pseudoclass(p, cc);
927                                         break;
928                               case BACKSL|'a':
929                                         ordinary(p, '\a');
930                                         break;
931                               case BACKSL|'e':
932                                         ordinary(p, '\e');
933                                         break;
934                               case BACKSL|'f':
935                                         ordinary(p, '\f');
936                                         break;
937                               case BACKSL|'n':
938                                         ordinary(p, '\n');
939                                         break;
940                               case BACKSL|'r':
941                                         ordinary(p, '\r');
942                                         break;
943                               case BACKSL|'t':
944                                         ordinary(p, '\t');
945                                         break;
946                               case BACKSL|'v':
947                                         ordinary(p, '\v');
948                                         break;
949                               default:
950                                         handled = false;
951                               }
952                     }
953 #endif
954           }
955           if (!handled) {
956                     switch (c) {
957                     case '.':
958                               if (p->g->cflags&REG_NEWLINE)
959                                         nonnewline(p);
960                               else
961                                         EMIT(OANY, 0);
962                               break;
963                     case '[':
964                               p_bracket(p);
965                               break;
966                     case BACKSL|'<':
967                               EMIT(OBOW, 0);
968                               break;
969                     case BACKSL|'>':
970                               EMIT(OEOW, 0);
971                               break;
972                     case BACKSL|'{':
973                               SETERROR(REG_BADRPT);
974                               break;
975                     case BACKSL|'(':
976                               p->g->nsub++;
977                               subno = (sopno)p->g->nsub;
978                               if (subno < NPAREN)
979                                         p->pbegin[subno] = HERE();
980                               EMIT(OLPAREN, subno);
981                               /* the MORE here is an error heuristic */
982                               if (MORE() && !SEETWO('\\', ')'))
983                                         p_re(p, '\\', ')');
984                               if (subno < NPAREN) {
985                                         p->pend[subno] = HERE();
986                                         assert(p->pend[subno] != 0);
987                               }
988                               EMIT(ORPAREN, subno);
989                               (void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
990                               break;
991                     case BACKSL|')':    /* should not get here -- must be user */
992                               SETERROR(REG_EPAREN);
993                               break;
994                     case BACKSL|'1':
995                     case BACKSL|'2':
996                     case BACKSL|'3':
997                     case BACKSL|'4':
998                     case BACKSL|'5':
999                     case BACKSL|'6':
1000                     case BACKSL|'7':
1001                     case BACKSL|'8':
1002                     case BACKSL|'9':
1003                               i = (c&~BACKSL) - '0';
1004                               assert(i < NPAREN);
1005                               if (p->pend[i] != 0) {
1006                                         assert(i <= p->g->nsub);
1007                                         EMIT(OBACK_, i);
1008                                         assert(p->pbegin[i] != 0);
1009                                         assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
1010                                         assert(OP(p->strip[p->pend[i]]) == ORPAREN);
1011                                         (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
1012                                         EMIT(O_BACK, i);
1013                               } else
1014                                         SETERROR(REG_ESUBREG);
1015                               p->g->backrefs = 1;
1016                               break;
1017                     case '*':
1018                               /*
1019                                * Ordinary if used as the first character beyond BOL anchor of
1020                                * a (sub-)expression, counts as a bad repetition operator if it
1021                                * appears otherwise.
1022                                */
1023                               (void)REQUIRE(bc->nchain == 0, REG_BADRPT);
1024                               /* FALLTHROUGH */
1025                     default:
1026                               if (p->error != 0)
1027                                         return (false);     /* Definitely not $... */
1028                               p->next--;
1029                               wc = WGETNEXT();
1030                               if ((c & BACKSL) == 0 || may_escape(p, wc))
1031                                         ordinary(p, wc);
1032                               else
1033                                         SETERROR(REG_EESCAPE);
1034                               break;
1035                     }
1036           }
1037 
1038           if (EAT('*')) {               /* implemented as +? */
1039                     /* this case does not require the (y|) trick, noKLUDGE */
1040                     INSERT(OPLUS_, pos);
1041                     ASTERN(O_PLUS, pos);
1042                     INSERT(OQUEST_, pos);
1043                     ASTERN(O_QUEST, pos);
1044 #ifdef REGEX_GNU_EXTENSIONS
1045           } else if (p->gnuext && EATTWO('\\', '?')) {
1046                     INSERT(OQUEST_, pos);
1047                     ASTERN(O_QUEST, pos);
1048           } else if (p->gnuext && EATTWO('\\', '+')) {
1049                     INSERT(OPLUS_, pos);
1050                     ASTERN(O_PLUS, pos);
1051 #endif
1052           } else if (EATTWO('\\', '{')) {
1053                     count = p_count(p);
1054                     if (EAT(',')) {
1055                               if (MORE() && isdigit((uch)PEEK())) {
1056                                         count2 = p_count(p);
1057                                         (void)REQUIRE(count <= count2, REG_BADBR);
1058                               } else              /* single number with comma */
1059                                         count2 = INFINITY;
1060                     } else              /* just a single number */
1061                               count2 = count;
1062                     repeat(p, pos, count, count2);
1063                     if (!EATTWO('\\', '}')) {     /* error heuristics */
1064                               while (MORE() && !SEETWO('\\', '}'))
1065                                         NEXT();
1066                               (void)REQUIRE(MORE(), REG_EBRACE);
1067                               SETERROR(REG_BADBR);
1068                     }
1069           } else if (c == '$')     /* $ (but not \$) ends it */
1070                     return (true);
1071 
1072           return (false);
1073 }
1074 
1075 /*
1076  - p_count - parse a repetition count
1077  == static int p_count(struct parse *p);
1078  */
1079 static int                              /* the value */
p_count(struct parse * p)1080 p_count(struct parse *p)
1081 {
1082           int count = 0;
1083           int ndigits = 0;
1084 
1085           while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
1086                     count = count*10 + ((uch)GETNEXT() - '0');
1087                     ndigits++;
1088           }
1089 
1090           (void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
1091           return(count);
1092 }
1093 
1094 /*
1095  - p_bracket - parse a bracketed character list
1096  == static void p_bracket(struct parse *p);
1097  */
1098 static void
p_bracket(struct parse * p)1099 p_bracket(struct parse *p)
1100 {
1101           cset *cs;
1102           wint_t ch;
1103 
1104           /* Dept of Truly Sickening Special-Case Kludges */
1105           if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
1106                     EMIT(OBOW, 0);
1107                     NEXTn(6);
1108                     return;
1109           }
1110           if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
1111                     EMIT(OEOW, 0);
1112                     NEXTn(6);
1113                     return;
1114           }
1115 
1116           if ((cs = allocset(p)) == NULL)
1117                     return;
1118 
1119           if (p->g->cflags&REG_ICASE)
1120                     cs->icase = 1;
1121           if (EAT('^'))
1122                     cs->invert = 1;
1123           if (EAT(']'))
1124                     CHadd(p, cs, ']');
1125           else if (EAT('-'))
1126                     CHadd(p, cs, '-');
1127           while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
1128                     p_b_term(p, cs);
1129           if (EAT('-'))
1130                     CHadd(p, cs, '-');
1131           (void)MUSTEAT(']', REG_EBRACK);
1132 
1133           if (p->error != 0)  /* don't mess things up further */
1134                     return;
1135 
1136           if (cs->invert && p->g->cflags&REG_NEWLINE)
1137                     cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
1138 
1139           if ((ch = singleton(cs)) != OUT) {      /* optimize singleton sets */
1140                     ordinary(p, ch);
1141                     freeset(p, cs);
1142           } else
1143                     EMIT(OANYOF, (size_t)(cs - p->g->sets));
1144 }
1145 
1146 static int
p_range_cmp(wchar_t c1,wchar_t c2)1147 p_range_cmp(wchar_t c1, wchar_t c2)
1148 {
1149 #ifdef REGEX_LIBC_COLLATE
1150           return __wcollate_range_cmp(c1, c2);
1151 #elif defined(NLS)
1152           /* Copied from libc/collate __wcollate_range_cmp */
1153           wchar_t s1[2], s2[2];
1154 
1155           s1[0] = c1;
1156           s1[1] = L'\0';
1157           s2[0] = c2;
1158           s2[1] = L'\0';
1159           return wcscoll(s1, s2);
1160 #else
1161           char s1[2], s2[2];
1162 
1163           s1[0] = (char)c1;
1164           s1[1] = '\0';
1165           s2[0] = (char)c2;
1166           s2[1] = '\0';
1167           return strcoll(s1, s2);
1168 #endif
1169 }
1170 
1171 /*
1172  - p_b_term - parse one term of a bracketed character list
1173  == static void p_b_term(struct parse *p, cset *cs);
1174  */
1175 static void
p_b_term(struct parse * p,cset * cs)1176 p_b_term(struct parse *p, cset *cs)
1177 {
1178           char c;
1179           wint_t start, finish;
1180           wint_t i;
1181 #ifdef REGEX_LIBC_COLLATE
1182           struct xlocale_collate *table =
1183                     (struct xlocale_collate*)__get_locale()->components[XLC_COLLATE];
1184 #endif
1185 
1186           _DIAGASSERT(p != NULL);
1187           _DIAGASSERT(cs != NULL);
1188 
1189           /* classify what we've got */
1190           switch ((MORE()) ? PEEK() : '\0') {
1191           case '[':
1192                     c = (MORE2()) ? PEEK2() : '\0';
1193                     break;
1194           case '-':
1195                     SETERROR(REG_ERANGE);
1196                     return;                       /* NOTE RETURN */
1197           default:
1198                     c = '\0';
1199                     break;
1200           }
1201 
1202           switch (c) {
1203           case ':':           /* character class */
1204                     NEXT2();
1205                     (void)REQUIRE(MORE(), REG_EBRACK);
1206                     c = PEEK();
1207                     (void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
1208                     p_b_cclass(p, cs);
1209                     (void)REQUIRE(MORE(), REG_EBRACK);
1210                     (void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
1211                     break;
1212           case '=':           /* equivalence class */
1213                     NEXT2();
1214                     (void)REQUIRE(MORE(), REG_EBRACK);
1215                     c = PEEK();
1216                     (void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
1217                     p_b_eclass(p, cs);
1218                     (void)REQUIRE(MORE(), REG_EBRACK);
1219                     (void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
1220                     break;
1221           default:            /* symbol, ordinary character, or range */
1222                     start = p_b_symbol(p);
1223                     if (SEE('-') && MORE2() && PEEK2() != ']') {
1224                               /* range */
1225                               NEXT();
1226                               if (EAT('-'))
1227                                         finish = '-';
1228                               else
1229                                         finish = p_b_symbol(p);
1230                     } else
1231                               finish = start;
1232                     if (start == finish)
1233                               CHadd(p, cs, start);
1234                     else {
1235 #ifdef REGEX_LIBC_COLLATE
1236                               if (table->__collate_load_error || MB_CUR_MAX > 1) {
1237 #else
1238                               if (MB_CUR_MAX > 1) {
1239 #endif
1240                                         (void)REQUIRE(start <= finish, REG_ERANGE);
1241                                         CHaddrange(p, cs, start, finish);
1242                               } else {
1243                                         (void)REQUIRE(p_range_cmp(start, finish) <= 0, REG_ERANGE);
1244                                         for (i = 0; i <= UCHAR_MAX; i++) {
1245                                                   if (p_range_cmp(start, i) <= 0 &&
1246                                                       p_range_cmp(i, finish) <= 0 )
1247                                                             CHadd(p, cs, i);
1248                                         }
1249                               }
1250                     }
1251                     break;
1252           }
1253 }
1254 
1255 #ifdef REGEX_GNU_EXTENSIONS
1256 /*
1257  - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S)
1258  == static int p_b_pseudoclass(struct parse *p, char c)
1259  */
1260 static int
1261 p_b_pseudoclass(struct parse *p, char c) {
1262           cset *cs;
1263 
1264           if ((cs = allocset(p)) == NULL)
1265                     return(0);
1266 
1267           if (p->g->cflags&REG_ICASE)
1268                     cs->icase = 1;
1269 
1270           switch (c) {
1271           case 'W':
1272                     cs->invert = 1;
1273                     /* FALLTHROUGH */
1274           case 'w':
1275                     p_b_cclass_named(p, cs, "alnum");
1276                     break;
1277           case 'S':
1278                     cs->invert = 1;
1279                     /* FALLTHROUGH */
1280           case 's':
1281                     p_b_cclass_named(p, cs, "space");
1282                     break;
1283           default:
1284                     return(0);
1285           }
1286 
1287           EMIT(OANYOF, (size_t)(cs - p->g->sets));
1288           return(1);
1289 }
1290 #endif
1291 
1292 /*
1293  - p_b_cclass - parse a character-class name and deal with it
1294  == static void p_b_cclass(struct parse *p, cset *cs);
1295  */
1296 static void
1297 p_b_cclass(struct parse *p, cset *cs)
1298 {
1299           const char *sp = p->next;
1300           size_t len;
1301           char clname[16];
1302 
1303           while (MORE() && isalpha((uch)PEEK()))
1304                     NEXT();
1305           len = p->next - sp;
1306           if (len >= sizeof(clname) - 1) {
1307                     SETERROR(REG_ECTYPE);
1308                     return;
1309           }
1310           memcpy(clname, sp, len);
1311           clname[len] = '\0';
1312 
1313           p_b_cclass_named(p, cs, clname);
1314 }
1315 
1316 /*
1317  - p_b_cclass_named - deal with a named character class
1318  == static void p_b_cclass_named(struct parse *p, cset *cs, const char []);
1319  */
1320 static void
1321 p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) {
1322           wctype_t wct;
1323 
1324           if ((wct = wctype(clname)) == 0) {
1325                     SETERROR(REG_ECTYPE);
1326                     return;
1327           }
1328           CHaddtype(p, cs, wct);
1329 }
1330 
1331 /*
1332  - p_b_eclass - parse an equivalence-class name and deal with it
1333  == static void p_b_eclass(struct parse *p, cset *cs);
1334  *
1335  * This implementation is incomplete. xxx
1336  */
1337 static void
1338 p_b_eclass(struct parse *p, cset *cs)
1339 {
1340           wint_t c;
1341 
1342           _DIAGASSERT(p != NULL);
1343           _DIAGASSERT(cs != NULL);
1344 
1345           c = p_b_coll_elem(p, '=');
1346           CHadd(p, cs, c);
1347 }
1348 
1349 /*
1350  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
1351  == static wint_t p_b_symbol(struct parse *p);
1352  */
1353 static wint_t                           /* value of symbol */
1354 p_b_symbol(struct parse *p)
1355 {
1356           wint_t value;
1357 
1358           _DIAGASSERT(p != NULL);
1359 
1360           (void)REQUIRE(MORE(), REG_EBRACK);
1361           if (!EATTWO('[', '.'))
1362                     return(WGETNEXT());
1363 
1364           /* collating symbol */
1365           value = p_b_coll_elem(p, '.');
1366           (void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
1367           return(value);
1368 }
1369 
1370 /*
1371  - p_b_coll_elem - parse a collating-element name and look it up
1372  == static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
1373  */
1374 static wint_t                           /* value of collating element */
1375 p_b_coll_elem(struct parse *p,
1376           wint_t endc)                  /* name ended by endc,']' */
1377 {
1378           const char *sp = p->next;
1379           struct cname *cp;
1380           size_t len;
1381 
1382           _DIAGASSERT(p != NULL);
1383 
1384           while (MORE() && !SEETWO(endc, ']'))
1385                     NEXT();
1386           if (!MORE()) {
1387                     SETERROR(REG_EBRACK);
1388                     return(0);
1389           }
1390           len = p->next - sp;
1391           for (cp = cnames; cp->name != NULL; cp++)
1392                     if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len)
1393                               return(cp->code);   /* known name */
1394 #ifdef NLS
1395           mbstate_t mbs;
1396           wchar_t wc;
1397           size_t clen;
1398 
1399           memset(&mbs, 0, sizeof(mbs));
1400           if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
1401                     return (wc);                            /* single character */
1402           else if (clen == (size_t)-1 || clen == (size_t)-2)
1403                     SETERROR(REG_ILLSEQ);
1404           else
1405                     SETERROR(REG_ECOLLATE);                 /* neither */
1406           return(0);
1407 #else
1408           if (len == 1)
1409                     return *sp;    /* single character */
1410           SETERROR(REG_ECOLLATE);                 /* neither */
1411           return 0;
1412 #endif
1413 }
1414 
1415 /*
1416  - may_escape - determine whether 'ch' is escape-able in the current context
1417  == static int may_escape(struct parse *p, const wint_t ch)
1418  */
1419 static bool
1420 may_escape(struct parse *p, const wint_t ch)
1421 {
1422 
1423           if ((p->pflags & PFLAG_LEGACY_ESC) != 0)
1424                     return (true);
1425           if (iswalpha(ch) || ch == '\'' || ch == '`')
1426                     return (false);
1427           return (true);
1428 #ifdef NOTYET
1429           /*
1430            * Build a whitelist of characters that may be escaped to produce an
1431            * ordinary in the current context. This assumes that these have not
1432            * been otherwise interpreted as a special character. Escaping an
1433            * ordinary character yields undefined results according to
1434            * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take
1435            * advantage of this and use escaped ordinary characters to provide
1436            * special meaning, e.g. \b, \B, \w, \W, \s, \S.
1437            */
1438           switch(ch) {
1439           case '|':
1440           case '+':
1441           case '?':
1442                     /* The above characters may not be escaped in BREs */
1443                     if (!(p->g->cflags&REG_EXTENDED))
1444                               return (false);
1445                     /* Fallthrough */
1446           case '(':
1447           case ')':
1448           case '{':
1449           case '}':
1450           case '.':
1451           case '[':
1452           case ']':
1453           case '\\':
1454           case '*':
1455           case '^':
1456           case '$':
1457                     return (true);
1458           default:
1459                     return (false);
1460           }
1461 #endif
1462 }
1463 
1464 /*
1465  - othercase - return the case counterpart of an alphabetic
1466  == static wint_t othercase(wint_t ch);
1467  */
1468 static wint_t                           /* if no counterpart, return ch */
1469 othercase(wint_t ch)
1470 {
1471           assert(iswalpha(ch));
1472           if (iswupper(ch))
1473                     return(towlower(ch));
1474           else if (iswlower(ch))
1475                     return(towupper(ch));
1476           else                          /* peculiar, but could happen */
1477                     return(ch);
1478 }
1479 
1480 /*
1481  - bothcases - emit a dualcase version of a two-case character
1482  == static void bothcases(struct parse *p, wint_t ch);
1483  *
1484  * Boy, is this implementation ever a kludge...
1485  */
1486 static void
1487 bothcases(struct parse *p, wint_t ch)
1488 {
1489           const char *oldnext = p->next;
1490           const char *oldend = p->end;
1491           char bracket[3 + MB_LEN_MAX];
1492           size_t n;
1493 
1494           _DIAGASSERT(p != NULL);
1495 
1496           assert(othercase(ch) != ch);  /* p_bracket() would recurse */
1497           p->next = bracket;
1498 #ifdef NLS
1499           mbstate_t mbs;
1500           memset(&mbs, 0, sizeof(mbs));
1501           n = wcrtomb(bracket, ch, &mbs);
1502           assert(n != (size_t)-1);
1503 #else
1504           n = 0;
1505           bracket[n++] = ch;
1506 #endif
1507           bracket[n] = ']';
1508           bracket[n + 1] = '\0';
1509           p->end = bracket+n+1;
1510           p_bracket(p);
1511           assert(p->next == p->end);
1512           p->next = oldnext;
1513           p->end = oldend;
1514 }
1515 
1516 /*
1517  - ordinary - emit an ordinary character
1518  == static void ordinary(struct parse *p, wint_t ch);
1519  */
1520 static void
1521 ordinary(struct parse *p, wint_t ch)
1522 {
1523           cset *cs;
1524 
1525           _DIAGASSERT(p != NULL);
1526 
1527           if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
1528                     bothcases(p, ch);
1529           else if ((wint_t)(ch & OPDMASK) == ch)
1530                     EMIT(OCHAR, (size_t)ch);
1531           else {
1532                     /*
1533                      * Kludge: character is too big to fit into an OCHAR operand.
1534                      * Emit a singleton set.
1535                      */
1536                     if ((cs = allocset(p)) == NULL)
1537                               return;
1538                     CHadd(p, cs, ch);
1539                     EMIT(OANYOF, (size_t)(cs - p->g->sets));
1540           }
1541 }
1542 
1543 /*
1544  - nonnewline - emit REG_NEWLINE version of OANY
1545  == static void nonnewline(struct parse *p);
1546  *
1547  * Boy, is this implementation ever a kludge...
1548  */
1549 static void
1550 nonnewline(struct parse *p)
1551 {
1552           const char *oldnext = p->next;
1553           const char *oldend = p->end;
1554           char bracket[4];
1555 
1556           _DIAGASSERT(p != NULL);
1557 
1558           p->next = bracket;
1559           p->end = bracket+3;
1560           bracket[0] = '^';
1561           bracket[1] = '\n';
1562           bracket[2] = ']';
1563           bracket[3] = '\0';
1564           p_bracket(p);
1565           assert(p->next == bracket+3);
1566           p->next = oldnext;
1567           p->end = oldend;
1568 }
1569 
1570 /*
1571  - repeat - generate code for a bounded repetition, recursively if needed
1572  == static void repeat(struct parse *p, sopno start, int from, int to);
1573  */
1574 static void
1575 repeat(struct parse *p,
1576           sopno start,                  /* operand from here to end of strip */
1577           int from,           /* repeated from this number */
1578           int to)                       /* to this number of times (maybe INFINITY) */
1579 {
1580           sopno finish = HERE();
1581 #         define    N         2
1582 #         define    INF       3
1583 #         define    REP(f, t) ((f)*8 + (t))
1584 #         define    MAP(n)    (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1585           sopno copy;
1586 
1587           _DIAGASSERT(p != NULL);
1588 
1589           if (p->error != 0)  /* head off possible runaway recursion */
1590                     return;
1591 
1592           assert(from <= to);
1593 
1594           switch (REP(MAP(from), MAP(to))) {
1595           case REP(0, 0):                         /* must be user doing this */
1596                     DROP(finish-start); /* drop the operand */
1597                     break;
1598           case REP(0, 1):                         /* as x{1,1}? */
1599           case REP(0, N):                         /* as x{1,n}? */
1600           case REP(0, INF):             /* as x{1,}? */
1601                     /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1602                     INSERT(OCH_, start);                    /* offset is wrong... */
1603                     repeat(p, start+1, 1, to);
1604                     ASTERN(OOR1, start);
1605                     AHEAD(start);                           /* ... fix it */
1606                     EMIT(OOR2, 0);
1607                     AHEAD(THERE());
1608                     ASTERN(O_CH, THERETHERE());
1609                     break;
1610           case REP(1, 1):                         /* trivial case */
1611                     /* done */
1612                     break;
1613           case REP(1, N):                         /* as x?x{1,n-1} */
1614                     /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1615                     INSERT(OCH_, start);
1616                     ASTERN(OOR1, start);
1617                     AHEAD(start);
1618                     EMIT(OOR2, 0);                          /* offset very wrong... */
1619                     AHEAD(THERE());                         /* ...so fix it */
1620                     ASTERN(O_CH, THERETHERE());
1621                     copy = dupl(p, start+1, finish+1);
1622                     assert(copy == finish+4);
1623                     repeat(p, copy, 1, to-1);
1624                     break;
1625           case REP(1, INF):             /* as x+ */
1626                     INSERT(OPLUS_, start);
1627                     ASTERN(O_PLUS, start);
1628                     break;
1629           case REP(N, N):                         /* as xx{m-1,n-1} */
1630                     copy = dupl(p, start, finish);
1631                     repeat(p, copy, from-1, to-1);
1632                     break;
1633           case REP(N, INF):             /* as xx{n-1,INF} */
1634                     copy = dupl(p, start, finish);
1635                     repeat(p, copy, from-1, to);
1636                     break;
1637           default:                      /* "can't happen" */
1638                     SETERROR(REG_ASSERT);         /* just in case */
1639                     break;
1640           }
1641 }
1642 
1643 /*
1644  - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
1645  - character from the parse struct, signals a REG_ILLSEQ error if the
1646  - character can't be converted. Returns the number of bytes consumed.
1647  */
1648 static wint_t
1649 wgetnext(struct parse *p)
1650 {
1651 #ifdef NLS
1652           mbstate_t mbs;
1653           wchar_t wc;
1654           size_t n;
1655 
1656           memset(&mbs, 0, sizeof(mbs));
1657           n = mbrtowc(&wc, p->next, (size_t)(p->end - p->next), &mbs);
1658           if (n == (size_t)-1 || n == (size_t)-2) {
1659                     SETERROR(REG_ILLSEQ);
1660                     return (0);
1661           }
1662           if (n == 0)
1663                     n = 1;
1664           p->next += n;
1665           return wc;
1666 #else
1667           return *p->next++;
1668 #endif
1669 }
1670 
1671 /*
1672  - seterr - set an error condition
1673  == static int seterr(struct parse *p, int e);
1674  */
1675 static int                              /* useless but makes type checking happy */
1676 seterr(struct parse *p, int e)
1677 {
1678 
1679           _DIAGASSERT(p != NULL);
1680 
1681           if (p->error == 0)  /* keep earliest error condition */
1682                     p->error = e;
1683           p->next = nuls;               /* try to bring things to a halt */
1684           p->end = nuls;
1685           return(0);                    /* make the return value well-defined */
1686 }
1687 
1688 /*
1689  - allocset - allocate a set of characters for []
1690  == static cset *allocset(struct parse *p);
1691  */
1692 static cset *
1693 allocset(struct parse *p)
1694 {
1695           cset *cs, *ncs;
1696 
1697           _DIAGASSERT(p != NULL);
1698 
1699           ncs = reallocarray(p->g->sets, p->g->ncsets + 1, sizeof(*ncs));
1700           if (ncs == NULL) {
1701                     SETERROR(REG_ESPACE);
1702                     return (NULL);
1703           }
1704           p->g->sets = ncs;
1705           cs = &p->g->sets[p->g->ncsets++];
1706           memset(cs, 0, sizeof(*cs));
1707 
1708           return(cs);
1709 }
1710 
1711 /*
1712  - freeset - free a now-unused set
1713  == static void freeset(struct parse *p, cset *cs);
1714  */
1715 static void
1716 freeset(struct parse *p, cset *cs)
1717 {
1718           cset *top;
1719 
1720           _DIAGASSERT(p != NULL);
1721           _DIAGASSERT(cs != NULL);
1722 
1723           top = &p->g->sets[p->g->ncsets];
1724 
1725           free(cs->wides);
1726           free(cs->ranges);
1727           free(cs->types);
1728           memset(cs, 0, sizeof(*cs));
1729           if (cs == top-1)    /* recover only the easy case */
1730                     p->g->ncsets--;
1731 }
1732 
1733 /*
1734  - singleton - Determine whether a set contains only one character,
1735  - returning it if so, otherwise returning OUT.
1736  */
1737 static wint_t
1738 singleton(cset *cs)
1739 {
1740           wint_t i, s, n;
1741 
1742           for (i = n = 0; i < NC; i++)
1743                     if (CHIN(cs, i)) {
1744                               n++;
1745                               s = i;
1746                     }
1747           if (n == 1)
1748                     return (s);
1749           if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
1750               cs->icase == 0)
1751                     return (cs->wides[0]);
1752           /* Don't bother handling the other cases. */
1753           return (OUT);
1754 }
1755 
1756 /*
1757  - CHadd - add character to character set.
1758  */
1759 static void
1760 CHadd(struct parse *p, cset *cs, wint_t ch)
1761 {
1762           wint_t nch, *newwides;
1763 
1764           _DIAGASSERT(p != NULL);
1765           _DIAGASSERT(cs != NULL);
1766 
1767           if ((unsigned)ch < NC)
1768                     cs->bmp[(unsigned)ch >> 3] |= 1 << (ch & 7);
1769           else {
1770                     newwides = reallocarray(cs->wides, cs->nwides + 1,
1771                         sizeof(*cs->wides));
1772                     if (newwides == NULL) {
1773                               SETERROR(REG_ESPACE);
1774                               return;
1775                     }
1776                     cs->wides = newwides;
1777                     cs->wides[cs->nwides++] = ch;
1778           }
1779           if (cs->icase) {
1780                     if ((unsigned)(nch = towlower(ch)) < NC)
1781                               cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
1782                     if ((unsigned)(nch = towupper(ch)) < NC)
1783                               cs->bmp[(unsigned)nch >> 3] |= 1 << (nch & 7);
1784           }
1785 }
1786 
1787 /*
1788  - CHaddrange - add all characters in the range [min,max] to a character set.
1789  */
1790 static void
1791 CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
1792 {
1793           crange *newranges;
1794 
1795           _DIAGASSERT(p != NULL);
1796           _DIAGASSERT(cs != NULL);
1797 
1798           for (; min < NC && min <= max; min++)
1799                     CHadd(p, cs, min);
1800           if (min >= max)
1801                     return;
1802           newranges = reallocarray(cs->ranges, cs->nranges + 1,
1803               sizeof(*cs->ranges));
1804           if (newranges == NULL) {
1805                     SETERROR(REG_ESPACE);
1806                     return;
1807           }
1808           cs->ranges = newranges;
1809           cs->ranges[cs->nranges].min = min;
1810           cs->ranges[cs->nranges].max = max;
1811           cs->nranges++;
1812 }
1813 
1814 /*
1815  - CHaddtype - add all characters of a certain type to a character set.
1816  */
1817 static void
1818 CHaddtype(struct parse *p, cset *cs, wctype_t wct)
1819 {
1820           wint_t i;
1821           wctype_t *newtypes;
1822 
1823           _DIAGASSERT(p != NULL);
1824           _DIAGASSERT(cs != NULL);
1825 
1826           for (i = 0; i < NC; i++)
1827                     if (iswctype(i, wct))
1828                               CHadd(p, cs, i);
1829           newtypes = reallocarray(cs->types, cs->ntypes + 1,
1830               sizeof(*cs->types));
1831           if (newtypes == NULL) {
1832                     SETERROR(REG_ESPACE);
1833                     return;
1834           }
1835           cs->types = newtypes;
1836           cs->types[cs->ntypes++] = wct;
1837 }
1838 
1839 /*
1840  - dupl - emit a duplicate of a bunch of sops
1841  == static sopno dupl(struct parse *p, sopno start, sopno finish);
1842  */
1843 static sopno                            /* start of duplicate */
1844 dupl(struct parse *p,
1845           sopno start,                  /* from here */
1846           sopno finish)                 /* to this less one */
1847 {
1848           sopno ret = HERE();
1849           sopno len = finish - start;
1850 
1851           _DIAGASSERT(p != NULL);
1852 
1853           assert(finish >= start);
1854           if (len == 0)
1855                     return(ret);
1856           if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
1857                     return(ret);
1858           (void) memcpy(p->strip + p->slen,
1859               p->strip + start, len * sizeof(*p->strip));
1860           p->slen += len;
1861           return(ret);
1862 }
1863 
1864 /*
1865  - doemit - emit a strip operator
1866  == static void doemit(struct parse *p, sop op, size_t opnd);
1867  *
1868  * It might seem better to implement this as a macro with a function as
1869  * hard-case backup, but it's just too big and messy unless there are
1870  * some changes to the data structures.  Maybe later.
1871  */
1872 static void
1873 doemit(struct parse *p, sop op, size_t opnd)
1874 {
1875           /* avoid making error situations worse */
1876           if (p->error != 0)
1877                     return;
1878 
1879           _DIAGASSERT(p != NULL);
1880 
1881           /* deal with oversize operands ("can't happen", more or less) */
1882           assert(opnd < 1<<OPSHIFT);
1883 
1884           /* deal with undersized strip */
1885           if (p->slen >= p->ssize)
1886                     if (!enlarge(p, (p->ssize+1) / 2 * 3))  /* +50% */
1887                               return;
1888 
1889           /* finally, it's all reduced to the easy case */
1890           p->strip[p->slen++] = (sopno)SOP(op, opnd);
1891 }
1892 
1893 /*
1894  - doinsert - insert a sop into the strip
1895  == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1896  */
1897 static void
1898 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1899 {
1900           sopno sn;
1901           sop s;
1902           int i;
1903 
1904           _DIAGASSERT(p != NULL);
1905 
1906           /* avoid making error situations worse */
1907           if (p->error != 0)
1908                     return;
1909 
1910           sn = HERE();
1911           EMIT(op, opnd);               /* do checks, ensure space */
1912           assert(HERE() == sn+1);
1913           s = p->strip[sn];
1914 
1915           /* adjust paren pointers */
1916           assert(pos > 0);
1917           for (i = 1; i < NPAREN; i++) {
1918                     if (p->pbegin[i] >= pos) {
1919                               p->pbegin[i]++;
1920                     }
1921                     if (p->pend[i] >= pos) {
1922                               p->pend[i]++;
1923                     }
1924           }
1925 
1926           memmove(&p->strip[pos+1], &p->strip[pos],
1927               (HERE()-pos-1)*sizeof(*p->strip));
1928           p->strip[pos] = s;
1929 }
1930 
1931 /*
1932  - dofwd - complete a forward reference
1933  == static void dofwd(struct parse *p, sopno pos, sop value);
1934  */
1935 static void
1936 dofwd(struct parse *p, sopno pos, sop value)
1937 {
1938 
1939           _DIAGASSERT(p != NULL);
1940 
1941           /* avoid making error situations worse */
1942           if (p->error != 0)
1943                     return;
1944 
1945           assert(value < 1<<OPSHIFT);
1946           p->strip[pos] = OP(p->strip[pos]) | value;
1947 }
1948 
1949 /*
1950  - enlarge - enlarge the strip
1951  == static int enlarge(struct parse *p, sopno size);
1952  */
1953 static int
1954 enlarge(struct parse *p, sopno size)
1955 {
1956           sop *sp;
1957 
1958           _DIAGASSERT(p != NULL);
1959 
1960           if (p->ssize >= size)
1961                     return 1;
1962 
1963           sp = reallocarray(p->strip, size, sizeof(*p->strip));
1964           if (sp == NULL) {
1965                     SETERROR(REG_ESPACE);
1966                     return 0;
1967           }
1968           p->strip = sp;
1969           p->ssize = size;
1970           return 1;
1971 }
1972 
1973 /*
1974  - stripsnug - compact the strip
1975  == static void stripsnug(struct parse *p, struct re_guts *g);
1976  */
1977 static void
1978 stripsnug(struct parse *p, struct re_guts *g)
1979 {
1980 
1981           _DIAGASSERT(p != NULL);
1982           _DIAGASSERT(g != NULL);
1983 
1984           g->nstates = p->slen;
1985           g->strip = reallocarray(p->strip, p->slen, sizeof(*p->strip));
1986           if (g->strip == NULL) {
1987                     SETERROR(REG_ESPACE);
1988                     g->strip = p->strip;
1989           }
1990 }
1991 
1992 /*
1993  - findmust - fill in must and mlen with longest mandatory literal string
1994  == static void findmust(struct parse *p, struct re_guts *g);
1995  *
1996  * This algorithm could do fancy things like analyzing the operands of |
1997  * for common subsequences.  Someday.  This code is simple and finds most
1998  * of the interesting cases.
1999  *
2000  * Note that must and mlen got initialized during setup.
2001  */
2002 static void
2003 findmust(struct parse *p, struct re_guts *g)
2004 {
2005           sop *scan;
2006           sop *start = NULL;
2007           sop *newstart = NULL;
2008           sopno newlen;
2009           sop s;
2010           char *cp;
2011           int offset;
2012           mbstate_t mbs;
2013 
2014           _DIAGASSERT(p != NULL);
2015           _DIAGASSERT(g != NULL);
2016 
2017           /* avoid making error situations worse */
2018           if (p->error != 0)
2019                     return;
2020 
2021 #ifdef notyet
2022           /*
2023            * It's not generally safe to do a ``char'' substring search on
2024            * multibyte character strings, but it's safe for at least
2025            * UTF-8 (see RFC 3629).
2026            */
2027           if (MB_CUR_MAX > 1 &&
2028               strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
2029                     return;
2030 #endif
2031 
2032           /* find the longest OCHAR sequence in strip */
2033           newlen = 0;
2034           offset = 0;
2035           g->moffset = 0;
2036           scan = g->strip + 1;
2037           do {
2038                     s = *scan++;
2039                     switch (OP(s)) {
2040                     case OCHAR:                   /* sequence member */
2041                               if (newlen == 0) {            /* new sequence */
2042                                         memset(&mbs, 0, sizeof(mbs));
2043                                         newstart = scan - 1;
2044                               }
2045 #ifdef NLS
2046                               char buf[MB_LEN_MAX];
2047                               size_t clen = wcrtomb(buf, (int)OPND(s), &mbs);
2048                               if (clen == (size_t)-1)
2049                                         goto toohard;
2050                               newlen += (sopno)clen;
2051 #else
2052                               newlen++;
2053 #endif
2054                               break;
2055                     case OPLUS_:                  /* things that don't break one */
2056                     case OLPAREN:
2057                     case ORPAREN:
2058                               break;
2059                     case OQUEST_:                 /* things that must be skipped */
2060                     case OCH_:
2061                               offset = altoffset(scan, offset);
2062                               scan--;
2063                               do {
2064                                         scan += OPND(s);
2065                                         s = *scan;
2066                                         /* assert() interferes w debug printouts */
2067                                         if (OP(s) != O_QUEST &&
2068                                             OP(s) != O_CH && OP(s) != OOR2) {
2069                                                   g->iflags |= BAD;
2070                                                   return;
2071                                         }
2072                               } while (OP(s) != O_QUEST && OP(s) != O_CH);
2073                               /* FALLTHROUGH */
2074                     case OBOW:                    /* things that break a sequence */
2075                     case OEOW:
2076                     case OBOL:
2077                     case OEOL:
2078                     case OBOS:
2079                     case OEOS:
2080                     case OWBND:
2081                     case ONWBND:
2082                     case O_QUEST:
2083                     case O_CH:
2084                     case OEND:
2085                               if (newlen > (sopno)g->mlen) {                    /* ends one */
2086                                         start = newstart;
2087                                         g->mlen = newlen;
2088                                         if (offset > -1) {
2089                                                   g->moffset += offset;
2090                                                   offset = newlen;
2091                                         } else
2092                                                   g->moffset = offset;
2093                               } else {
2094                                         if (offset > -1)
2095                                                   offset += newlen;
2096                               }
2097                               newlen = 0;
2098                               break;
2099                     case OANY:
2100                               if (newlen > (sopno)g->mlen) {                    /* ends one */
2101                                         start = newstart;
2102                                         g->mlen = newlen;
2103                                         if (offset > -1) {
2104                                                   g->moffset += offset;
2105                                                   offset = newlen;
2106                                         } else
2107                                                   g->moffset = offset;
2108                               } else {
2109                                         if (offset > -1)
2110                                                   offset += newlen;
2111                               }
2112                               if (offset > -1)
2113                                         offset++;
2114                               newlen = 0;
2115                               break;
2116                     case OANYOF:                  /* may or may not invalidate offset */
2117                               /* First, everything as OANY */
2118                               if (newlen > (sopno)g->mlen) {                    /* ends one */
2119                                         start = newstart;
2120                                         g->mlen = newlen;
2121                                         if (offset > -1) {
2122                                                   g->moffset += offset;
2123                                                   offset = newlen;
2124                                         } else
2125                                                   g->moffset = offset;
2126                               } else {
2127                                         if (offset > -1)
2128                                                   offset += newlen;
2129                               }
2130                               if (offset > -1)
2131                                         offset++;
2132                               newlen = 0;
2133                               break;
2134 #ifdef NLS
2135                     toohard:/*FALLTHROUGH*/
2136 #endif
2137                     default:
2138                               /* Anything here makes it impossible or too hard
2139                                * to calculate the offset -- so we give up;
2140                                * save the last known good offset, in case the
2141                                * must sequence doesn't occur later.
2142                                */
2143                               if (newlen > (sopno)g->mlen) {                    /* ends one */
2144                                         start = newstart;
2145                                         g->mlen = newlen;
2146                                         if (offset > -1)
2147                                                   g->moffset += offset;
2148                                         else
2149                                                   g->moffset = offset;
2150                               }
2151                               offset = -1;
2152                               newlen = 0;
2153                               break;
2154                     }
2155           } while (OP(s) != OEND);
2156 
2157           if (g->mlen == 0) {           /* there isn't one */
2158                     g->moffset = -1;
2159                     return;
2160           }
2161 
2162           /* turn it into a character string */
2163           g->must = malloc((size_t)g->mlen + 1);
2164           if (g->must == NULL) {                  /* argh; just forget it */
2165                     g->mlen = 0;
2166                     g->moffset = -1;
2167                     return;
2168           }
2169           cp = g->must;
2170           scan = start;
2171           memset(&mbs, 0, sizeof(mbs));
2172           while (cp < g->must + g->mlen) {
2173                     while (OP(s = *scan++) != OCHAR)
2174                               continue;
2175 #ifdef NLS
2176                     size_t clen = wcrtomb(cp, (int)OPND(s), &mbs);
2177                     assert(clen != (size_t)-1);
2178                     cp += clen;
2179 #else
2180                     *cp++ = OPND(s);
2181 #endif
2182           }
2183           assert(cp == g->must + g->mlen);
2184           *cp++ = '\0';                 /* just on general principles */
2185 }
2186 
2187 /*
2188  - altoffset - choose biggest offset among multiple choices
2189  == static int altoffset(sop *scan, int offset);
2190  *
2191  * Compute, recursively if necessary, the largest offset among multiple
2192  * re paths.
2193  */
2194 static int
2195 altoffset(sop *scan, int offset)
2196 {
2197           int largest;
2198           int try;
2199           sop s;
2200 
2201           _DIAGASSERT(scan != NULL);
2202 
2203           /* If we gave up already on offsets, return */
2204           if (offset == -1)
2205                     return -1;
2206 
2207           largest = 0;
2208           try = 0;
2209           s = *scan++;
2210           while (OP(s) != O_QUEST && OP(s) != O_CH) {
2211                     switch (OP(s)) {
2212                     case OOR1:
2213                               if (try > largest)
2214                                         largest = try;
2215                               try = 0;
2216                               break;
2217                     case OQUEST_:
2218                     case OCH_:
2219                               try = altoffset(scan, try);
2220                               if (try == -1)
2221                                         return -1;
2222                               scan--;
2223                               do {
2224                                         scan += OPND(s);
2225                                         s = *scan;
2226                                         if (OP(s) != O_QUEST &&
2227                                             OP(s) != O_CH && OP(s) != OOR2)
2228                                                   return -1;
2229                               } while (OP(s) != O_QUEST && OP(s) != O_CH);
2230                               /* We must skip to the next position, or we'll
2231                                * leave altoffset() too early.
2232                                */
2233                               scan++;
2234                               break;
2235                     case OANYOF:
2236                     case OCHAR:
2237                     case OANY:
2238                               try++;
2239                               /*FALLTHROUGH*/
2240                     case OBOW:
2241                     case OEOW:
2242                     case OWBND:
2243                     case ONWBND:
2244                     case OLPAREN:
2245                     case ORPAREN:
2246                     case OOR2:
2247                               break;
2248                     default:
2249                               try = -1;
2250                               break;
2251                     }
2252                     if (try == -1)
2253                               return -1;
2254                     s = *scan++;
2255           }
2256 
2257           if (try > largest)
2258                     largest = try;
2259 
2260           return largest+offset;
2261 }
2262 
2263 /*
2264  - computejumps - compute char jumps for BM scan
2265  == static void computejumps(struct parse *p, struct re_guts *g);
2266  *
2267  * This algorithm assumes g->must exists and is has size greater than
2268  * zero. It's based on the algorithm found on Computer Algorithms by
2269  * Sara Baase.
2270  *
2271  * A char jump is the number of characters one needs to jump based on
2272  * the value of the character from the text that was mismatched.
2273  */
2274 static void
2275 computejumps(struct parse *p, struct re_guts *g)
2276 {
2277           int ch;
2278           size_t mindex;
2279 
2280           _DIAGASSERT(p != NULL);
2281           _DIAGASSERT(g != NULL);
2282 
2283           /* Avoid making errors worse */
2284           if (p->error != 0)
2285                     return;
2286 
2287           g->charjump = calloc((NC_MAX + 1), sizeof(*g->charjump));
2288           if (g->charjump == NULL)      /* Not a fatal error */
2289                     return;
2290           /* Adjust for signed chars, if necessary */
2291           g->charjump = &g->charjump[-(CHAR_MIN)];
2292 
2293           /* If the character does not exist in the pattern, the jump
2294            * is equal to the number of characters in the pattern.
2295            */
2296           for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
2297                     g->charjump[ch] = g->mlen;
2298 
2299           /* If the character does exist, compute the jump that would
2300            * take us to the last character in the pattern equal to it
2301            * (notice that we match right to left, so that last character
2302            * is the first one that would be matched).
2303            */
2304           for (mindex = 0; mindex < g->mlen; mindex++)
2305                     g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
2306 }
2307 
2308 /*
2309  - computematchjumps - compute match jumps for BM scan
2310  == static void computematchjumps(struct parse *p, struct re_guts *g);
2311  *
2312  * This algorithm assumes g->must exists and is has size greater than
2313  * zero. It's based on the algorithm found on Computer Algorithms by
2314  * Sara Baase.
2315  *
2316  * A match jump is the number of characters one needs to advance based
2317  * on the already-matched suffix.
2318  * Notice that all values here are minus (g->mlen-1), because of the way
2319  * the search algorithm works.
2320  */
2321 static void
2322 computematchjumps(struct parse *p, struct re_guts *g)
2323 {
2324           size_t mindex;                /* General "must" iterator */
2325           size_t suffix;                /* Keeps track of matching suffix */
2326           size_t ssuffix;               /* Keeps track of suffixes' suffix */
2327           size_t* pmatches;   /* pmatches[k] points to the next i
2328                                          * such that i+1...mlen is a substring
2329                                          * of k+1...k+mlen-i-1
2330                                          */
2331 
2332           _DIAGASSERT(p != NULL);
2333           _DIAGASSERT(g != NULL);
2334 
2335           /* Avoid making errors worse */
2336           if (p->error != 0)
2337                     return;
2338 
2339           pmatches = calloc(g->mlen, sizeof(*pmatches));
2340           if (pmatches == NULL) {
2341                     g->matchjump = NULL;
2342                     return;
2343           }
2344 
2345           g->matchjump = calloc(g->mlen, sizeof(*g->matchjump));
2346           if (g->matchjump == NULL) {   /* Not a fatal error */
2347                     free(pmatches);
2348                     return;
2349           }
2350 
2351           /* Set maximum possible jump for each character in the pattern */
2352           for (mindex = 0; mindex < g->mlen; mindex++)
2353                     g->matchjump[mindex] = 2 * g->mlen - mindex - 1;
2354 
2355           /* Compute pmatches[] */
2356           for (suffix = mindex = g->mlen; mindex-- > 0; suffix--) {
2357                     pmatches[mindex] = suffix;
2358 
2359                     /* If a mismatch is found, interrupting the substring,
2360                      * compute the matchjump for that position. If no
2361                      * mismatch is found, then a text substring mismatched
2362                      * against the suffix will also mismatch against the
2363                      * substring.
2364                      */
2365                     while (suffix < g->mlen
2366                         && g->must[mindex] != g->must[suffix]) {
2367                               g->matchjump[suffix] = MIN(g->matchjump[suffix],
2368                                   g->mlen - mindex - 1);
2369                               suffix = pmatches[suffix];
2370                     }
2371           }
2372 
2373           /* Compute the matchjump up to the last substring found to jump
2374            * to the beginning of the largest must pattern prefix matching
2375            * it's own suffix.
2376            */
2377           for (mindex = 0; mindex <= suffix; mindex++)
2378                     g->matchjump[mindex] = MIN(g->matchjump[mindex],
2379                         g->mlen + suffix - mindex);
2380 
2381         ssuffix = pmatches[suffix];
2382         while (suffix < g->mlen) {
2383                 while (suffix <= ssuffix && suffix < g->mlen) {
2384                         g->matchjump[suffix] = MIN(g->matchjump[suffix],
2385                                   g->mlen + ssuffix - suffix);
2386                         suffix++;
2387                 }
2388                     if (suffix < g->mlen)
2389                     ssuffix = pmatches[ssuffix];
2390         }
2391 
2392           free(pmatches);
2393 }
2394 
2395 /*
2396  - pluscount - count + nesting
2397  == static sopno pluscount(struct parse *p, struct re_guts *g);
2398  */
2399 static sopno                            /* nesting depth */
2400 pluscount(struct parse *p, struct re_guts *g)
2401 {
2402           sop *scan;
2403           sop s;
2404           sopno plusnest = 0;
2405           sopno maxnest = 0;
2406 
2407           _DIAGASSERT(p != NULL);
2408           _DIAGASSERT(g != NULL);
2409 
2410           if (p->error != 0)
2411                     return(0);          /* there may not be an OEND */
2412 
2413           scan = g->strip + 1;
2414           do {
2415                     s = *scan++;
2416                     switch (OP(s)) {
2417                     case OPLUS_:
2418                               plusnest++;
2419                               break;
2420                     case O_PLUS:
2421                               if (plusnest > maxnest)
2422                                         maxnest = plusnest;
2423                               plusnest--;
2424                               break;
2425                     }
2426           } while (OP(s) != OEND);
2427           if (plusnest != 0)
2428                     g->iflags |= BAD;
2429           return(maxnest);
2430 }
2431