1 /*
2  * regcomp and regexec -- regsub and regerror are elsewhere
3  *
4  *        Copyright (c) 1986 by University of Toronto.
5  *        Written by Henry Spencer.  Not derived from licensed software.
6  *
7  *        Permission is granted to anyone to use this software for any
8  *        purpose on any computer system, and to redistribute it freely,
9  *        subject to the following restrictions:
10  *
11  *        1. The author is not responsible for the consequences of use of
12  *                  this software, no matter how awful, even if they arise
13  *                  from defects in it.
14  *
15  *        2. The origin of this software must not be misrepresented, either
16  *                  by explicit claim or by omission.
17  *
18  *        3. Altered versions must be plainly marked as such, and must not
19  *                  be misrepresented as being the original software.
20  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
21  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22  *** to assist in implementing egrep.
23  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
24  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
25  *** as in BSD grep and ex.
26  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
27  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
29  *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
30  *
31  * Beware that some of this code is subtly aware of the way operator
32  * precedence is structured in regular expressions.  Serious changes in
33  * regular-expression syntax might require a total rethink.
34  */
35 
36 #include <sys/cdefs.h>
37 #if defined(LIBC_SCCS) && !defined(lint)
38 __RCSID("$NetBSD: regexp.c,v 1.19 2016/01/26 16:05:18 christos Exp $");
39 #endif /* LIBC_SCCS and not lint */
40 
41 #include <ctype.h>
42 #include <regexp.h>
43 #include <stdio.h>
44 #include <stdlib.h>
45 #include <string.h>
46 #include "regmagic.h"
47 
48 /*
49  * The "internal use only" fields in regexp.h are present to pass info from
50  * compile to execute that permits the execute phase to run lots faster on
51  * simple cases.  They are:
52  *
53  * regstart         char that must begin a match; '\0' if none obvious
54  * reganch          is the match anchored (at beginning-of-line only)?
55  * regmust          string (pointer into program) that match must include, or NULL
56  * regmlen          length of regmust string
57  *
58  * Regstart and reganch permit very fast decisions on suitable starting points
59  * for a match, cutting down the work a lot.  Regmust permits fast rejection
60  * of lines that cannot possibly match.  The regmust tests are costly enough
61  * that regcomp() supplies a regmust only if the r.e. contains something
62  * potentially expensive (at present, the only such thing detected is * or +
63  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
64  * supplied because the test in regexec() needs it and regcomp() is computing
65  * it anyway.
66  */
67 
68 /*
69  * Structure for regexp "program".  This is essentially a linear encoding
70  * of a nondeterministic finite-state machine (aka syntax charts or
71  * "railroad normal form" in parsing technology).  Each node is an opcode
72  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
73  * all nodes except BRANCH implement concatenation; a "next" pointer with
74  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
75  * have one of the subtle syntax dependencies:  an individual BRANCH (as
76  * opposed to a collection of them) is never concatenated with anything
77  * because of operator precedence.)  The operand of some types of node is
78  * a literal string; for others, it is a node leading into a sub-FSM.  In
79  * particular, the operand of a BRANCH node is the first node of the branch.
80  * (NB this is *not* a tree structure:  the tail of the branch connects
81  * to the thing following the set of BRANCHes.)  The opcodes are:
82  */
83 
84 /* definition       number    opnd?     meaning */
85 #define   END       0         /* no     End of program. */
86 #define   BOL       1         /* no     Match "" at beginning of line. */
87 #define   EOL       2         /* no     Match "" at end of line. */
88 #define   ANY       3         /* no     Match any one character. */
89 #define   ANYOF     4         /* str    Match any character in this string. */
90 #define   ANYBUT    5         /* str    Match any character not in this string. */
91 #define   BRANCH    6         /* node   Match this alternative, or the next... */
92 #define   BACK      7         /* no     Match "", "next" ptr points backward. */
93 #define   EXACTLY   8         /* str    Match this string. */
94 #define   NOTHING   9         /* no     Match empty string. */
95 #define   STAR      10        /* node   Match this (simple) thing 0 or more times. */
96 #define   PLUS      11        /* node   Match this (simple) thing 1 or more times. */
97 #define   WORDA     12        /* no     Match "" at wordchar, where prev is nonword */
98 #define   WORDZ     13        /* no     Match "" at nonwordchar, where prev is word */
99 #define   OPEN      20        /* no     Mark this point in input as start of #n. */
100                               /*        OPEN+1 is number 1, etc. */
101 #define   CLOSE     30        /* no     Analogous to OPEN. */
102 
103 /*
104  * Opcode notes:
105  *
106  * BRANCH The set of branches constituting a single choice are hooked
107  *                  together with their "next" pointers, since precedence prevents
108  *                  anything being concatenated to any individual branch.  The
109  *                  "next" pointer of the last BRANCH in a choice points to the
110  *                  thing following the whole choice.  This is also where the
111  *                  final "next" pointer of each individual branch points; each
112  *                  branch starts with the operand node of a BRANCH node.
113  *
114  * BACK             Normal "next" pointers all implicitly point forward; BACK
115  *                  exists to make loop structures possible.
116  *
117  * STAR,PLUS        '?', and complex '*' and '+', are implemented as circular
118  *                  BRANCH structures using BACK.  Simple cases (one character
119  *                  per match) are implemented with STAR and PLUS for speed
120  *                  and to minimize recursive plunges.
121  *
122  * OPEN,CLOSE       ...are numbered at compile time.
123  */
124 
125 /*
126  * A node is one char of opcode followed by two chars of "next" pointer.
127  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
128  * value is a positive offset from the opcode of the node containing it.
129  * An operand, if any, simply follows the node.  (Note that much of the
130  * code generation knows about this implicit relationship.)
131  *
132  * Using two bytes for the "next" pointer is vast overkill for most things,
133  * but allows patterns to get big without disasters.
134  */
135 #define   OP(p)     (*(p))
136 #define   NEXT(p)   (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
137 #define   OPERAND(p)          ((p) + 3)
138 
139 /*
140  * See regmagic.h for one further detail of program structure.
141  */
142 
143 
144 /*
145  * Utility definitions.
146  */
147 #ifndef CHARBITS
148 #define   UCHARAT(p)          ((int)*(unsigned char *)(p))
149 #else
150 #define   UCHARAT(p)          ((int)*(p)&CHARBITS)
151 #endif
152 
153 #define   FAIL(m)   { regerror(m); return(NULL); }
154 #define   ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
155 
156 /*
157  * Flags to be passed up and down.
158  */
159 #define   HASWIDTH  01        /* Known never to match null string. */
160 #define   SIMPLE              02        /* Simple enough to be STAR/PLUS operand. */
161 #define   SPSTART             04        /* Starts with * or +. */
162 #define   WORST               0         /* Worst case. */
163 
164 /*
165  * Global work variables for regcomp().
166  */
167 static char *regparse;                  /* Input-scan pointer. */
168 static int regnpar;           /* () count. */
169 static char regdummy;
170 static char *regcode;                   /* Code-emit pointer; &regdummy = don't. */
171 static long regsize;                    /* Code size. */
172 
173 /*
174  * Forward declarations for regcomp()'s friends.
175  */
176 #ifndef STATIC
177 #define   STATIC    static
178 #endif
179 STATIC char *reg __P((int, int *));
180 STATIC char *regbranch __P((int *));
181 STATIC char *regpiece __P((int *));
182 STATIC char *regatom __P((int *));
183 STATIC char *regnode __P((int));
184 STATIC char *regnext __P((char *));
185 STATIC void regc __P((int));
186 STATIC void reginsert __P((int, char *));
187 STATIC void regtail __P((char *, char *));
188 STATIC void regoptail __P((char *, char *));
189 #ifdef STRCSPN
190 STATIC int strcspn __P((char *, char *));
191 #endif
192 
193 /*
194  - regcomp - compile a regular expression into internal code
195  *
196  * We can't allocate space until we know how big the compiled form will be,
197  * but we can't compile it (and thus know how big it is) until we've got a
198  * place to put the code.  So we cheat:  we compile it twice, once with code
199  * generation turned off and size counting turned on, and once "for real".
200  * This also means that we don't allocate space until we are sure that the
201  * thing really will compile successfully, and we never have to move the
202  * code and thus invalidate pointers into it.  (Note that it has to be in
203  * one piece because free() must be able to free it all.)
204  *
205  * Beware that the optimization-preparation code in here knows about some
206  * of the structure of the compiled regexp.
207  */
208 regexp *
__compat_regcomp(expn)209 __compat_regcomp(expn)
210 const char *expn;
211 {
212           regexp *r;
213           char *scan;
214           char *longest;
215           int len;
216           int flags;
217 
218           if (expn == NULL)
219                     FAIL("NULL argument");
220 
221           /* First pass: determine size, legality. */
222 #ifdef notdef
223           if (expn[0] == '.' && expn[1] == '*') expn += 2;  /* aid grep */
224 #endif
225           /* LINTED const castaway */
226           regparse = (char *)expn;
227           regnpar = 1;
228           regsize = 0L;
229           regcode = &regdummy;
230           regc(MAGIC);
231           if (reg(0, &flags) == NULL)
232                     return(NULL);
233 
234           /* Small enough for pointer-storage convention? */
235           if (regsize >= 32767L)                  /* Probably could be 65535L. */
236                     FAIL("regexp too big");
237 
238           /* Allocate space. */
239           r = malloc(sizeof(regexp) + (unsigned)regsize);
240           if (r == NULL)
241                     FAIL("out of space");
242 
243           /* Second pass: emit code. */
244           /* LINTED const castaway */
245           regparse = (char *)expn;
246           regnpar = 1;
247           regcode = r->program;
248           regc(MAGIC);
249           if (reg(0, &flags) == NULL) {
250                     free(r);
251                     return(NULL);
252           }
253 
254           /* Dig out information for optimizations. */
255           r->regstart = '\0'; /* Worst-case defaults. */
256           r->reganch = 0;
257           r->regmust = NULL;
258           r->regmlen = 0;
259           scan = r->program+1;                              /* First BRANCH. */
260           if (OP(regnext(scan)) == END) {                   /* Only one top-level choice. */
261                     scan = OPERAND(scan);
262 
263                     /* Starting-point info. */
264                     if (OP(scan) == EXACTLY)
265                               r->regstart = *OPERAND(scan);
266                     else if (OP(scan) == BOL)
267                               r->reganch++;
268 
269                     /*
270                      * If there's something expensive in the r.e., find the
271                      * longest literal string that must appear and make it the
272                      * regmust.  Resolve ties in favor of later strings, since
273                      * the regstart check works with the beginning of the r.e.
274                      * and avoiding duplication strengthens checking.  Not a
275                      * strong reason, but sufficient in the absence of others.
276                      */
277                     if (flags&SPSTART) {
278                               longest = NULL;
279                               len = 0;
280                               for (; scan != NULL; scan = regnext(scan))
281                                         if (OP(scan) == EXACTLY && (int) strlen(OPERAND(scan)) >= len) {
282                                                   longest = OPERAND(scan);
283                                                   len = strlen(OPERAND(scan));
284                                         }
285                               r->regmust = longest;
286                               r->regmlen = len;
287                     }
288           }
289 
290           return(r);
291 }
292 
293 /*
294  - reg - regular expression, i.e. main body or parenthesized thing
295  *
296  * Caller must absorb opening parenthesis.
297  *
298  * Combining parenthesis handling with the base level of regular expression
299  * is a trifle forced, but the need to tie the tails of the branches to what
300  * follows makes it hard to avoid.
301  */
302 static char *
reg(paren,flagp)303 reg(paren, flagp)
304 int paren;                              /* Parenthesized? */
305 int *flagp;
306 {
307           char *ret;
308           char *br;
309           char *ender;
310           int parno = 0;
311           int flags;
312 
313           *flagp = HASWIDTH;  /* Tentatively. */
314 
315           /* Make an OPEN node, if parenthesized. */
316           if (paren) {
317                     if (regnpar >= NSUBEXP)
318                               FAIL("too many ()");
319                     parno = regnpar;
320                     regnpar++;
321                     ret = regnode(OPEN+parno);
322           } else
323                     ret = NULL;
324 
325           /* Pick up the branches, linking them together. */
326           br = regbranch(&flags);
327           if (br == NULL)
328                     return(NULL);
329           if (ret != NULL)
330                     regtail(ret, br);   /* OPEN -> first. */
331           else
332                     ret = br;
333           if (!(flags&HASWIDTH))
334                     *flagp &= ~HASWIDTH;
335           *flagp |= flags&SPSTART;
336           while (*regparse == '|' || *regparse == '\n') {
337                     regparse++;
338                     br = regbranch(&flags);
339                     if (br == NULL)
340                               return(NULL);
341                     regtail(ret, br);   /* BRANCH -> BRANCH. */
342                     if (!(flags&HASWIDTH))
343                               *flagp &= ~HASWIDTH;
344                     *flagp |= flags&SPSTART;
345           }
346 
347           /* Make a closing node, and hook it on the end. */
348           ender = regnode((paren) ? CLOSE+parno : END);
349           regtail(ret, ender);
350 
351           /* Hook the tails of the branches to the closing node. */
352           for (br = ret; br != NULL; br = regnext(br))
353                     regoptail(br, ender);
354 
355           /* Check for proper termination. */
356           if (paren && *regparse++ != ')') {
357                     FAIL("unmatched ()");
358           } else if (!paren && *regparse != '\0') {
359                     if (*regparse == ')') {
360                               FAIL("unmatched ()");
361                     } else
362                               FAIL("junk on end");          /* "Can't happen". */
363                     /* NOTREACHED */
364           }
365 
366           return(ret);
367 }
368 
369 /*
370  - regbranch - one alternative of an | operator
371  *
372  * Implements the concatenation operator.
373  */
374 static char *
regbranch(flagp)375 regbranch(flagp)
376 int *flagp;
377 {
378           char *ret;
379           char *chain;
380           char *latest;
381           int flags;
382 
383           *flagp = WORST;               /* Tentatively. */
384 
385           ret = regnode(BRANCH);
386           chain = NULL;
387           while (*regparse != '\0' && *regparse != ')' &&
388                  *regparse != '\n' && *regparse != '|') {
389                     latest = regpiece(&flags);
390                     if (latest == NULL)
391                               return(NULL);
392                     *flagp |= flags&HASWIDTH;
393                     if (chain == NULL)  /* First piece. */
394                               *flagp |= flags&SPSTART;
395                     else
396                               regtail(chain, latest);
397                     chain = latest;
398           }
399           if (chain == NULL)  /* Loop ran zero times. */
400                     (void) regnode(NOTHING);
401 
402           return(ret);
403 }
404 
405 /*
406  - regpiece - something followed by possible [*+?]
407  *
408  * Note that the branching code sequences used for ? and the general cases
409  * of * and + are somewhat optimized:  they use the same NOTHING node as
410  * both the endmarker for their branch list and the body of the last branch.
411  * It might seem that this node could be dispensed with entirely, but the
412  * endmarker role is not redundant.
413  */
414 static char *
regpiece(flagp)415 regpiece(flagp)
416 int *flagp;
417 {
418           char *ret;
419           char op;
420           char *next;
421           int flags;
422 
423           ret = regatom(&flags);
424           if (ret == NULL)
425                     return(NULL);
426 
427           op = *regparse;
428           if (!ISMULT(op)) {
429                     *flagp = flags;
430                     return(ret);
431           }
432 
433           if (!(flags&HASWIDTH) && op != '?')
434                     FAIL("*+ operand could be empty");
435           *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
436 
437           if (op == '*' && (flags&SIMPLE))
438                     reginsert(STAR, ret);
439           else if (op == '*') {
440                     /* Emit x* as (x&|), where & means "self". */
441                     reginsert(BRANCH, ret);                           /* Either x */
442                     regoptail(ret, regnode(BACK));                    /* and loop */
443                     regoptail(ret, ret);                              /* back */
444                     regtail(ret, regnode(BRANCH));                    /* or */
445                     regtail(ret, regnode(NOTHING));                   /* null. */
446           } else if (op == '+' && (flags&SIMPLE))
447                     reginsert(PLUS, ret);
448           else if (op == '+') {
449                     /* Emit x+ as x(&|), where & means "self". */
450                     next = regnode(BRANCH);                           /* Either */
451                     regtail(ret, next);
452                     regtail(regnode(BACK), ret);            /* loop back */
453                     regtail(next, regnode(BRANCH));                   /* or */
454                     regtail(ret, regnode(NOTHING));                   /* null. */
455           } else if (op == '?') {
456                     /* Emit x? as (x|) */
457                     reginsert(BRANCH, ret);                           /* Either x */
458                     regtail(ret, regnode(BRANCH));                    /* or */
459                     next = regnode(NOTHING);                /* null. */
460                     regtail(ret, next);
461                     regoptail(ret, next);
462           }
463           regparse++;
464           if (ISMULT(*regparse))
465                     FAIL("nested *?+");
466 
467           return(ret);
468 }
469 
470 /*
471  - regatom - the lowest level
472  *
473  * Optimization:  gobbles an entire sequence of ordinary characters so that
474  * it can turn them into a single node, which is smaller to store and
475  * faster to run.  Backslashed characters are exceptions, each becoming a
476  * separate node; the code is simpler that way and it's not worth fixing.
477  */
478 static char *
regatom(flagp)479 regatom(flagp)
480 int *flagp;
481 {
482           char *ret;
483           int flags;
484 
485           *flagp = WORST;               /* Tentatively. */
486 
487           switch (*regparse++) {
488           /* FIXME: these chars only have meaning at beg/end of pat? */
489           case '^':
490                     ret = regnode(BOL);
491                     break;
492           case '$':
493                     ret = regnode(EOL);
494                     break;
495           case '.':
496                     ret = regnode(ANY);
497                     *flagp |= HASWIDTH|SIMPLE;
498                     break;
499           case '[': {
500                               int class;
501                               int classend;
502 
503                               if (*regparse == '^') {       /* Complement of range. */
504                                         ret = regnode(ANYBUT);
505                                         regparse++;
506                               } else
507                                         ret = regnode(ANYOF);
508                               if (*regparse == ']' || *regparse == '-')
509                                         regc(*regparse++);
510                               while (*regparse != '\0' && *regparse != ']') {
511                                         if (*regparse == '-') {
512                                                   regparse++;
513                                                   if (*regparse == ']' || *regparse == '\0')
514                                                             regc('-');
515                                                   else {
516                                                             class = UCHARAT(regparse-2)+1;
517                                                             classend = UCHARAT(regparse);
518                                                             if (class > classend+1)
519                                                                       FAIL("invalid [] range");
520                                                             for (; class <= classend; class++)
521                                                                       regc(class);
522                                                             regparse++;
523                                                   }
524                                         } else
525                                                   regc(*regparse++);
526                               }
527                               regc('\0');
528                               if (*regparse != ']')
529                                         FAIL("unmatched []");
530                               regparse++;
531                               *flagp |= HASWIDTH|SIMPLE;
532                     }
533                     break;
534           case '(':
535                     ret = reg(1, &flags);
536                     if (ret == NULL)
537                               return(NULL);
538                     *flagp |= flags&(HASWIDTH|SPSTART);
539                     break;
540           case '\0':
541           case '|':
542           case '\n':
543           case ')':
544                     FAIL("internal urp");         /* Supposed to be caught earlier. */
545           case '?':
546           case '+':
547           case '*':
548                     FAIL("?+* follows nothing");
549           case '\\':
550                     switch (*regparse++) {
551                     case '\0':
552                               FAIL("trailing \\");
553                     case '<':
554                               ret = regnode(WORDA);
555                               break;
556                     case '>':
557                               ret = regnode(WORDZ);
558                               break;
559                     /* FIXME: Someday handle \1, \2, ... */
560                     default:
561                               /* Handle general quoted chars in exact-match routine */
562                               goto de_fault;
563                     }
564                     break;
565           de_fault:
566                     /*FALLTHROUGH*/
567           default:
568                     /*
569                      * Encode a string of characters to be matched exactly.
570                      *
571                      * This is a bit tricky due to quoted chars and due to
572                      * '*', '+', and '?' taking the SINGLE char previous
573                      * as their operand.
574                      *
575                      * On entry, the char at regparse[-1] is going to go
576                      * into the string, no matter what it is.  (It could be
577                      * following a \ if we are entered from the '\' case.)
578                      *
579                      * Basic idea is to pick up a good char in  ch  and
580                      * examine the next char.  If it's *+? then we twiddle.
581                      * If it's \ then we frozzle.  If it's other magic char
582                      * we push  ch  and terminate the string.  If none of the
583                      * above, we push  ch  on the string and go around again.
584                      *
585                      *  regprev  is used to remember where "the current char"
586                      * starts in the string, if due to a *+? we need to back
587                      * up and put the current char in a separate, 1-char, string.
588                      * When  regprev  is NULL,  ch  is the only char in the
589                      * string; this is used in *+? handling, and in setting
590                      * flags |= SIMPLE at the end.
591                      */
592                     {
593                               char *regprev;
594                               char ch;
595 
596                               regparse--;                             /* Look at cur char */
597                               ret = regnode(EXACTLY);
598                               for ( regprev = 0 ; ; ) {
599                                         ch = *regparse++;   /* Get current char */
600                                         switch (*regparse) {          /* look at next one */
601 
602                                         default:
603                                                   regc(ch); /* Add cur to string */
604                                                   break;
605 
606                                         case '.': case '[': case '(':
607                                         case ')': case '|': case '\n':
608                                         case '$': case '^':
609                                         case '\0':
610                                         /* FIXME, $ and ^ should not always be magic */
611                                         magic:
612                                                   regc(ch); /* dump cur char */
613                                                   goto done;          /* and we are done */
614 
615                                         case '?': case '+': case '*':
616                                                   if (!regprev)       /* If just ch in str, */
617                                                             goto magic;         /* use it */
618                                                   /* End mult-char string one early */
619                                                   regparse = regprev; /* Back up parse */
620                                                   goto done;
621 
622                                         case '\\':
623                                                   regc(ch); /* Cur char OK */
624                                                   switch (regparse[1]){ /* Look after \ */
625                                                   case '\0':
626                                                   case '<':
627                                                   case '>':
628                                                   /* FIXME: Someday handle \1, \2, ... */
629                                                             goto done; /* Not quoted */
630                                                   default:
631                                                             /* Backup point is \, scan                                                                 * point is after it. */
632                                                             regprev = regparse;
633                                                             regparse++;
634                                                             continue; /* NOT break; */
635                                                   }
636                                         }
637                                         regprev = regparse; /* Set backup point */
638                               }
639                     done:
640                               regc('\0');
641                               *flagp |= HASWIDTH;
642                               if (!regprev)                 /* One char? */
643                                         *flagp |= SIMPLE;
644                     }
645                     break;
646           }
647 
648           return(ret);
649 }
650 
651 /*
652  - regnode - emit a node
653  */
654 static char *                           /* Location. */
regnode(op)655 regnode(op)
656 int op;
657 {
658           char *ret;
659           char *ptr;
660 
661           ret = regcode;
662           if (ret == &regdummy) {
663                     regsize += 3;
664                     return(ret);
665           }
666 
667           ptr = ret;
668           *ptr++ = op;
669           *ptr++ = '\0';                /* Null "next" pointer. */
670           *ptr++ = '\0';
671           regcode = ptr;
672 
673           return(ret);
674 }
675 
676 /*
677  - regc - emit (if appropriate) a byte of code
678  */
679 static void
regc(b)680 regc(b)
681 int b;
682 {
683           if (regcode != &regdummy)
684                     *regcode++ = b;
685           else
686                     regsize++;
687 }
688 
689 /*
690  - reginsert - insert an operator in front of already-emitted operand
691  *
692  * Means relocating the operand.
693  */
694 static void
reginsert(op,opnd)695 reginsert(op, opnd)
696 int op;
697 char *opnd;
698 {
699           char *src;
700           char *dst;
701           char *place;
702 
703           if (regcode == &regdummy) {
704                     regsize += 3;
705                     return;
706           }
707 
708           src = regcode;
709           regcode += 3;
710           dst = regcode;
711           while (src > opnd)
712                     *--dst = *--src;
713 
714           place = opnd;                 /* Op node, where operand used to be. */
715           *place++ = op;
716           *place++ = '\0';
717           *place++ = '\0';
718 }
719 
720 /*
721  - regtail - set the next-pointer at the end of a node chain
722  */
723 static void
regtail(p,val)724 regtail(p, val)
725 char *p;
726 char *val;
727 {
728           char *scan;
729           char *temp;
730           int offset;
731 
732           if (p == &regdummy)
733                     return;
734 
735           /* Find last node. */
736           scan = p;
737           for (;;) {
738                     temp = regnext(scan);
739                     if (temp == NULL)
740                               break;
741                     scan = temp;
742           }
743 
744           if (OP(scan) == BACK)
745                     offset = scan - val;
746           else
747                     offset = val - scan;
748           *(scan+1) = ((unsigned int)offset>>8)&0377;
749           *(scan+2) = offset&0377;
750 }
751 
752 /*
753  - regoptail - regtail on operand of first argument; nop if operandless
754  */
755 static void
regoptail(p,val)756 regoptail(p, val)
757 char *p;
758 char *val;
759 {
760           /* "Operandless" and "op != BRANCH" are synonymous in practice. */
761           if (p == NULL || p == &regdummy || OP(p) != BRANCH)
762                     return;
763           regtail(OPERAND(p), val);
764 }
765 
766 /*
767  * regexec and friends
768  */
769 
770 /*
771  * Global work variables for regexec().
772  */
773 static char *reginput;                  /* String-input pointer. */
774 static char *regbol;                    /* Beginning of input, for ^ check. */
775 static char **regstartp;      /* Pointer to startp array. */
776 static char **regendp;                  /* Ditto for endp. */
777 
778 /*
779  * Forwards.
780  */
781 STATIC int regtry __P((const regexp *, const char *));
782 STATIC int regmatch __P((char *));
783 STATIC int regrepeat __P((char *));
784 
785 #ifdef DEBUG
786 int regnarrate = 0;
787 void regdump __P((regexp *));
788 STATIC char *regprop __P((char *));
789 #endif
790 
791 /*
792  - regexec - match a regexp against a string
793  */
794 int
__compat_regexec(prog,string)795 __compat_regexec(prog, string)
796 const regexp *prog;
797 const char *string;
798 {
799           char *s;
800 
801           /* Be paranoid... */
802           if (prog == NULL || string == NULL) {
803                     regerror("NULL parameter");
804                     return(0);
805           }
806 
807           /* Check validity of program. */
808           if (UCHARAT(prog->program) != MAGIC) {
809                     regerror("corrupted program");
810                     return(0);
811           }
812 
813           /* If there is a "must appear" string, look for it. */
814           if (prog->regmust != NULL) {
815                     /* LINTED const castaway */
816                     s = (char *)string;
817                     while ((s = strchr(s, prog->regmust[0])) != NULL) {
818                               if (strncmp(s, prog->regmust,
819                                   (size_t)prog->regmlen) == 0)
820                                         break;    /* Found it. */
821                               s++;
822                     }
823                     if (s == NULL)      /* Not present. */
824                               return(0);
825           }
826 
827           /* Mark beginning of line for ^ . */
828           /* LINTED const castaway */
829           regbol = (char *)string;
830 
831           /* Simplest case:  anchored match need be tried only once. */
832           if (prog->reganch)
833                     return(regtry(prog, string));
834 
835           /* Messy cases:  unanchored match. */
836           /* LINTED const castaway */
837           s = (char *)string;
838           if (prog->regstart != '\0')
839                     /* We know what char it must start with. */
840                     while ((s = strchr(s, prog->regstart)) != NULL) {
841                               if (regtry(prog, s))
842                                         return(1);
843                               s++;
844                     }
845           else
846                     /* We don't -- general case. */
847                     do {
848                               if (regtry(prog, s))
849                                         return(1);
850                     } while (*s++ != '\0');
851 
852           /* Failure. */
853           return(0);
854 }
855 
856 /*
857  - regtry - try match at specific point
858  */
859 static int                              /* 0 failure, 1 success */
regtry(prog,string)860 regtry(prog, string)
861 const regexp *prog;
862 const char *string;
863 {
864           int i;
865           char **sp;
866           char **ep;
867 
868           /* LINTED const castaway */
869           reginput = (char *)string;                                  /* XXX */
870           regstartp = (char **)prog->startp;                          /* XXX */
871           regendp = (char **)prog->endp;                                        /* XXX */
872 
873           sp = (char **)prog->startp;                                 /* XXX */
874           ep = (char **)prog->endp;                                   /* XXX */
875           for (i = NSUBEXP; i > 0; i--) {
876                     *sp++ = NULL;
877                     *ep++ = NULL;
878           }
879           if (regmatch((char *)prog->program + 1)) {                  /* XXX */
880                     /* LINTED const castaway */
881                     ((regexp *)prog)->startp[0] = (char *)string;     /* XXX */
882                     /* LINTED const castaway */
883                     ((regexp *)prog)->endp[0] = reginput;             /* XXX */
884                     return(1);
885           } else
886                     return(0);
887 }
888 
889 /*
890  - regmatch - main matching routine
891  *
892  * Conceptually the strategy is simple:  check to see whether the current
893  * node matches, call self recursively to see whether the rest matches,
894  * and then act accordingly.  In practice we make some effort to avoid
895  * recursion, in particular by going through "ordinary" nodes (that don't
896  * need to know whether the rest of the match failed) by a loop instead of
897  * by recursion.
898  */
899 static int                              /* 0 failure, 1 success */
regmatch(prog)900 regmatch(prog)
901 char *prog;
902 {
903           char *scan;         /* Current node. */
904           char *next;                   /* Next node. */
905 
906           scan = prog;
907 #ifdef DEBUG
908           if (scan != NULL && regnarrate)
909                     fprintf(stderr, "%s(\n", regprop(scan));
910 #endif
911           while (scan != NULL) {
912 #ifdef DEBUG
913                     if (regnarrate)
914                               fprintf(stderr, "%s...\n", regprop(scan));
915 #endif
916                     next = regnext(scan);
917 
918                     switch (OP(scan)) {
919                     case BOL:
920                               if (reginput != regbol)
921                                         return(0);
922                               break;
923                     case EOL:
924                               if (*reginput != '\0')
925                                         return(0);
926                               break;
927                     case WORDA:
928                               /* Must be looking at a letter, digit, or _ */
929                               if ((!isalnum(UCHARAT(reginput))) && *reginput != '_')
930                                         return(0);
931                               /* Prev must be BOL or nonword */
932                               if (reginput > regbol &&
933                                   (isalnum(UCHARAT(reginput - 1))
934                                    || reginput[-1] == '_'))
935                                         return(0);
936                               break;
937                     case WORDZ:
938                               /* Must be looking at non letter, digit, or _ */
939                               if (isalnum(UCHARAT(reginput)) || *reginput == '_')
940                                         return(0);
941                               /* We don't care what the previous char was */
942                               break;
943                     case ANY:
944                               if (*reginput == '\0')
945                                         return(0);
946                               reginput++;
947                               break;
948                     case EXACTLY: {
949                                         int len;
950                                         char *opnd;
951 
952                                         opnd = OPERAND(scan);
953                                         /* Inline the first character, for speed. */
954                                         if (*opnd != *reginput)
955                                                   return(0);
956                                         len = strlen(opnd);
957                                         if (len > 1 && strncmp(opnd, reginput,
958                                             (size_t)len) != 0)
959                                                   return(0);
960                                         reginput += len;
961                               }
962                               break;
963                     case ANYOF:
964                               if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
965                                         return(0);
966                               reginput++;
967                               break;
968                     case ANYBUT:
969                               if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
970                                         return(0);
971                               reginput++;
972                               break;
973                     case NOTHING:
974                               break;
975                     case BACK:
976                               break;
977                     case OPEN+1:
978                     case OPEN+2:
979                     case OPEN+3:
980                     case OPEN+4:
981                     case OPEN+5:
982                     case OPEN+6:
983                     case OPEN+7:
984                     case OPEN+8:
985                     case OPEN+9: {
986                                         int no;
987                                         char *save;
988 
989                                         no = OP(scan) - OPEN;
990                                         save = reginput;
991 
992                                         if (regmatch(next)) {
993                                                   /*
994                                                    * Don't set startp if some later
995                                                    * invocation of the same parentheses
996                                                    * already has.
997                                                    */
998                                                   if (regstartp[no] == NULL)
999                                                             regstartp[no] = save;
1000                                                   return(1);
1001                                         } else
1002                                                   return(0);
1003                               }
1004                     case CLOSE+1:
1005                     case CLOSE+2:
1006                     case CLOSE+3:
1007                     case CLOSE+4:
1008                     case CLOSE+5:
1009                     case CLOSE+6:
1010                     case CLOSE+7:
1011                     case CLOSE+8:
1012                     case CLOSE+9: {
1013                                         int no;
1014                                         char *save;
1015 
1016                                         no = OP(scan) - CLOSE;
1017                                         save = reginput;
1018 
1019                                         if (regmatch(next)) {
1020                                                   /*
1021                                                    * Don't set endp if some later
1022                                                    * invocation of the same parentheses
1023                                                    * already has.
1024                                                    */
1025                                                   if (regendp[no] == NULL)
1026                                                             regendp[no] = save;
1027                                                   return(1);
1028                                         } else
1029                                                   return(0);
1030                               }
1031                     case BRANCH: {
1032                                         char *save;
1033 
1034                                         if (OP(next) != BRANCH)                 /* No choice. */
1035                                                   next = OPERAND(scan);         /* Avoid recursion. */
1036                                         else {
1037                                                   do {
1038                                                             save = reginput;
1039                                                             if (regmatch(OPERAND(scan)))
1040                                                                       return(1);
1041                                                             reginput = save;
1042                                                             scan = regnext(scan);
1043                                                   } while (scan != NULL && OP(scan) == BRANCH);
1044                                                   return(0);
1045                                                   /* NOTREACHED */
1046                                         }
1047                               }
1048                               break;
1049                     case STAR:
1050                     case PLUS: {
1051                                         char nextch;
1052                                         int no;
1053                                         char *save;
1054                                         int min;
1055 
1056                                         /*
1057                                          * Lookahead to avoid useless match attempts
1058                                          * when we know what character comes next.
1059                                          */
1060                                         nextch = '\0';
1061                                         if (OP(next) == EXACTLY)
1062                                                   nextch = *OPERAND(next);
1063                                         min = (OP(scan) == STAR) ? 0 : 1;
1064                                         save = reginput;
1065                                         no = regrepeat(OPERAND(scan));
1066                                         while (no >= min) {
1067                                                   /* If it could work, try it. */
1068                                                   if (nextch == '\0' || *reginput == nextch)
1069                                                             if (regmatch(next))
1070                                                                       return(1);
1071                                                   /* Couldn't or didn't -- back up. */
1072                                                   no--;
1073                                                   reginput = save + no;
1074                                         }
1075                                         return(0);
1076                               }
1077                     case END:
1078                               return(1);          /* Success! */
1079                     default:
1080                               regerror("memory corruption");
1081                               return(0);
1082                     }
1083 
1084                     scan = next;
1085           }
1086 
1087           /*
1088            * We get here only if there's trouble -- normally "case END" is
1089            * the terminating point.
1090            */
1091           regerror("corrupted pointers");
1092           return(0);
1093 }
1094 
1095 /*
1096  - regrepeat - repeatedly match something simple, report how many
1097  */
1098 static int
regrepeat(p)1099 regrepeat(p)
1100 char *p;
1101 {
1102           int count = 0;
1103           char *scan;
1104           char *opnd;
1105 
1106           scan = reginput;
1107           opnd = OPERAND(p);
1108           switch (OP(p)) {
1109           case ANY:
1110                     count = strlen(scan);
1111                     scan += count;
1112                     break;
1113           case EXACTLY:
1114                     while (*opnd == *scan) {
1115                               count++;
1116                               scan++;
1117                     }
1118                     break;
1119           case ANYOF:
1120                     while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1121                               count++;
1122                               scan++;
1123                     }
1124                     break;
1125           case ANYBUT:
1126                     while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1127                               count++;
1128                               scan++;
1129                     }
1130                     break;
1131           default:            /* Oh dear.  Called inappropriately. */
1132                     regerror("internal foulup");
1133                     count = 0;          /* Best compromise. */
1134                     break;
1135           }
1136           reginput = scan;
1137 
1138           return(count);
1139 }
1140 
1141 /*
1142  - regnext - dig the "next" pointer out of a node
1143  */
1144 static char *
regnext(p)1145 regnext(p)
1146 char *p;
1147 {
1148           int offset;
1149 
1150           if (p == &regdummy)
1151                     return(NULL);
1152 
1153           offset = NEXT(p);
1154           if (offset == 0)
1155                     return(NULL);
1156 
1157           if (OP(p) == BACK)
1158                     return(p-offset);
1159           else
1160                     return(p+offset);
1161 }
1162 
1163 #ifdef DEBUG
1164 
1165 /*
1166  - regdump - dump a regexp onto stdout in vaguely comprehensible form
1167  */
1168 void
regdump(r)1169 regdump(r)
1170 regexp *r;
1171 {
1172           char *s;
1173           char op = EXACTLY;  /* Arbitrary non-END op. */
1174           char *next;
1175 
1176 
1177           s = r->program + 1;
1178           while (op != END) { /* While that wasn't END last time... */
1179                     op = OP(s);
1180                     printf("%2td%s", s-r->program, regprop(s));       /* Where, what. */
1181                     next = regnext(s);
1182                     if (next == NULL)             /* Next ptr. */
1183                               printf("(0)");
1184                     else
1185                               printf("(%td)", (s-r->program)+(next-s));
1186                     s += 3;
1187                     if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1188                               /* Literal string, where present. */
1189                               while (*s != '\0') {
1190                                         putchar(*s);
1191                                         s++;
1192                               }
1193                               s++;
1194                     }
1195                     putchar('\n');
1196           }
1197 
1198           /* Header fields of interest. */
1199           if (r->regstart != '\0')
1200                     printf("start `%c' ", r->regstart);
1201           if (r->reganch)
1202                     printf("anchored ");
1203           if (r->regmust != NULL)
1204                     printf("must have \"%s\"", r->regmust);
1205           printf("\n");
1206 }
1207 
1208 /*
1209  - regprop - printable representation of opcode
1210  */
1211 static char *
regprop(op)1212 regprop(op)
1213 char *op;
1214 {
1215           char *p;
1216           static char buf[50];
1217 
1218           (void)strncpy(buf, ":", sizeof(buf) - 1);
1219 
1220           switch (OP(op)) {
1221           case BOL:
1222                     p = "BOL";
1223                     break;
1224           case EOL:
1225                     p = "EOL";
1226                     break;
1227           case ANY:
1228                     p = "ANY";
1229                     break;
1230           case ANYOF:
1231                     p = "ANYOF";
1232                     break;
1233           case ANYBUT:
1234                     p = "ANYBUT";
1235                     break;
1236           case BRANCH:
1237                     p = "BRANCH";
1238                     break;
1239           case EXACTLY:
1240                     p = "EXACTLY";
1241                     break;
1242           case NOTHING:
1243                     p = "NOTHING";
1244                     break;
1245           case BACK:
1246                     p = "BACK";
1247                     break;
1248           case END:
1249                     p = "END";
1250                     break;
1251           case OPEN+1:
1252           case OPEN+2:
1253           case OPEN+3:
1254           case OPEN+4:
1255           case OPEN+5:
1256           case OPEN+6:
1257           case OPEN+7:
1258           case OPEN+8:
1259           case OPEN+9:
1260                     (void)snprintf(buf+strlen(buf), sizeof(buf) - strlen(buf),
1261                         "OPEN%d", OP(op)-OPEN);
1262                     p = NULL;
1263                     break;
1264           case CLOSE+1:
1265           case CLOSE+2:
1266           case CLOSE+3:
1267           case CLOSE+4:
1268           case CLOSE+5:
1269           case CLOSE+6:
1270           case CLOSE+7:
1271           case CLOSE+8:
1272           case CLOSE+9:
1273                     (void)snprintf(buf+strlen(buf), sizeof(buf) - strlen(buf),
1274                         "CLOSE%d", OP(op)-CLOSE);
1275                     p = NULL;
1276                     break;
1277           case STAR:
1278                     p = "STAR";
1279                     break;
1280           case PLUS:
1281                     p = "PLUS";
1282                     break;
1283           case WORDA:
1284                     p = "WORDA";
1285                     break;
1286           case WORDZ:
1287                     p = "WORDZ";
1288                     break;
1289           default:
1290                     p = NULL;
1291                     regerror("corrupted opcode");
1292                     break;
1293           }
1294           if (p != NULL)
1295                     (void)strncat(buf, p, sizeof(buf) - strlen(buf) - 1);
1296           return(buf);
1297 }
1298 #endif
1299 
1300 /*
1301  * The following is provided for those people who do not have strcspn() in
1302  * their C libraries.  They should get off their butts and do something
1303  * about it; at least one public-domain implementation of those (highly
1304  * useful) string routines has been published on Usenet.
1305  */
1306 #ifdef STRCSPN
1307 /*
1308  * strcspn - find length of initial segment of s1 consisting entirely
1309  * of characters not from s2
1310  */
1311 
1312 static int
strcspn(s1,s2)1313 strcspn(s1, s2)
1314 char *s1;
1315 char *s2;
1316 {
1317           char *scan1;
1318           char *scan2;
1319           int count;
1320 
1321           count = 0;
1322           for (scan1 = s1; *scan1 != '\0'; scan1++) {
1323                     for (scan2 = s2; *scan2 != '\0';)       /* ++ moved down. */
1324                               if (*scan1 == *scan2++)
1325                                         return(count);
1326                     count++;
1327           }
1328           return(count);
1329 }
1330 #endif
1331