1 /*        $NetBSD: engine.c,v 1.4 2024/11/10 10:54:29 mlelstv Exp $ */
2 /*-
3  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
4  * Copyright (c) 1992, 1993, 1994
5  *        The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Henry Spencer of the University of Toronto.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *        This product includes software developed by the University of
21  *        California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *        @(#)engine.c        8.4 (Berkeley) 3/19/94
39  */
40 
41 /*
42  * The matching engine and friends.  This file is #included by regexec.c
43  * after suitable #defines of a variety of macros used herein, so that
44  * different state representations can be used without duplicating masses
45  * of code.
46  */
47 
48 #ifdef SNAMES
49 #define   matcher   smatcher
50 #define   fast      sfast
51 #define   slow      sslow
52 #define   dissect   sdissect
53 #define   backref   sbackref
54 #define   step      sstep
55 #define   print     sprint
56 #define   at        sat
57 #define   match     smat
58 #endif
59 #ifdef LNAMES
60 #define   matcher   lmatcher
61 #define   fast      lfast
62 #define   slow      lslow
63 #define   dissect   ldissect
64 #define   backref   lbackref
65 #define   step      lstep
66 #define   print     lprint
67 #define   at        lat
68 #define   match     lmat
69 #endif
70 
71 /* another structure passed up and down to avoid zillions of parameters */
72 struct match {
73           struct re_guts *g;
74           int eflags;
75           regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
76           RCHAR_T *offp;                /* offsets work from here */
77           RCHAR_T *beginp;              /* start of string -- virtual NUL precedes */
78           RCHAR_T *endp;                /* end of string -- virtual NUL here */
79           RCHAR_T *coldp;               /* can be no match starting before here */
80           RCHAR_T **lastpos;  /* [nplus+1] */
81           STATEVARS;
82           states st;                    /* current states */
83           states fresh;                 /* states for a fresh start */
84           states tmp;                   /* temporary */
85           states empty;                 /* empty set of states */
86 };
87 
88 /* ========= begin header generated by ./mkh ========= */
89 #ifdef __cplusplus
90 extern "C" {
91 #endif
92 
93 /* === engine.c === */
94 static int matcher __P((struct re_guts *g, RCHAR_T *string, size_t nmatch, regmatch_t pmatch[], int eflags));
95 static RCHAR_T *dissect __P((struct match *m, RCHAR_T *start, RCHAR_T *stop, sopno startst, sopno stopst));
96 static RCHAR_T *backref __P((struct match *m, RCHAR_T *start, RCHAR_T *stop, sopno startst, sopno stopst, sopno lev));
97 static RCHAR_T *fast __P((struct match *m, RCHAR_T *start, RCHAR_T *stop, sopno startst, sopno stopst));
98 static RCHAR_T *slow __P((struct match *m, RCHAR_T *start, RCHAR_T *stop, sopno startst, sopno stopst));
99 static states step __P((struct re_guts *g, sopno start, sopno stop, states bef, int flag, RCHAR_T ch, states aft));
100 #define   BOL       (1)
101 #define   EOL       (BOL+1)
102 #define   BOLEOL    (BOL+2)
103 #define   NOTHING   (BOL+3)
104 #define   BOW       (BOL+4)
105 #define   EOW       (BOL+5)
106 #ifdef REDEBUG
107 static void print __P((struct match *m, char *caption, states st, int ch, FILE *d));
108 #endif
109 #ifdef REDEBUG
110 static void at __P((struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst));
111 #endif
112 #ifdef REDEBUG
113 static char *pchar __P((int ch));
114 #endif
115 
116 #ifdef __cplusplus
117 }
118 #endif
119 /* ========= end header generated by ./mkh ========= */
120 
121 #ifdef REDEBUG
122 #define   SP(t, s, c)         print(m, t, s, c, stdout)
123 #define   AT(t, p1, p2, s1, s2)         at(m, t, p1, p2, s1, s2)
124 #define   NOTE(str) { if (m->eflags&REG_TRACE) printf("=%s\n", (str)); }
125 #else
126 #define   SP(t, s, c)         /* nothing */
127 #define   AT(t, p1, p2, s1, s2)         /* nothing */
128 #define   NOTE(s)   /* nothing */
129 #endif
130 
131 /*
132  - matcher - the actual matching engine
133  == static int matcher(struct re_guts *g, RCHAR_T *string, \
134  ==       size_t nmatch, regmatch_t pmatch[], int eflags);
135  */
136 static int                              /* 0 success, REG_NOMATCH failure */
matcher(g,string,nmatch,pmatch,eflags)137 matcher(g, string, nmatch, pmatch, eflags)
138 struct re_guts *g;
139 RCHAR_T *string;
140 size_t nmatch;
141 regmatch_t pmatch[];
142 int eflags;
143 {
144           RCHAR_T *endp;
145           size_t i;
146           struct match mv;
147           struct match *m = &mv;
148           RCHAR_T *dp;
149           const sopno gf = g->firststate+1;       /* +1 for OEND */
150           const sopno gl = g->laststate;
151           RCHAR_T *start;
152           RCHAR_T *stop;
153           RCHAR_T empty[] = { REOF };
154 
155           /* Input can be a NULL pointer, treat like an empty line. */
156           if (string == NULL)
157                     string = empty;
158 
159           /* simplify the situation where possible */
160           if (g->cflags&REG_NOSUB)
161                     nmatch = 0;
162           if (eflags&REG_STARTEND) {
163                     start = string + pmatch[0].rm_so;
164                     stop = string + pmatch[0].rm_eo;
165           } else {
166                     start = string;
167                     stop = start + STRLEN(start);
168           }
169           if (stop < start)
170                     return(REG_INVARG);
171 
172           /* prescreening; this does wonders for this rather slow code */
173           if (g->must != NULL) {
174                     for (dp = start; dp < stop; dp++)
175                               if (*dp == g->must[0] && (size_t)(stop - dp) >= g->mlen &&
176                                         MEMCMP(dp, g->must, g->mlen) == 0)
177                                         break;
178                     if (dp == stop)               /* we didn't find g->must */
179                               return(REG_NOMATCH);
180           }
181 
182           /* match struct setup */
183           m->g = g;
184           m->eflags = eflags;
185           m->pmatch = NULL;
186           m->lastpos = NULL;
187           m->offp = string;
188           m->beginp = start;
189           m->endp = stop;
190           STATESETUP(m, 4);
191           SETUP(m->st);
192           SETUP(m->fresh);
193           SETUP(m->tmp);
194           SETUP(m->empty);
195           CLEAR(m->empty);
196 
197           /* this loop does only one repetition except for backrefs */
198           for (;;) {
199                     endp = fast(m, start, stop, gf, gl);
200                     if (endp == NULL) {           /* a miss */
201                               STATETEARDOWN(m);
202                               return(REG_NOMATCH);
203                     }
204                     if (nmatch == 0 && !g->backrefs)
205                               break;              /* no further info needed */
206 
207                     /* where? */
208                     assert(m->coldp != NULL);
209                     for (;;) {
210                               NOTE("finding start");
211                               endp = slow(m, m->coldp, stop, gf, gl);
212                               if (endp != NULL)
213                                         break;
214                               assert(m->coldp < m->endp);
215                               m->coldp++;
216                     }
217                     if (nmatch == 1 && !g->backrefs)
218                               break;              /* no further info needed */
219 
220                     /* oh my, he wants the subexpressions... */
221                     if (m->pmatch == NULL)
222                               m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
223                                                                       sizeof(regmatch_t));
224                     if (m->pmatch == NULL) {
225                               STATETEARDOWN(m);
226                               return(REG_ESPACE);
227                     }
228                     for (i = 1; i <= m->g->nsub; i++)
229                               m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
230                     if (!g->backrefs && !(m->eflags&REG_BACKR)) {
231                               NOTE("dissecting");
232                               dp = dissect(m, m->coldp, endp, gf, gl);
233                     } else {
234                               if (g->nplus > 0 && m->lastpos == NULL)
235                                         m->lastpos = (RCHAR_T **)malloc((g->nplus+1) *
236                                                                       sizeof(RCHAR_T *));
237                               if (g->nplus > 0 && m->lastpos == NULL) {
238                                         free(m->pmatch);
239                                         STATETEARDOWN(m);
240                                         return(REG_ESPACE);
241                               }
242                               NOTE("backref dissect");
243                               dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
244                     }
245                     if (dp != NULL)
246                               break;
247 
248                     /* uh-oh... we couldn't find a subexpression-level match */
249                     assert(g->backrefs);          /* must be back references doing it */
250                     assert(g->nplus == 0 || m->lastpos != NULL);
251                     for (;;) {
252                               if (dp != NULL || endp <= m->coldp)
253                                         break;              /* defeat */
254                               NOTE("backoff");
255                               endp = slow(m, m->coldp, endp-1, gf, gl);
256                               if (endp == NULL)
257                                         break;              /* defeat */
258                               /* try it on a shorter possibility */
259 #ifndef NDEBUG
260                               for (i = 1; i <= m->g->nsub; i++) {
261                                         assert(m->pmatch[i].rm_so == -1);
262                                         assert(m->pmatch[i].rm_eo == -1);
263                               }
264 #endif
265                               NOTE("backoff dissect");
266                               dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
267                     }
268                     assert(dp == NULL || dp == endp);
269                     if (dp != NULL)               /* found a shorter one */
270                               break;
271 
272                     /* despite initial appearances, there is no match here */
273                     NOTE("false alarm");
274                     start = m->coldp + 1;         /* recycle starting later */
275                     assert(start <= stop);
276           }
277 
278           /* fill in the details if requested */
279           if (nmatch > 0) {
280                     pmatch[0].rm_so = m->coldp - m->offp;
281                     pmatch[0].rm_eo = endp - m->offp;
282           }
283           if (nmatch > 1) {
284                     assert(m->pmatch != NULL);
285                     for (i = 1; i < nmatch; i++)
286                               if (i <= m->g->nsub)
287                                         pmatch[i] = m->pmatch[i];
288                               else {
289                                         pmatch[i].rm_so = -1;
290                                         pmatch[i].rm_eo = -1;
291                               }
292           }
293 
294           if (m->pmatch != NULL)
295                     free((char *)m->pmatch);
296           if (m->lastpos != NULL)
297                     free((char *)m->lastpos);
298           STATETEARDOWN(m);
299           return(0);
300 }
301 
302 /*
303  - dissect - figure out what matched what, no back references
304  == static RCHAR_T *dissect(struct match *m, RCHAR_T *start, \
305  ==       RCHAR_T *stop, sopno startst, sopno stopst);
306  */
307 static RCHAR_T *                        /* == stop (success) always */
dissect(m,start,stop,startst,stopst)308 dissect(m, start, stop, startst, stopst)
309 struct match *m;
310 RCHAR_T *start;
311 RCHAR_T *stop;
312 sopno startst;
313 sopno stopst;
314 {
315           int i;
316           sopno ss; /* start sop of current subRE */
317           sopno es; /* end sop of current subRE */
318           RCHAR_T *sp;        /* start of string matched by it */
319           RCHAR_T *stp;       /* string matched by it cannot pass here */
320           RCHAR_T *rest;      /* start of rest of string */
321           RCHAR_T *tail;      /* string unmatched by rest of RE */
322           sopno ssub;         /* start sop of subsubRE */
323           sopno esub;         /* end sop of subsubRE */
324           RCHAR_T *ssp;       /* start of string matched by subsubRE */
325           RCHAR_T *sep;       /* end of string matched by subsubRE */
326           RCHAR_T *oldssp;    /* previous ssp */
327           RCHAR_T *dp;
328 
329           AT("diss", start, stop, startst, stopst);
330           sp = start;
331           for (ss = startst; ss < stopst; ss = es) {
332                     /* identify end of subRE */
333                     es = ss;
334                     switch (m->g->strip[es]) {
335                     case OPLUS_:
336                     case OQUEST_:
337                               es += m->g->stripdata[es];
338                               break;
339                     case OCH_:
340                               while (m->g->strip[es] != O_CH)
341                                         es += m->g->stripdata[es];
342                               break;
343                     }
344                     es++;
345 
346                     /* figure out what it matched */
347                     switch (m->g->strip[ss]) {
348                     case OEND:
349                               assert(nope);
350                               break;
351                     case OCHAR:
352                               sp++;
353                               break;
354                     case OBOL:
355                     case OEOL:
356                     case OBOW:
357                     case OEOW:
358                               break;
359                     case OANY:
360                     case OANYOF:
361                               sp++;
362                               break;
363                     case OBACK_:
364                     case O_BACK:
365                               assert(nope);
366                               break;
367                     /* cases where length of match is hard to find */
368                     case OQUEST_:
369                               stp = stop;
370                               for (;;) {
371                                         /* how long could this one be? */
372                                         rest = slow(m, sp, stp, ss, es);
373                                         assert(rest != NULL);         /* it did match */
374                                         /* could the rest match the rest? */
375                                         tail = slow(m, rest, stop, es, stopst);
376                                         if (tail == stop)
377                                                   break;              /* yes! */
378                                         /* no -- try a shorter match for this one */
379                                         stp = rest - 1;
380                                         assert(stp >= sp);  /* it did work */
381                               }
382                               ssub = ss + 1;
383                               esub = es - 1;
384                               /* did innards match? */
385                               if (slow(m, sp, rest, ssub, esub) != NULL) {
386                                         dp = dissect(m, sp, rest, ssub, esub);
387                                         __USE(dp);
388                                         assert(dp == rest);
389                               } else              /* no */
390                                         assert(sp == rest);
391                               sp = rest;
392                               break;
393                     case OPLUS_:
394                               stp = stop;
395                               for (;;) {
396                                         /* how long could this one be? */
397                                         rest = slow(m, sp, stp, ss, es);
398                                         assert(rest != NULL);         /* it did match */
399                                         /* could the rest match the rest? */
400                                         tail = slow(m, rest, stop, es, stopst);
401                                         if (tail == stop)
402                                                   break;              /* yes! */
403                                         /* no -- try a shorter match for this one */
404                                         stp = rest - 1;
405                                         assert(stp >= sp);  /* it did work */
406                               }
407                               ssub = ss + 1;
408                               esub = es - 1;
409                               ssp = sp;
410                               oldssp = ssp;
411                               for (;;) {          /* find last match of innards */
412                                         sep = slow(m, ssp, rest, ssub, esub);
413                                         if (sep == NULL || sep == ssp)
414                                                   break;    /* failed or matched null */
415                                         oldssp = ssp;       /* on to next try */
416                                         ssp = sep;
417                               }
418                               if (sep == NULL) {
419                                         /* last successful match */
420                                         sep = ssp;
421                                         ssp = oldssp;
422                               }
423                               assert(sep == rest);          /* must exhaust substring */
424                               assert(slow(m, ssp, sep, ssub, esub) == rest);
425                               dp = dissect(m, ssp, sep, ssub, esub);
426                               assert(dp == sep);
427                               sp = rest;
428                               break;
429                     case OCH_:
430                               stp = stop;
431                               for (;;) {
432                                         /* how long could this one be? */
433                                         rest = slow(m, sp, stp, ss, es);
434                                         assert(rest != NULL);         /* it did match */
435                                         /* could the rest match the rest? */
436                                         tail = slow(m, rest, stop, es, stopst);
437                                         if (tail == stop)
438                                                   break;              /* yes! */
439                                         /* no -- try a shorter match for this one */
440                                         stp = rest - 1;
441                                         assert(stp >= sp);  /* it did work */
442                               }
443                               ssub = ss + 1;
444                               esub = ss + m->g->stripdata[ss] - 1;
445                               assert(m->g->strip[esub] == OOR1);
446                               for (;;) {          /* find first matching branch */
447                                         if (slow(m, sp, rest, ssub, esub) == rest)
448                                                   break;    /* it matched all of it */
449                                         /* that one missed, try next one */
450                                         assert(m->g->strip[esub] == OOR1);
451                                         esub++;
452                                         assert(m->g->strip[esub] == OOR2);
453                                         ssub = esub + 1;
454                                         esub += m->g->stripdata[esub];
455                                         if (m->g->strip[esub] == OOR2)
456                                                   esub--;
457                                         else
458                                                   assert(m->g->strip[esub] == O_CH);
459                               }
460                               dp = dissect(m, sp, rest, ssub, esub);
461                               assert(dp == rest);
462                               sp = rest;
463                               break;
464                     case O_PLUS:
465                     case O_QUEST:
466                     case OOR1:
467                     case OOR2:
468                     case O_CH:
469                               assert(nope);
470                               break;
471                     case OLPAREN:
472                               i = m->g->stripdata[ss];
473                               assert(0 < i && i <= m->g->nsub);
474                               m->pmatch[i].rm_so = sp - m->offp;
475                               break;
476                     case ORPAREN:
477                               i = m->g->stripdata[ss];
478                               assert(0 < i && i <= m->g->nsub);
479                               m->pmatch[i].rm_eo = sp - m->offp;
480                               break;
481                     default:            /* uh oh */
482                               assert(nope);
483                               break;
484                     }
485           }
486 
487           assert(sp == stop);
488           return(sp);
489 }
490 
491 /*
492  - backref - figure out what matched what, figuring in back references
493  == static RCHAR_T *backref(struct match *m, RCHAR_T *start, \
494  ==       RCHAR_T *stop, sopno startst, sopno stopst, sopno lev);
495  */
496 static RCHAR_T *                        /* == stop (success) or NULL (failure) */
backref(m,start,stop,startst,stopst,lev)497 backref(m, start, stop, startst, stopst, lev)
498 struct match *m;
499 RCHAR_T *start;
500 RCHAR_T *stop;
501 sopno startst;
502 sopno stopst;
503 sopno lev;                              /* PLUS nesting level */
504 {
505           int i;
506           sopno ss; /* start sop of current subRE */
507           RCHAR_T *sp;        /* start of string matched by it */
508           sopno ssub;         /* start sop of subsubRE */
509           sopno esub;         /* end sop of subsubRE */
510           RCHAR_T *ssp;       /* start of string matched by subsubRE */
511           RCHAR_T *dp;
512           size_t len;
513           int hard;
514           sop s;
515           RCHAR_T d;
516           regoff_t offsave;
517           cset *cs;
518 
519           AT("back", start, stop, startst, stopst);
520           sp = start;
521 
522           /* get as far as we can with easy stuff */
523           hard = 0;
524           for (ss = startst; !hard && ss < stopst; ss++) {
525                     s = m->g->strip[ss];
526                     d = m->g->stripdata[ss];
527                     switch (s) {
528                     case OCHAR:
529                               if (sp == stop || *sp++ != d)
530                                         return(NULL);
531                               break;
532                     case OANY:
533                               if (sp == stop)
534                                         return(NULL);
535                               sp++;
536                               break;
537                     case OANYOF:
538                               cs = &m->g->sets[d];
539                               if (sp == stop || !CHIN(cs, *sp++))
540                                         return(NULL);
541                               break;
542                     case OBOL:
543                               if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
544                                                   (sp < m->endp && *(sp-1) == '\n' &&
545                                                             (m->g->cflags&REG_NEWLINE)) )
546                                         { /* yes */ }
547                               else
548                                         return(NULL);
549                               break;
550                     case OEOL:
551                               if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
552                                                   (sp < m->endp && *sp == '\n' &&
553                                                             (m->g->cflags&REG_NEWLINE)) )
554                                         { /* yes */ }
555                               else
556                                         return(NULL);
557                               break;
558                     case OBOW:
559                               if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
560                                                   (sp < m->endp && *(sp-1) == '\n' &&
561                                                             (m->g->cflags&REG_NEWLINE)) ||
562                                                   (sp > m->beginp &&
563                                                                       !ISWORD(*(sp-1))) ) &&
564                                                   (sp < m->endp && ISWORD(*sp)) )
565                                         { /* yes */ }
566                               else
567                                         return(NULL);
568                               break;
569                     case OEOW:
570                               if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
571                                                   (sp < m->endp && *sp == '\n' &&
572                                                             (m->g->cflags&REG_NEWLINE)) ||
573                                                   (sp < m->endp && !ISWORD(*sp)) ) &&
574                                                   (sp > m->beginp && ISWORD(*(sp-1))) )
575                                         { /* yes */ }
576                               else
577                                         return(NULL);
578                               break;
579                     case O_QUEST:
580                               break;
581                     case OOR1:          /* matches null but needs to skip */
582                               ss++;
583                               s = m->g->strip[ss];
584                               d = m->g->stripdata[ss];
585                               do {
586                                         assert(s == OOR2);
587                                         ss += d;
588                                         s = m->g->strip[ss];
589                                         d = m->g->stripdata[ss];
590                               } while (s != O_CH);
591                               /* note that the ss++ gets us past the O_CH */
592                               break;
593                     default:  /* have to make a choice */
594                               hard = 1;
595                               break;
596                     }
597           }
598           if (!hard) {                  /* that was it! */
599                     if (sp != stop)
600                               return(NULL);
601                     return(sp);
602           }
603           ss--;                         /* adjust for the for's final increment */
604 
605           /* the hard stuff */
606           AT("hard", sp, stop, ss, stopst);
607           s = m->g->strip[ss];
608           d = m->g->stripdata[ss];
609           switch (s) {
610           case OBACK_:                  /* the vilest depths */
611                     i = d;
612                     assert(0 < i && i <= m->g->nsub);
613                     if (m->pmatch[i].rm_eo == -1)
614                               return(NULL);
615                     assert(m->pmatch[i].rm_so != -1);
616                     len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
617                     assert(stop - m->beginp >= len);
618                     if (sp > stop - len)
619                               return(NULL);       /* not enough left to match */
620                     ssp = m->offp + m->pmatch[i].rm_so;
621                     if (memcmp(sp, ssp, len) != 0)
622                               return(NULL);
623                     while (m->g->strip[ss] != O_BACK || m->g->stripdata[ss] != i)
624                               ss++;
625                     return(backref(m, sp+len, stop, ss+1, stopst, lev));
626                     break;
627           case OQUEST_:                 /* to null or not */
628                     dp = backref(m, sp, stop, ss+1, stopst, lev);
629                     if (dp != NULL)
630                               return(dp);         /* not */
631                     return(backref(m, sp, stop, ss+d+1, stopst, lev));
632                     break;
633           case OPLUS_:
634                     assert(m->lastpos != NULL);
635                     assert(lev+1 <= m->g->nplus);
636                     m->lastpos[lev+1] = sp;
637                     return(backref(m, sp, stop, ss+1, stopst, lev+1));
638                     break;
639           case O_PLUS:
640                     if (sp == m->lastpos[lev])    /* last pass matched null */
641                               return(backref(m, sp, stop, ss+1, stopst, lev-1));
642                     /* try another pass */
643                     m->lastpos[lev] = sp;
644                     dp = backref(m, sp, stop, ss-d+1, stopst, lev);
645                     if (dp == NULL)
646                               return(backref(m, sp, stop, ss+1, stopst, lev-1));
647                     else
648                               return(dp);
649                     break;
650           case OCH_:                    /* find the right one, if any */
651                     ssub = ss + 1;
652                     esub = ss + d - 1;
653                     assert(m->g->strip[esub] == OOR1);
654                     for (;;) {          /* find first matching branch */
655                               dp = backref(m, sp, stop, ssub, esub, lev);
656                               if (dp != NULL)
657                                         return(dp);
658                               /* that one missed, try next one */
659                               if (m->g->strip[esub] == O_CH)
660                                         return(NULL);       /* there is none */
661                               esub++;
662                               assert(m->g->strip[esub] == OOR2);
663                               ssub = esub + 1;
664                               esub += m->g->stripdata[esub];
665                               if (m->g->strip[esub] == OOR2)
666                                         esub--;
667                               else
668                                         assert(m->g->strip[esub] == O_CH);
669                     }
670                     break;
671           case OLPAREN:                 /* must undo assignment if rest fails */
672                     i = d;
673                     assert(0 < i && i <= m->g->nsub);
674                     offsave = m->pmatch[i].rm_so;
675                     m->pmatch[i].rm_so = sp - m->offp;
676                     dp = backref(m, sp, stop, ss+1, stopst, lev);
677                     if (dp != NULL)
678                               return(dp);
679                     m->pmatch[i].rm_so = offsave;
680                     return(NULL);
681                     break;
682           case ORPAREN:                 /* must undo assignment if rest fails */
683                     i = d;
684                     assert(0 < i && i <= m->g->nsub);
685                     offsave = m->pmatch[i].rm_eo;
686                     m->pmatch[i].rm_eo = sp - m->offp;
687                     dp = backref(m, sp, stop, ss+1, stopst, lev);
688                     if (dp != NULL)
689                               return(dp);
690                     m->pmatch[i].rm_eo = offsave;
691                     return(NULL);
692                     break;
693           default:            /* uh oh */
694                     assert(nope);
695                     break;
696           }
697 
698           /* "can't happen" */
699           assert(nope);
700           /* NOTREACHED */
701           return NULL;
702 }
703 
704 /*
705  - fast - step through the string at top speed
706  == static RCHAR_T *fast(struct match *m, RCHAR_T *start, \
707  ==       RCHAR_T *stop, sopno startst, sopno stopst);
708  */
709 static RCHAR_T *                        /* where tentative match ended, or NULL */
fast(m,start,stop,startst,stopst)710 fast(m, start, stop, startst, stopst)
711 struct match *m;
712 RCHAR_T *start;
713 RCHAR_T *stop;
714 sopno startst;
715 sopno stopst;
716 {
717           states st = m->st;
718           states fresh = m->fresh;
719           states tmp = m->tmp;
720           RCHAR_T *p = start;
721           RCHAR_T c = (start == m->beginp) ? OUT : *(start-1);
722           RCHAR_T lastc;      /* previous c */
723           int flag;
724           int i;
725           RCHAR_T *coldp;     /* last p after which no match was underway */
726 
727           CLEAR(st);
728           SET1(st, startst);
729           st = step(m->g, startst, stopst, st, NOTHING, OUT, st);
730           ASSIGN(fresh, st);
731           SP("start", st, *p);
732           coldp = NULL;
733           for (;;) {
734                     /* next character */
735                     lastc = c;
736                     c = (p == m->endp) ? OUT : *p;
737                     if (EQ(st, fresh))
738                               coldp = p;
739 
740                     /* is there an EOL and/or BOL between lastc and c? */
741                     flag = 0;
742                     i = 0;
743                     if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
744                                         (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
745                               flag = BOL;
746                               i = m->g->nbol;
747                     }
748                     if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
749                                         (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
750                               flag = (flag == BOL) ? BOLEOL : EOL;
751                               i += m->g->neol;
752                     }
753                     if (i != 0) {
754                               for (; i > 0; i--)
755                                         st = step(m->g, startst, stopst, st, flag, OUT, st);
756                               SP("boleol", st, c);
757                     }
758 
759                     /* how about a word boundary? */
760                     if ( (flag == BOL || (lastc != OUT && !ISWORD(lastc))) &&
761                                                   (c != OUT && ISWORD(c)) ) {
762                               flag = BOW;
763                     }
764                     if ( (lastc != OUT && ISWORD(lastc)) &&
765                                         (flag == EOL || (c != OUT && !ISWORD(c))) ) {
766                               flag = EOW;
767                     }
768                     if (flag == BOW || flag == EOW) {
769                               st = step(m->g, startst, stopst, st, flag, OUT, st);
770                               SP("boweow", st, c);
771                     }
772 
773                     /* are we done? */
774                     if (ISSET(st, stopst) || p == stop)
775                               break;              /* NOTE BREAK OUT */
776 
777                     /* no, we must deal with this character */
778                     ASSIGN(tmp, st);
779                     ASSIGN(st, fresh);
780                     assert(c != OUT);
781                     st = step(m->g, startst, stopst, tmp, 0, c, st);
782                     SP("aft", st, c);
783                     assert(EQ(step(m->g, startst, stopst, st, NOTHING, OUT, st), st));
784                     p++;
785           }
786 
787           assert(coldp != NULL);
788           m->coldp = coldp;
789           if (ISSET(st, stopst))
790                     return(p+1);
791           else
792                     return(NULL);
793 }
794 
795 /*
796  - slow - step through the string more deliberately
797  == static RCHAR_T *slow(struct match *m, RCHAR_T *start, \
798  ==       RCHAR_T *stop, sopno startst, sopno stopst);
799  */
800 static RCHAR_T *                        /* where it ended */
slow(m,start,stop,startst,stopst)801 slow(m, start, stop, startst, stopst)
802 struct match *m;
803 RCHAR_T *start;
804 RCHAR_T *stop;
805 sopno startst;
806 sopno stopst;
807 {
808           states st = m->st;
809           states empty = m->empty;
810           states tmp = m->tmp;
811           RCHAR_T *p = start;
812           RCHAR_T c = (start == m->beginp) ? OUT : *(start-1);
813           RCHAR_T lastc;      /* previous c */
814           int flag;
815           int i;
816           RCHAR_T *matchp;    /* last p at which a match ended */
817 
818           AT("slow", start, stop, startst, stopst);
819           CLEAR(st);
820           SET1(st, startst);
821           SP("sstart", st, *p);
822           st = step(m->g, startst, stopst, st, NOTHING, OUT, st);
823           matchp = NULL;
824           for (;;) {
825                     /* next character */
826                     lastc = c;
827                     c = (p == m->endp) ? OUT : *p;
828 
829                     /* is there an EOL and/or BOL between lastc and c? */
830                     flag = 0;
831                     i = 0;
832                     if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
833                                         (lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
834                               flag = BOL;
835                               i = m->g->nbol;
836                     }
837                     if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
838                                         (c == OUT && !(m->eflags&REG_NOTEOL)) ) {
839                               flag = (flag == BOL) ? BOLEOL : EOL;
840                               i += m->g->neol;
841                     }
842                     if (i != 0) {
843                               for (; i > 0; i--)
844                                         st = step(m->g, startst, stopst, st, flag, OUT, st);
845                               SP("sboleol", st, c);
846                     }
847 
848                     /* how about a word boundary? */
849                     if ( (flag == BOL || (lastc != OUT && !ISWORD(lastc))) &&
850                                                   (c != OUT && ISWORD(c)) ) {
851                               flag = BOW;
852                     }
853                     if ( (lastc != OUT && ISWORD(lastc)) &&
854                                         (flag == EOL || (c != OUT && !ISWORD(c))) ) {
855                               flag = EOW;
856                     }
857                     if (flag == BOW || flag == EOW) {
858                               st = step(m->g, startst, stopst, st, flag, OUT, st);
859                               SP("sboweow", st, c);
860                     }
861 
862                     /* are we done? */
863                     if (ISSET(st, stopst))
864                               matchp = p;
865                     if (EQ(st, empty) || p == stop)
866                               break;              /* NOTE BREAK OUT */
867 
868                     /* no, we must deal with this character */
869                     ASSIGN(tmp, st);
870                     ASSIGN(st, empty);
871                     assert(c != OUT);
872                     st = step(m->g, startst, stopst, tmp, 0, c, st);
873                     SP("saft", st, c);
874                     assert(EQ(step(m->g, startst, stopst, st, NOTHING, OUT, st), st));
875                     p++;
876           }
877 
878           return(matchp);
879 }
880 
881 
882 /*
883  - step - map set of states reachable before char to set reachable after
884  == static states step(struct re_guts *g, sopno start, sopno stop, \
885  ==       states bef, int flag, RCHAR_T ch, states aft);
886  == #define         BOL       (1)
887  == #define         EOL       (BOL+1)
888  == #define         BOLEOL    (BOL+2)
889  == #define         NOTHING   (BOL+3)
890  == #define         BOW       (BOL+4)
891  == #define         EOW       (BOL+5)
892  */
893 static states
step(g,start,stop,bef,flag,ch,aft)894 step(g, start, stop, bef, flag, ch, aft)
895 struct re_guts *g;
896 sopno start;                            /* start state within strip */
897 sopno stop;                             /* state after stop state within strip */
898 states bef;                   /* states reachable before */
899 int flag;                     /* NONCHAR flag */
900 RCHAR_T ch;                             /* character code */
901 states aft;                   /* states already known reachable after */
902 {
903           cset *cs;
904           sop s;
905           RCHAR_T d;
906           sopno pc;
907           onestate here;                /* note, macros know this name */
908           sopno look;
909           int i;
910 
911           for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
912                     s = g->strip[pc];
913                     d = g->stripdata[pc];
914                     switch (s) {
915                     case OEND:
916                               assert(pc == stop-1);
917                               break;
918                     case OCHAR:
919                               /* only characters can match */
920                               assert(!flag || ch != d);
921                               if (ch == d)
922                                         FWD(aft, bef, 1);
923                               break;
924                     case OBOL:
925                               if (flag == BOL || flag == BOLEOL)
926                                         FWD(aft, bef, 1);
927                               break;
928                     case OEOL:
929                               if (flag == EOL || flag == BOLEOL)
930                                         FWD(aft, bef, 1);
931                               break;
932                     case OBOW:
933                               if (flag == BOW)
934                                         FWD(aft, bef, 1);
935                               break;
936                     case OEOW:
937                               if (flag == EOW)
938                                         FWD(aft, bef, 1);
939                               break;
940                     case OANY:
941                               if (!flag)
942                                         FWD(aft, bef, 1);
943                               break;
944                     case OANYOF:
945                               cs = &g->sets[d];
946                               if (!flag && CHIN(cs, ch))
947                                         FWD(aft, bef, 1);
948                               break;
949                     case OBACK_:                  /* ignored here */
950                     case O_BACK:
951                               FWD(aft, aft, 1);
952                               break;
953                     case OPLUS_:                  /* forward, this is just an empty */
954                               FWD(aft, aft, 1);
955                               break;
956                     case O_PLUS:                  /* both forward and back */
957                               FWD(aft, aft, 1);
958                               i = ISSETBACK(aft, d);
959                               BACK(aft, aft, d);
960                               if (!i && ISSETBACK(aft, d)) {
961                                         /* oho, must reconsider loop body */
962                                         pc -= d + 1;
963                                         INIT(here, pc);
964                               }
965                               break;
966                     case OQUEST_:                 /* two branches, both forward */
967                               FWD(aft, aft, 1);
968                               FWD(aft, aft, d);
969                               break;
970                     case O_QUEST:                 /* just an empty */
971                               FWD(aft, aft, 1);
972                               break;
973                     case OLPAREN:                 /* not significant here */
974                     case ORPAREN:
975                               FWD(aft, aft, 1);
976                               break;
977                     case OCH_:                    /* mark the first two branches */
978                               FWD(aft, aft, 1);
979                               assert(OP(g->strip[pc+d]) == OOR2);
980                               FWD(aft, aft, d);
981                               break;
982                     case OOR1:                    /* done a branch, find the O_CH */
983                               if (ISSTATEIN(aft, here)) {
984                                         for (look = 1; /**/; look += d) {
985                                                   s = g->strip[pc+look];
986                                                   d = g->stripdata[pc+look];
987                                                   if (s == O_CH)
988                                                             break;
989                                                   assert(s == OOR2);
990                                         }
991                                         FWD(aft, aft, look);
992                               }
993                               break;
994                     case OOR2:                    /* propagate OCH_'s marking */
995                               FWD(aft, aft, 1);
996                               if (g->strip[pc+d] != O_CH) {
997                                         assert(g->strip[pc+d] == OOR2);
998                                         FWD(aft, aft, d);
999                               }
1000                               break;
1001                     case O_CH:                    /* just empty */
1002                               FWD(aft, aft, 1);
1003                               break;
1004                     default:            /* ooooops... */
1005                               assert(nope);
1006                               break;
1007                     }
1008           }
1009 
1010           return(aft);
1011 }
1012 
1013 #ifdef REDEBUG
1014 /*
1015  - print - print a set of states
1016  == #ifdef REDEBUG
1017  == static void print(struct match *m, char *caption, states st, \
1018  ==       int ch, FILE *d);
1019  == #endif
1020  */
1021 static void
print(m,caption,st,ch,d)1022 print(m, caption, st, ch, d)
1023 struct match *m;
1024 char *caption;
1025 states st;
1026 int ch;
1027 FILE *d;
1028 {
1029           struct re_guts *g = m->g;
1030           int i;
1031           int first = 1;
1032 
1033           if (!(m->eflags&REG_TRACE))
1034                     return;
1035 
1036           fprintf(d, "%s", caption);
1037           if (ch != '\0')
1038                     fprintf(d, " %s", pchar(ch));
1039           for (i = 0; i < g->nstates; i++)
1040                     if (ISSET(st, i)) {
1041                               fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
1042                               first = 0;
1043                     }
1044           fprintf(d, "\n");
1045 }
1046 
1047 /*
1048  - at - print current situation
1049  == #ifdef REDEBUG
1050  == static void at(struct match *m, char *title, char *start, char *stop, \
1051  ==                                                         sopno startst, sopno stopst);
1052  == #endif
1053  */
1054 static void
at(m,title,start,stop,startst,stopst)1055 at(m, title, start, stop, startst, stopst)
1056 struct match *m;
1057 char *title;
1058 char *start;
1059 char *stop;
1060 sopno startst;
1061 sopno stopst;
1062 {
1063           if (!(m->eflags&REG_TRACE))
1064                     return;
1065 
1066           printf("%s %s-", title, pchar(*start));
1067           printf("%s ", pchar(*stop));
1068           printf("%ld-%ld\n", (long)startst, (long)stopst);
1069 }
1070 
1071 #ifndef PCHARDONE
1072 #define   PCHARDONE /* never again */
1073 /*
1074  - pchar - make a character printable
1075  == #ifdef REDEBUG
1076  == static char *pchar(int ch);
1077  == #endif
1078  *
1079  * Is this identical to regchar() over in debug.c?  Well, yes.  But a
1080  * duplicate here avoids having a debugging-capable regexec.o tied to
1081  * a matching debug.o, and this is convenient.  It all disappears in
1082  * the non-debug compilation anyway, so it doesn't matter much.
1083  */
1084 static char *                           /* -> representation */
pchar(ch)1085 pchar(ch)
1086 int ch;
1087 {
1088           static char pbuf[10];
1089 
1090           if (isprint(ch) || ch == ' ')
1091                     sprintf(pbuf, "%c", ch);
1092           else
1093                     sprintf(pbuf, "\\%o", ch);
1094           return(pbuf);
1095 }
1096 #endif
1097 #endif
1098 
1099 #undef    matcher
1100 #undef    fast
1101 #undef    slow
1102 #undef    dissect
1103 #undef    backref
1104 #undef    step
1105 #undef    print
1106 #undef    at
1107 #undef    match
1108