1 /* $NetBSD: engine.c,v 1.4 2024/11/10 10:54:29 mlelstv Exp $ */
2 /*-
3 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
4 * Copyright (c) 1992, 1993, 1994
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Henry Spencer of the University of Toronto.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * @(#)engine.c 8.4 (Berkeley) 3/19/94
39 */
40
41 /*
42 * The matching engine and friends. This file is #included by regexec.c
43 * after suitable #defines of a variety of macros used herein, so that
44 * different state representations can be used without duplicating masses
45 * of code.
46 */
47
48 #ifdef SNAMES
49 #define matcher smatcher
50 #define fast sfast
51 #define slow sslow
52 #define dissect sdissect
53 #define backref sbackref
54 #define step sstep
55 #define print sprint
56 #define at sat
57 #define match smat
58 #endif
59 #ifdef LNAMES
60 #define matcher lmatcher
61 #define fast lfast
62 #define slow lslow
63 #define dissect ldissect
64 #define backref lbackref
65 #define step lstep
66 #define print lprint
67 #define at lat
68 #define match lmat
69 #endif
70
71 /* another structure passed up and down to avoid zillions of parameters */
72 struct match {
73 struct re_guts *g;
74 int eflags;
75 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
76 RCHAR_T *offp; /* offsets work from here */
77 RCHAR_T *beginp; /* start of string -- virtual NUL precedes */
78 RCHAR_T *endp; /* end of string -- virtual NUL here */
79 RCHAR_T *coldp; /* can be no match starting before here */
80 RCHAR_T **lastpos; /* [nplus+1] */
81 STATEVARS;
82 states st; /* current states */
83 states fresh; /* states for a fresh start */
84 states tmp; /* temporary */
85 states empty; /* empty set of states */
86 };
87
88 /* ========= begin header generated by ./mkh ========= */
89 #ifdef __cplusplus
90 extern "C" {
91 #endif
92
93 /* === engine.c === */
94 static int matcher __P((struct re_guts *g, RCHAR_T *string, size_t nmatch, regmatch_t pmatch[], int eflags));
95 static RCHAR_T *dissect __P((struct match *m, RCHAR_T *start, RCHAR_T *stop, sopno startst, sopno stopst));
96 static RCHAR_T *backref __P((struct match *m, RCHAR_T *start, RCHAR_T *stop, sopno startst, sopno stopst, sopno lev));
97 static RCHAR_T *fast __P((struct match *m, RCHAR_T *start, RCHAR_T *stop, sopno startst, sopno stopst));
98 static RCHAR_T *slow __P((struct match *m, RCHAR_T *start, RCHAR_T *stop, sopno startst, sopno stopst));
99 static states step __P((struct re_guts *g, sopno start, sopno stop, states bef, int flag, RCHAR_T ch, states aft));
100 #define BOL (1)
101 #define EOL (BOL+1)
102 #define BOLEOL (BOL+2)
103 #define NOTHING (BOL+3)
104 #define BOW (BOL+4)
105 #define EOW (BOL+5)
106 #ifdef REDEBUG
107 static void print __P((struct match *m, char *caption, states st, int ch, FILE *d));
108 #endif
109 #ifdef REDEBUG
110 static void at __P((struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst));
111 #endif
112 #ifdef REDEBUG
113 static char *pchar __P((int ch));
114 #endif
115
116 #ifdef __cplusplus
117 }
118 #endif
119 /* ========= end header generated by ./mkh ========= */
120
121 #ifdef REDEBUG
122 #define SP(t, s, c) print(m, t, s, c, stdout)
123 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
124 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); }
125 #else
126 #define SP(t, s, c) /* nothing */
127 #define AT(t, p1, p2, s1, s2) /* nothing */
128 #define NOTE(s) /* nothing */
129 #endif
130
131 /*
132 - matcher - the actual matching engine
133 == static int matcher(struct re_guts *g, RCHAR_T *string, \
134 == size_t nmatch, regmatch_t pmatch[], int eflags);
135 */
136 static int /* 0 success, REG_NOMATCH failure */
matcher(g,string,nmatch,pmatch,eflags)137 matcher(g, string, nmatch, pmatch, eflags)
138 struct re_guts *g;
139 RCHAR_T *string;
140 size_t nmatch;
141 regmatch_t pmatch[];
142 int eflags;
143 {
144 RCHAR_T *endp;
145 size_t i;
146 struct match mv;
147 struct match *m = &mv;
148 RCHAR_T *dp;
149 const sopno gf = g->firststate+1; /* +1 for OEND */
150 const sopno gl = g->laststate;
151 RCHAR_T *start;
152 RCHAR_T *stop;
153 RCHAR_T empty[] = { REOF };
154
155 /* Input can be a NULL pointer, treat like an empty line. */
156 if (string == NULL)
157 string = empty;
158
159 /* simplify the situation where possible */
160 if (g->cflags®_NOSUB)
161 nmatch = 0;
162 if (eflags®_STARTEND) {
163 start = string + pmatch[0].rm_so;
164 stop = string + pmatch[0].rm_eo;
165 } else {
166 start = string;
167 stop = start + STRLEN(start);
168 }
169 if (stop < start)
170 return(REG_INVARG);
171
172 /* prescreening; this does wonders for this rather slow code */
173 if (g->must != NULL) {
174 for (dp = start; dp < stop; dp++)
175 if (*dp == g->must[0] && (size_t)(stop - dp) >= g->mlen &&
176 MEMCMP(dp, g->must, g->mlen) == 0)
177 break;
178 if (dp == stop) /* we didn't find g->must */
179 return(REG_NOMATCH);
180 }
181
182 /* match struct setup */
183 m->g = g;
184 m->eflags = eflags;
185 m->pmatch = NULL;
186 m->lastpos = NULL;
187 m->offp = string;
188 m->beginp = start;
189 m->endp = stop;
190 STATESETUP(m, 4);
191 SETUP(m->st);
192 SETUP(m->fresh);
193 SETUP(m->tmp);
194 SETUP(m->empty);
195 CLEAR(m->empty);
196
197 /* this loop does only one repetition except for backrefs */
198 for (;;) {
199 endp = fast(m, start, stop, gf, gl);
200 if (endp == NULL) { /* a miss */
201 STATETEARDOWN(m);
202 return(REG_NOMATCH);
203 }
204 if (nmatch == 0 && !g->backrefs)
205 break; /* no further info needed */
206
207 /* where? */
208 assert(m->coldp != NULL);
209 for (;;) {
210 NOTE("finding start");
211 endp = slow(m, m->coldp, stop, gf, gl);
212 if (endp != NULL)
213 break;
214 assert(m->coldp < m->endp);
215 m->coldp++;
216 }
217 if (nmatch == 1 && !g->backrefs)
218 break; /* no further info needed */
219
220 /* oh my, he wants the subexpressions... */
221 if (m->pmatch == NULL)
222 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
223 sizeof(regmatch_t));
224 if (m->pmatch == NULL) {
225 STATETEARDOWN(m);
226 return(REG_ESPACE);
227 }
228 for (i = 1; i <= m->g->nsub; i++)
229 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
230 if (!g->backrefs && !(m->eflags®_BACKR)) {
231 NOTE("dissecting");
232 dp = dissect(m, m->coldp, endp, gf, gl);
233 } else {
234 if (g->nplus > 0 && m->lastpos == NULL)
235 m->lastpos = (RCHAR_T **)malloc((g->nplus+1) *
236 sizeof(RCHAR_T *));
237 if (g->nplus > 0 && m->lastpos == NULL) {
238 free(m->pmatch);
239 STATETEARDOWN(m);
240 return(REG_ESPACE);
241 }
242 NOTE("backref dissect");
243 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
244 }
245 if (dp != NULL)
246 break;
247
248 /* uh-oh... we couldn't find a subexpression-level match */
249 assert(g->backrefs); /* must be back references doing it */
250 assert(g->nplus == 0 || m->lastpos != NULL);
251 for (;;) {
252 if (dp != NULL || endp <= m->coldp)
253 break; /* defeat */
254 NOTE("backoff");
255 endp = slow(m, m->coldp, endp-1, gf, gl);
256 if (endp == NULL)
257 break; /* defeat */
258 /* try it on a shorter possibility */
259 #ifndef NDEBUG
260 for (i = 1; i <= m->g->nsub; i++) {
261 assert(m->pmatch[i].rm_so == -1);
262 assert(m->pmatch[i].rm_eo == -1);
263 }
264 #endif
265 NOTE("backoff dissect");
266 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
267 }
268 assert(dp == NULL || dp == endp);
269 if (dp != NULL) /* found a shorter one */
270 break;
271
272 /* despite initial appearances, there is no match here */
273 NOTE("false alarm");
274 start = m->coldp + 1; /* recycle starting later */
275 assert(start <= stop);
276 }
277
278 /* fill in the details if requested */
279 if (nmatch > 0) {
280 pmatch[0].rm_so = m->coldp - m->offp;
281 pmatch[0].rm_eo = endp - m->offp;
282 }
283 if (nmatch > 1) {
284 assert(m->pmatch != NULL);
285 for (i = 1; i < nmatch; i++)
286 if (i <= m->g->nsub)
287 pmatch[i] = m->pmatch[i];
288 else {
289 pmatch[i].rm_so = -1;
290 pmatch[i].rm_eo = -1;
291 }
292 }
293
294 if (m->pmatch != NULL)
295 free((char *)m->pmatch);
296 if (m->lastpos != NULL)
297 free((char *)m->lastpos);
298 STATETEARDOWN(m);
299 return(0);
300 }
301
302 /*
303 - dissect - figure out what matched what, no back references
304 == static RCHAR_T *dissect(struct match *m, RCHAR_T *start, \
305 == RCHAR_T *stop, sopno startst, sopno stopst);
306 */
307 static RCHAR_T * /* == stop (success) always */
dissect(m,start,stop,startst,stopst)308 dissect(m, start, stop, startst, stopst)
309 struct match *m;
310 RCHAR_T *start;
311 RCHAR_T *stop;
312 sopno startst;
313 sopno stopst;
314 {
315 int i;
316 sopno ss; /* start sop of current subRE */
317 sopno es; /* end sop of current subRE */
318 RCHAR_T *sp; /* start of string matched by it */
319 RCHAR_T *stp; /* string matched by it cannot pass here */
320 RCHAR_T *rest; /* start of rest of string */
321 RCHAR_T *tail; /* string unmatched by rest of RE */
322 sopno ssub; /* start sop of subsubRE */
323 sopno esub; /* end sop of subsubRE */
324 RCHAR_T *ssp; /* start of string matched by subsubRE */
325 RCHAR_T *sep; /* end of string matched by subsubRE */
326 RCHAR_T *oldssp; /* previous ssp */
327 RCHAR_T *dp;
328
329 AT("diss", start, stop, startst, stopst);
330 sp = start;
331 for (ss = startst; ss < stopst; ss = es) {
332 /* identify end of subRE */
333 es = ss;
334 switch (m->g->strip[es]) {
335 case OPLUS_:
336 case OQUEST_:
337 es += m->g->stripdata[es];
338 break;
339 case OCH_:
340 while (m->g->strip[es] != O_CH)
341 es += m->g->stripdata[es];
342 break;
343 }
344 es++;
345
346 /* figure out what it matched */
347 switch (m->g->strip[ss]) {
348 case OEND:
349 assert(nope);
350 break;
351 case OCHAR:
352 sp++;
353 break;
354 case OBOL:
355 case OEOL:
356 case OBOW:
357 case OEOW:
358 break;
359 case OANY:
360 case OANYOF:
361 sp++;
362 break;
363 case OBACK_:
364 case O_BACK:
365 assert(nope);
366 break;
367 /* cases where length of match is hard to find */
368 case OQUEST_:
369 stp = stop;
370 for (;;) {
371 /* how long could this one be? */
372 rest = slow(m, sp, stp, ss, es);
373 assert(rest != NULL); /* it did match */
374 /* could the rest match the rest? */
375 tail = slow(m, rest, stop, es, stopst);
376 if (tail == stop)
377 break; /* yes! */
378 /* no -- try a shorter match for this one */
379 stp = rest - 1;
380 assert(stp >= sp); /* it did work */
381 }
382 ssub = ss + 1;
383 esub = es - 1;
384 /* did innards match? */
385 if (slow(m, sp, rest, ssub, esub) != NULL) {
386 dp = dissect(m, sp, rest, ssub, esub);
387 __USE(dp);
388 assert(dp == rest);
389 } else /* no */
390 assert(sp == rest);
391 sp = rest;
392 break;
393 case OPLUS_:
394 stp = stop;
395 for (;;) {
396 /* how long could this one be? */
397 rest = slow(m, sp, stp, ss, es);
398 assert(rest != NULL); /* it did match */
399 /* could the rest match the rest? */
400 tail = slow(m, rest, stop, es, stopst);
401 if (tail == stop)
402 break; /* yes! */
403 /* no -- try a shorter match for this one */
404 stp = rest - 1;
405 assert(stp >= sp); /* it did work */
406 }
407 ssub = ss + 1;
408 esub = es - 1;
409 ssp = sp;
410 oldssp = ssp;
411 for (;;) { /* find last match of innards */
412 sep = slow(m, ssp, rest, ssub, esub);
413 if (sep == NULL || sep == ssp)
414 break; /* failed or matched null */
415 oldssp = ssp; /* on to next try */
416 ssp = sep;
417 }
418 if (sep == NULL) {
419 /* last successful match */
420 sep = ssp;
421 ssp = oldssp;
422 }
423 assert(sep == rest); /* must exhaust substring */
424 assert(slow(m, ssp, sep, ssub, esub) == rest);
425 dp = dissect(m, ssp, sep, ssub, esub);
426 assert(dp == sep);
427 sp = rest;
428 break;
429 case OCH_:
430 stp = stop;
431 for (;;) {
432 /* how long could this one be? */
433 rest = slow(m, sp, stp, ss, es);
434 assert(rest != NULL); /* it did match */
435 /* could the rest match the rest? */
436 tail = slow(m, rest, stop, es, stopst);
437 if (tail == stop)
438 break; /* yes! */
439 /* no -- try a shorter match for this one */
440 stp = rest - 1;
441 assert(stp >= sp); /* it did work */
442 }
443 ssub = ss + 1;
444 esub = ss + m->g->stripdata[ss] - 1;
445 assert(m->g->strip[esub] == OOR1);
446 for (;;) { /* find first matching branch */
447 if (slow(m, sp, rest, ssub, esub) == rest)
448 break; /* it matched all of it */
449 /* that one missed, try next one */
450 assert(m->g->strip[esub] == OOR1);
451 esub++;
452 assert(m->g->strip[esub] == OOR2);
453 ssub = esub + 1;
454 esub += m->g->stripdata[esub];
455 if (m->g->strip[esub] == OOR2)
456 esub--;
457 else
458 assert(m->g->strip[esub] == O_CH);
459 }
460 dp = dissect(m, sp, rest, ssub, esub);
461 assert(dp == rest);
462 sp = rest;
463 break;
464 case O_PLUS:
465 case O_QUEST:
466 case OOR1:
467 case OOR2:
468 case O_CH:
469 assert(nope);
470 break;
471 case OLPAREN:
472 i = m->g->stripdata[ss];
473 assert(0 < i && i <= m->g->nsub);
474 m->pmatch[i].rm_so = sp - m->offp;
475 break;
476 case ORPAREN:
477 i = m->g->stripdata[ss];
478 assert(0 < i && i <= m->g->nsub);
479 m->pmatch[i].rm_eo = sp - m->offp;
480 break;
481 default: /* uh oh */
482 assert(nope);
483 break;
484 }
485 }
486
487 assert(sp == stop);
488 return(sp);
489 }
490
491 /*
492 - backref - figure out what matched what, figuring in back references
493 == static RCHAR_T *backref(struct match *m, RCHAR_T *start, \
494 == RCHAR_T *stop, sopno startst, sopno stopst, sopno lev);
495 */
496 static RCHAR_T * /* == stop (success) or NULL (failure) */
backref(m,start,stop,startst,stopst,lev)497 backref(m, start, stop, startst, stopst, lev)
498 struct match *m;
499 RCHAR_T *start;
500 RCHAR_T *stop;
501 sopno startst;
502 sopno stopst;
503 sopno lev; /* PLUS nesting level */
504 {
505 int i;
506 sopno ss; /* start sop of current subRE */
507 RCHAR_T *sp; /* start of string matched by it */
508 sopno ssub; /* start sop of subsubRE */
509 sopno esub; /* end sop of subsubRE */
510 RCHAR_T *ssp; /* start of string matched by subsubRE */
511 RCHAR_T *dp;
512 size_t len;
513 int hard;
514 sop s;
515 RCHAR_T d;
516 regoff_t offsave;
517 cset *cs;
518
519 AT("back", start, stop, startst, stopst);
520 sp = start;
521
522 /* get as far as we can with easy stuff */
523 hard = 0;
524 for (ss = startst; !hard && ss < stopst; ss++) {
525 s = m->g->strip[ss];
526 d = m->g->stripdata[ss];
527 switch (s) {
528 case OCHAR:
529 if (sp == stop || *sp++ != d)
530 return(NULL);
531 break;
532 case OANY:
533 if (sp == stop)
534 return(NULL);
535 sp++;
536 break;
537 case OANYOF:
538 cs = &m->g->sets[d];
539 if (sp == stop || !CHIN(cs, *sp++))
540 return(NULL);
541 break;
542 case OBOL:
543 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
544 (sp < m->endp && *(sp-1) == '\n' &&
545 (m->g->cflags®_NEWLINE)) )
546 { /* yes */ }
547 else
548 return(NULL);
549 break;
550 case OEOL:
551 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
552 (sp < m->endp && *sp == '\n' &&
553 (m->g->cflags®_NEWLINE)) )
554 { /* yes */ }
555 else
556 return(NULL);
557 break;
558 case OBOW:
559 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
560 (sp < m->endp && *(sp-1) == '\n' &&
561 (m->g->cflags®_NEWLINE)) ||
562 (sp > m->beginp &&
563 !ISWORD(*(sp-1))) ) &&
564 (sp < m->endp && ISWORD(*sp)) )
565 { /* yes */ }
566 else
567 return(NULL);
568 break;
569 case OEOW:
570 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
571 (sp < m->endp && *sp == '\n' &&
572 (m->g->cflags®_NEWLINE)) ||
573 (sp < m->endp && !ISWORD(*sp)) ) &&
574 (sp > m->beginp && ISWORD(*(sp-1))) )
575 { /* yes */ }
576 else
577 return(NULL);
578 break;
579 case O_QUEST:
580 break;
581 case OOR1: /* matches null but needs to skip */
582 ss++;
583 s = m->g->strip[ss];
584 d = m->g->stripdata[ss];
585 do {
586 assert(s == OOR2);
587 ss += d;
588 s = m->g->strip[ss];
589 d = m->g->stripdata[ss];
590 } while (s != O_CH);
591 /* note that the ss++ gets us past the O_CH */
592 break;
593 default: /* have to make a choice */
594 hard = 1;
595 break;
596 }
597 }
598 if (!hard) { /* that was it! */
599 if (sp != stop)
600 return(NULL);
601 return(sp);
602 }
603 ss--; /* adjust for the for's final increment */
604
605 /* the hard stuff */
606 AT("hard", sp, stop, ss, stopst);
607 s = m->g->strip[ss];
608 d = m->g->stripdata[ss];
609 switch (s) {
610 case OBACK_: /* the vilest depths */
611 i = d;
612 assert(0 < i && i <= m->g->nsub);
613 if (m->pmatch[i].rm_eo == -1)
614 return(NULL);
615 assert(m->pmatch[i].rm_so != -1);
616 len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
617 assert(stop - m->beginp >= len);
618 if (sp > stop - len)
619 return(NULL); /* not enough left to match */
620 ssp = m->offp + m->pmatch[i].rm_so;
621 if (memcmp(sp, ssp, len) != 0)
622 return(NULL);
623 while (m->g->strip[ss] != O_BACK || m->g->stripdata[ss] != i)
624 ss++;
625 return(backref(m, sp+len, stop, ss+1, stopst, lev));
626 break;
627 case OQUEST_: /* to null or not */
628 dp = backref(m, sp, stop, ss+1, stopst, lev);
629 if (dp != NULL)
630 return(dp); /* not */
631 return(backref(m, sp, stop, ss+d+1, stopst, lev));
632 break;
633 case OPLUS_:
634 assert(m->lastpos != NULL);
635 assert(lev+1 <= m->g->nplus);
636 m->lastpos[lev+1] = sp;
637 return(backref(m, sp, stop, ss+1, stopst, lev+1));
638 break;
639 case O_PLUS:
640 if (sp == m->lastpos[lev]) /* last pass matched null */
641 return(backref(m, sp, stop, ss+1, stopst, lev-1));
642 /* try another pass */
643 m->lastpos[lev] = sp;
644 dp = backref(m, sp, stop, ss-d+1, stopst, lev);
645 if (dp == NULL)
646 return(backref(m, sp, stop, ss+1, stopst, lev-1));
647 else
648 return(dp);
649 break;
650 case OCH_: /* find the right one, if any */
651 ssub = ss + 1;
652 esub = ss + d - 1;
653 assert(m->g->strip[esub] == OOR1);
654 for (;;) { /* find first matching branch */
655 dp = backref(m, sp, stop, ssub, esub, lev);
656 if (dp != NULL)
657 return(dp);
658 /* that one missed, try next one */
659 if (m->g->strip[esub] == O_CH)
660 return(NULL); /* there is none */
661 esub++;
662 assert(m->g->strip[esub] == OOR2);
663 ssub = esub + 1;
664 esub += m->g->stripdata[esub];
665 if (m->g->strip[esub] == OOR2)
666 esub--;
667 else
668 assert(m->g->strip[esub] == O_CH);
669 }
670 break;
671 case OLPAREN: /* must undo assignment if rest fails */
672 i = d;
673 assert(0 < i && i <= m->g->nsub);
674 offsave = m->pmatch[i].rm_so;
675 m->pmatch[i].rm_so = sp - m->offp;
676 dp = backref(m, sp, stop, ss+1, stopst, lev);
677 if (dp != NULL)
678 return(dp);
679 m->pmatch[i].rm_so = offsave;
680 return(NULL);
681 break;
682 case ORPAREN: /* must undo assignment if rest fails */
683 i = d;
684 assert(0 < i && i <= m->g->nsub);
685 offsave = m->pmatch[i].rm_eo;
686 m->pmatch[i].rm_eo = sp - m->offp;
687 dp = backref(m, sp, stop, ss+1, stopst, lev);
688 if (dp != NULL)
689 return(dp);
690 m->pmatch[i].rm_eo = offsave;
691 return(NULL);
692 break;
693 default: /* uh oh */
694 assert(nope);
695 break;
696 }
697
698 /* "can't happen" */
699 assert(nope);
700 /* NOTREACHED */
701 return NULL;
702 }
703
704 /*
705 - fast - step through the string at top speed
706 == static RCHAR_T *fast(struct match *m, RCHAR_T *start, \
707 == RCHAR_T *stop, sopno startst, sopno stopst);
708 */
709 static RCHAR_T * /* where tentative match ended, or NULL */
fast(m,start,stop,startst,stopst)710 fast(m, start, stop, startst, stopst)
711 struct match *m;
712 RCHAR_T *start;
713 RCHAR_T *stop;
714 sopno startst;
715 sopno stopst;
716 {
717 states st = m->st;
718 states fresh = m->fresh;
719 states tmp = m->tmp;
720 RCHAR_T *p = start;
721 RCHAR_T c = (start == m->beginp) ? OUT : *(start-1);
722 RCHAR_T lastc; /* previous c */
723 int flag;
724 int i;
725 RCHAR_T *coldp; /* last p after which no match was underway */
726
727 CLEAR(st);
728 SET1(st, startst);
729 st = step(m->g, startst, stopst, st, NOTHING, OUT, st);
730 ASSIGN(fresh, st);
731 SP("start", st, *p);
732 coldp = NULL;
733 for (;;) {
734 /* next character */
735 lastc = c;
736 c = (p == m->endp) ? OUT : *p;
737 if (EQ(st, fresh))
738 coldp = p;
739
740 /* is there an EOL and/or BOL between lastc and c? */
741 flag = 0;
742 i = 0;
743 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
744 (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
745 flag = BOL;
746 i = m->g->nbol;
747 }
748 if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
749 (c == OUT && !(m->eflags®_NOTEOL)) ) {
750 flag = (flag == BOL) ? BOLEOL : EOL;
751 i += m->g->neol;
752 }
753 if (i != 0) {
754 for (; i > 0; i--)
755 st = step(m->g, startst, stopst, st, flag, OUT, st);
756 SP("boleol", st, c);
757 }
758
759 /* how about a word boundary? */
760 if ( (flag == BOL || (lastc != OUT && !ISWORD(lastc))) &&
761 (c != OUT && ISWORD(c)) ) {
762 flag = BOW;
763 }
764 if ( (lastc != OUT && ISWORD(lastc)) &&
765 (flag == EOL || (c != OUT && !ISWORD(c))) ) {
766 flag = EOW;
767 }
768 if (flag == BOW || flag == EOW) {
769 st = step(m->g, startst, stopst, st, flag, OUT, st);
770 SP("boweow", st, c);
771 }
772
773 /* are we done? */
774 if (ISSET(st, stopst) || p == stop)
775 break; /* NOTE BREAK OUT */
776
777 /* no, we must deal with this character */
778 ASSIGN(tmp, st);
779 ASSIGN(st, fresh);
780 assert(c != OUT);
781 st = step(m->g, startst, stopst, tmp, 0, c, st);
782 SP("aft", st, c);
783 assert(EQ(step(m->g, startst, stopst, st, NOTHING, OUT, st), st));
784 p++;
785 }
786
787 assert(coldp != NULL);
788 m->coldp = coldp;
789 if (ISSET(st, stopst))
790 return(p+1);
791 else
792 return(NULL);
793 }
794
795 /*
796 - slow - step through the string more deliberately
797 == static RCHAR_T *slow(struct match *m, RCHAR_T *start, \
798 == RCHAR_T *stop, sopno startst, sopno stopst);
799 */
800 static RCHAR_T * /* where it ended */
slow(m,start,stop,startst,stopst)801 slow(m, start, stop, startst, stopst)
802 struct match *m;
803 RCHAR_T *start;
804 RCHAR_T *stop;
805 sopno startst;
806 sopno stopst;
807 {
808 states st = m->st;
809 states empty = m->empty;
810 states tmp = m->tmp;
811 RCHAR_T *p = start;
812 RCHAR_T c = (start == m->beginp) ? OUT : *(start-1);
813 RCHAR_T lastc; /* previous c */
814 int flag;
815 int i;
816 RCHAR_T *matchp; /* last p at which a match ended */
817
818 AT("slow", start, stop, startst, stopst);
819 CLEAR(st);
820 SET1(st, startst);
821 SP("sstart", st, *p);
822 st = step(m->g, startst, stopst, st, NOTHING, OUT, st);
823 matchp = NULL;
824 for (;;) {
825 /* next character */
826 lastc = c;
827 c = (p == m->endp) ? OUT : *p;
828
829 /* is there an EOL and/or BOL between lastc and c? */
830 flag = 0;
831 i = 0;
832 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
833 (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
834 flag = BOL;
835 i = m->g->nbol;
836 }
837 if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
838 (c == OUT && !(m->eflags®_NOTEOL)) ) {
839 flag = (flag == BOL) ? BOLEOL : EOL;
840 i += m->g->neol;
841 }
842 if (i != 0) {
843 for (; i > 0; i--)
844 st = step(m->g, startst, stopst, st, flag, OUT, st);
845 SP("sboleol", st, c);
846 }
847
848 /* how about a word boundary? */
849 if ( (flag == BOL || (lastc != OUT && !ISWORD(lastc))) &&
850 (c != OUT && ISWORD(c)) ) {
851 flag = BOW;
852 }
853 if ( (lastc != OUT && ISWORD(lastc)) &&
854 (flag == EOL || (c != OUT && !ISWORD(c))) ) {
855 flag = EOW;
856 }
857 if (flag == BOW || flag == EOW) {
858 st = step(m->g, startst, stopst, st, flag, OUT, st);
859 SP("sboweow", st, c);
860 }
861
862 /* are we done? */
863 if (ISSET(st, stopst))
864 matchp = p;
865 if (EQ(st, empty) || p == stop)
866 break; /* NOTE BREAK OUT */
867
868 /* no, we must deal with this character */
869 ASSIGN(tmp, st);
870 ASSIGN(st, empty);
871 assert(c != OUT);
872 st = step(m->g, startst, stopst, tmp, 0, c, st);
873 SP("saft", st, c);
874 assert(EQ(step(m->g, startst, stopst, st, NOTHING, OUT, st), st));
875 p++;
876 }
877
878 return(matchp);
879 }
880
881
882 /*
883 - step - map set of states reachable before char to set reachable after
884 == static states step(struct re_guts *g, sopno start, sopno stop, \
885 == states bef, int flag, RCHAR_T ch, states aft);
886 == #define BOL (1)
887 == #define EOL (BOL+1)
888 == #define BOLEOL (BOL+2)
889 == #define NOTHING (BOL+3)
890 == #define BOW (BOL+4)
891 == #define EOW (BOL+5)
892 */
893 static states
step(g,start,stop,bef,flag,ch,aft)894 step(g, start, stop, bef, flag, ch, aft)
895 struct re_guts *g;
896 sopno start; /* start state within strip */
897 sopno stop; /* state after stop state within strip */
898 states bef; /* states reachable before */
899 int flag; /* NONCHAR flag */
900 RCHAR_T ch; /* character code */
901 states aft; /* states already known reachable after */
902 {
903 cset *cs;
904 sop s;
905 RCHAR_T d;
906 sopno pc;
907 onestate here; /* note, macros know this name */
908 sopno look;
909 int i;
910
911 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
912 s = g->strip[pc];
913 d = g->stripdata[pc];
914 switch (s) {
915 case OEND:
916 assert(pc == stop-1);
917 break;
918 case OCHAR:
919 /* only characters can match */
920 assert(!flag || ch != d);
921 if (ch == d)
922 FWD(aft, bef, 1);
923 break;
924 case OBOL:
925 if (flag == BOL || flag == BOLEOL)
926 FWD(aft, bef, 1);
927 break;
928 case OEOL:
929 if (flag == EOL || flag == BOLEOL)
930 FWD(aft, bef, 1);
931 break;
932 case OBOW:
933 if (flag == BOW)
934 FWD(aft, bef, 1);
935 break;
936 case OEOW:
937 if (flag == EOW)
938 FWD(aft, bef, 1);
939 break;
940 case OANY:
941 if (!flag)
942 FWD(aft, bef, 1);
943 break;
944 case OANYOF:
945 cs = &g->sets[d];
946 if (!flag && CHIN(cs, ch))
947 FWD(aft, bef, 1);
948 break;
949 case OBACK_: /* ignored here */
950 case O_BACK:
951 FWD(aft, aft, 1);
952 break;
953 case OPLUS_: /* forward, this is just an empty */
954 FWD(aft, aft, 1);
955 break;
956 case O_PLUS: /* both forward and back */
957 FWD(aft, aft, 1);
958 i = ISSETBACK(aft, d);
959 BACK(aft, aft, d);
960 if (!i && ISSETBACK(aft, d)) {
961 /* oho, must reconsider loop body */
962 pc -= d + 1;
963 INIT(here, pc);
964 }
965 break;
966 case OQUEST_: /* two branches, both forward */
967 FWD(aft, aft, 1);
968 FWD(aft, aft, d);
969 break;
970 case O_QUEST: /* just an empty */
971 FWD(aft, aft, 1);
972 break;
973 case OLPAREN: /* not significant here */
974 case ORPAREN:
975 FWD(aft, aft, 1);
976 break;
977 case OCH_: /* mark the first two branches */
978 FWD(aft, aft, 1);
979 assert(OP(g->strip[pc+d]) == OOR2);
980 FWD(aft, aft, d);
981 break;
982 case OOR1: /* done a branch, find the O_CH */
983 if (ISSTATEIN(aft, here)) {
984 for (look = 1; /**/; look += d) {
985 s = g->strip[pc+look];
986 d = g->stripdata[pc+look];
987 if (s == O_CH)
988 break;
989 assert(s == OOR2);
990 }
991 FWD(aft, aft, look);
992 }
993 break;
994 case OOR2: /* propagate OCH_'s marking */
995 FWD(aft, aft, 1);
996 if (g->strip[pc+d] != O_CH) {
997 assert(g->strip[pc+d] == OOR2);
998 FWD(aft, aft, d);
999 }
1000 break;
1001 case O_CH: /* just empty */
1002 FWD(aft, aft, 1);
1003 break;
1004 default: /* ooooops... */
1005 assert(nope);
1006 break;
1007 }
1008 }
1009
1010 return(aft);
1011 }
1012
1013 #ifdef REDEBUG
1014 /*
1015 - print - print a set of states
1016 == #ifdef REDEBUG
1017 == static void print(struct match *m, char *caption, states st, \
1018 == int ch, FILE *d);
1019 == #endif
1020 */
1021 static void
print(m,caption,st,ch,d)1022 print(m, caption, st, ch, d)
1023 struct match *m;
1024 char *caption;
1025 states st;
1026 int ch;
1027 FILE *d;
1028 {
1029 struct re_guts *g = m->g;
1030 int i;
1031 int first = 1;
1032
1033 if (!(m->eflags®_TRACE))
1034 return;
1035
1036 fprintf(d, "%s", caption);
1037 if (ch != '\0')
1038 fprintf(d, " %s", pchar(ch));
1039 for (i = 0; i < g->nstates; i++)
1040 if (ISSET(st, i)) {
1041 fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
1042 first = 0;
1043 }
1044 fprintf(d, "\n");
1045 }
1046
1047 /*
1048 - at - print current situation
1049 == #ifdef REDEBUG
1050 == static void at(struct match *m, char *title, char *start, char *stop, \
1051 == sopno startst, sopno stopst);
1052 == #endif
1053 */
1054 static void
at(m,title,start,stop,startst,stopst)1055 at(m, title, start, stop, startst, stopst)
1056 struct match *m;
1057 char *title;
1058 char *start;
1059 char *stop;
1060 sopno startst;
1061 sopno stopst;
1062 {
1063 if (!(m->eflags®_TRACE))
1064 return;
1065
1066 printf("%s %s-", title, pchar(*start));
1067 printf("%s ", pchar(*stop));
1068 printf("%ld-%ld\n", (long)startst, (long)stopst);
1069 }
1070
1071 #ifndef PCHARDONE
1072 #define PCHARDONE /* never again */
1073 /*
1074 - pchar - make a character printable
1075 == #ifdef REDEBUG
1076 == static char *pchar(int ch);
1077 == #endif
1078 *
1079 * Is this identical to regchar() over in debug.c? Well, yes. But a
1080 * duplicate here avoids having a debugging-capable regexec.o tied to
1081 * a matching debug.o, and this is convenient. It all disappears in
1082 * the non-debug compilation anyway, so it doesn't matter much.
1083 */
1084 static char * /* -> representation */
pchar(ch)1085 pchar(ch)
1086 int ch;
1087 {
1088 static char pbuf[10];
1089
1090 if (isprint(ch) || ch == ' ')
1091 sprintf(pbuf, "%c", ch);
1092 else
1093 sprintf(pbuf, "\\%o", ch);
1094 return(pbuf);
1095 }
1096 #endif
1097 #endif
1098
1099 #undef matcher
1100 #undef fast
1101 #undef slow
1102 #undef dissect
1103 #undef backref
1104 #undef step
1105 #undef print
1106 #undef at
1107 #undef match
1108