1 |
/**************************************************************** |
2 |
Copyright (C) Lucent Technologies 1997 |
3 |
All Rights Reserved |
4 |
|
5 |
Permission to use, copy, modify, and distribute this software and |
6 |
its documentation for any purpose and without fee is hereby |
7 |
granted, provided that the above copyright notice appear in all |
8 |
copies and that both that the copyright notice and this |
9 |
permission notice and warranty disclaimer appear in supporting |
10 |
documentation, and that the name Lucent Technologies or any of |
11 |
its entities not be used in advertising or publicity pertaining |
12 |
to distribution of the software without specific, written prior |
13 |
permission. |
14 |
|
15 |
LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, |
16 |
INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. |
17 |
IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY |
18 |
SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
19 |
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER |
20 |
IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, |
21 |
ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF |
22 |
THIS SOFTWARE. |
23 |
****************************************************************/ |
24 |
|
25 |
/* lasciate ogne speranza, voi ch'intrate. */ |
26 |
|
27 |
#include <sys/cdefs.h> |
28 |
__FBSDID("$FreeBSD: stable/10/contrib/one-true-awk/b.c 315582 2017-03-19 20:04:57Z pfg $"); |
29 |
|
30 |
#define DEBUG |
31 |
|
32 |
#include <ctype.h> |
33 |
#include <stdio.h> |
34 |
#include <string.h> |
35 |
#include <stdlib.h> |
36 |
#include "awk.h" |
37 |
#include "ytab.h" |
38 |
|
39 |
#define HAT (NCHARS+2) /* matches ^ in regular expr */ |
40 |
/* NCHARS is 2**n */ |
41 |
#define MAXLIN 22 |
42 |
|
43 |
#define type(v) (v)->nobj /* badly overloaded here */ |
44 |
#define info(v) (v)->ntype /* badly overloaded here */ |
45 |
#define left(v) (v)->narg[0] |
46 |
#define right(v) (v)->narg[1] |
47 |
#define parent(v) (v)->nnext |
48 |
|
49 |
#define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL: |
50 |
#define ELEAF case EMPTYRE: /* empty string in regexp */ |
51 |
#define UNARY case STAR: case PLUS: case QUEST: |
52 |
|
53 |
/* encoding in tree Nodes: |
54 |
leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE): |
55 |
left is index, right contains value or pointer to value |
56 |
unary (STAR, PLUS, QUEST): left is child, right is null |
57 |
binary (CAT, OR): left and right are children |
58 |
parent contains pointer to parent |
59 |
*/ |
60 |
|
61 |
|
62 |
int *setvec; |
63 |
int *tmpset; |
64 |
int maxsetvec = 0; |
65 |
|
66 |
int rtok; /* next token in current re */ |
67 |
int rlxval; |
68 |
static uschar *rlxstr; |
69 |
static uschar *prestr; /* current position in current re */ |
70 |
static uschar *lastre; /* origin of last re */ |
71 |
|
72 |
static int setcnt; |
73 |
static int poscnt; |
74 |
|
75 |
char *patbeg; |
76 |
int patlen; |
77 |
|
78 |
#define NFA 20 /* cache this many dynamic fa's */ |
79 |
fa *fatab[NFA]; |
80 |
int nfatab = 0; /* entries in fatab */ |
81 |
|
82 |
fa *makedfa(const char *s, int anchor) /* returns dfa for reg expr s */ |
83 |
{ |
84 |
int i, use, nuse; |
85 |
fa *pfa; |
86 |
static int now = 1; |
87 |
|
88 |
if (setvec == 0) { /* first time through any RE */ |
89 |
maxsetvec = MAXLIN; |
90 |
setvec = (int *) malloc(maxsetvec * sizeof(int)); |
91 |
tmpset = (int *) malloc(maxsetvec * sizeof(int)); |
92 |
if (setvec == 0 || tmpset == 0) |
93 |
overflo("out of space initializing makedfa"); |
94 |
} |
95 |
|
96 |
if (compile_time) /* a constant for sure */ |
97 |
return mkdfa(s, anchor); |
98 |
for (i = 0; i < nfatab; i++) /* is it there already? */ |
99 |
if (fatab[i]->anchor == anchor |
100 |
&& strcmp((const char *) fatab[i]->restr, s) == 0) { |
101 |
fatab[i]->use = now++; |
102 |
return fatab[i]; |
103 |
} |
104 |
pfa = mkdfa(s, anchor); |
105 |
if (nfatab < NFA) { /* room for another */ |
106 |
fatab[nfatab] = pfa; |
107 |
fatab[nfatab]->use = now++; |
108 |
nfatab++; |
109 |
return pfa; |
110 |
} |
111 |
use = fatab[0]->use; /* replace least-recently used */ |
112 |
nuse = 0; |
113 |
for (i = 1; i < nfatab; i++) |
114 |
if (fatab[i]->use < use) { |
115 |
use = fatab[i]->use; |
116 |
nuse = i; |
117 |
} |
118 |
freefa(fatab[nuse]); |
119 |
fatab[nuse] = pfa; |
120 |
pfa->use = now++; |
121 |
return pfa; |
122 |
} |
123 |
|
124 |
fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */ |
125 |
/* anchor = 1 for anchored matches, else 0 */ |
126 |
{ |
127 |
Node *p, *p1; |
128 |
fa *f; |
129 |
|
130 |
p = reparse(s); |
131 |
p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); |
132 |
/* put ALL STAR in front of reg. exp. */ |
133 |
p1 = op2(CAT, p1, op2(FINAL, NIL, NIL)); |
134 |
/* put FINAL after reg. exp. */ |
135 |
|
136 |
poscnt = 0; |
137 |
penter(p1); /* enter parent pointers and leaf indices */ |
138 |
if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL) |
139 |
overflo("out of space for fa"); |
140 |
f->accept = poscnt-1; /* penter has computed number of positions in re */ |
141 |
cfoll(f, p1); /* set up follow sets */ |
142 |
freetr(p1); |
143 |
if ((f->posns[0] = (int *) calloc(*(f->re[0].lfollow), sizeof(int))) == NULL) |
144 |
overflo("out of space in makedfa"); |
145 |
if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL) |
146 |
overflo("out of space in makedfa"); |
147 |
*f->posns[1] = 0; |
148 |
f->initstat = makeinit(f, anchor); |
149 |
f->anchor = anchor; |
150 |
f->restr = (uschar *) tostring(s); |
151 |
return f; |
152 |
} |
153 |
|
154 |
int makeinit(fa *f, int anchor) |
155 |
{ |
156 |
int i, k; |
157 |
|
158 |
f->curstat = 2; |
159 |
f->out[2] = 0; |
160 |
f->reset = 0; |
161 |
k = *(f->re[0].lfollow); |
162 |
xfree(f->posns[2]); |
163 |
if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL) |
164 |
overflo("out of space in makeinit"); |
165 |
for (i=0; i <= k; i++) { |
166 |
(f->posns[2])[i] = (f->re[0].lfollow)[i]; |
167 |
} |
168 |
if ((f->posns[2])[1] == f->accept) |
169 |
f->out[2] = 1; |
170 |
for (i=0; i < NCHARS; i++) |
171 |
f->gototab[2][i] = 0; |
172 |
f->curstat = cgoto(f, 2, HAT); |
173 |
if (anchor) { |
174 |
*f->posns[2] = k-1; /* leave out position 0 */ |
175 |
for (i=0; i < k; i++) { |
176 |
(f->posns[0])[i] = (f->posns[2])[i]; |
177 |
} |
178 |
|
179 |
f->out[0] = f->out[2]; |
180 |
if (f->curstat != 2) |
181 |
--(*f->posns[f->curstat]); |
182 |
} |
183 |
return f->curstat; |
184 |
} |
185 |
|
186 |
void penter(Node *p) /* set up parent pointers and leaf indices */ |
187 |
{ |
188 |
switch (type(p)) { |
189 |
ELEAF |
190 |
LEAF |
191 |
info(p) = poscnt; |
192 |
poscnt++; |
193 |
break; |
194 |
UNARY |
195 |
penter(left(p)); |
196 |
parent(left(p)) = p; |
197 |
break; |
198 |
case CAT: |
199 |
case OR: |
200 |
penter(left(p)); |
201 |
penter(right(p)); |
202 |
parent(left(p)) = p; |
203 |
parent(right(p)) = p; |
204 |
break; |
205 |
default: /* can't happen */ |
206 |
FATAL("can't happen: unknown type %d in penter", type(p)); |
207 |
break; |
208 |
} |
209 |
} |
210 |
|
211 |
void freetr(Node *p) /* free parse tree */ |
212 |
{ |
213 |
switch (type(p)) { |
214 |
ELEAF |
215 |
LEAF |
216 |
xfree(p); |
217 |
break; |
218 |
UNARY |
219 |
freetr(left(p)); |
220 |
xfree(p); |
221 |
break; |
222 |
case CAT: |
223 |
case OR: |
224 |
freetr(left(p)); |
225 |
freetr(right(p)); |
226 |
xfree(p); |
227 |
break; |
228 |
default: /* can't happen */ |
229 |
FATAL("can't happen: unknown type %d in freetr", type(p)); |
230 |
break; |
231 |
} |
232 |
} |
233 |
|
234 |
/* in the parsing of regular expressions, metacharacters like . have */ |
235 |
/* to be seen literally; \056 is not a metacharacter. */ |
236 |
|
237 |
int hexstr(uschar **pp) /* find and eval hex string at pp, return new p */ |
238 |
{ /* only pick up one 8-bit byte (2 chars) */ |
239 |
uschar *p; |
240 |
int n = 0; |
241 |
int i; |
242 |
|
243 |
for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) { |
244 |
if (isdigit(*p)) |
245 |
n = 16 * n + *p - '0'; |
246 |
else if (*p >= 'a' && *p <= 'f') |
247 |
n = 16 * n + *p - 'a' + 10; |
248 |
else if (*p >= 'A' && *p <= 'F') |
249 |
n = 16 * n + *p - 'A' + 10; |
250 |
} |
251 |
*pp = (uschar *) p; |
252 |
return n; |
253 |
} |
254 |
|
255 |
#define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */ |
256 |
|
257 |
int quoted(uschar **pp) /* pick up next thing after a \\ */ |
258 |
/* and increment *pp */ |
259 |
{ |
260 |
uschar *p = *pp; |
261 |
int c; |
262 |
|
263 |
if ((c = *p++) == 't') |
264 |
c = '\t'; |
265 |
else if (c == 'n') |
266 |
c = '\n'; |
267 |
else if (c == 'f') |
268 |
c = '\f'; |
269 |
else if (c == 'r') |
270 |
c = '\r'; |
271 |
else if (c == 'b') |
272 |
c = '\b'; |
273 |
else if (c == '\\') |
274 |
c = '\\'; |
275 |
else if (c == 'x') { /* hexadecimal goo follows */ |
276 |
c = hexstr(&p); /* this adds a null if number is invalid */ |
277 |
} else if (isoctdigit(c)) { /* \d \dd \ddd */ |
278 |
int n = c - '0'; |
279 |
if (isoctdigit(*p)) { |
280 |
n = 8 * n + *p++ - '0'; |
281 |
if (isoctdigit(*p)) |
282 |
n = 8 * n + *p++ - '0'; |
283 |
} |
284 |
c = n; |
285 |
} /* else */ |
286 |
/* c = c; */ |
287 |
*pp = p; |
288 |
return c; |
289 |
} |
290 |
|
291 |
static int collate_range_cmp(int a, int b) |
292 |
{ |
293 |
static char s[2][2]; |
294 |
|
295 |
if ((uschar)a == (uschar)b) |
296 |
return 0; |
297 |
s[0][0] = a; |
298 |
s[1][0] = b; |
299 |
return (strcoll(s[0], s[1])); |
300 |
} |
301 |
|
302 |
char *cclenter(const char *argp) /* add a character class */ |
303 |
{ |
304 |
int i, c, c2; |
305 |
int j; |
306 |
uschar *p = (uschar *) argp; |
307 |
uschar *op, *bp; |
308 |
static uschar *buf = 0; |
309 |
static int bufsz = 100; |
310 |
|
311 |
op = p; |
312 |
if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL) |
313 |
FATAL("out of space for character class [%.10s...] 1", p); |
314 |
bp = buf; |
315 |
for (i = 0; (c = *p++) != 0; ) { |
316 |
if (c == '\\') { |
317 |
c = quoted(&p); |
318 |
} else if (c == '-' && i > 0 && bp[-1] != 0) { |
319 |
if (*p != 0) { |
320 |
c = bp[-1]; |
321 |
c2 = *p++; |
322 |
if (c2 == '\\') |
323 |
c2 = quoted(&p); |
324 |
if (collate_range_cmp(c, c2) > 0) { |
325 |
bp--; |
326 |
i--; |
327 |
continue; |
328 |
} |
329 |
for (j = 0; j < NCHARS; j++) { |
330 |
if ((collate_range_cmp(c, j) > 0) || |
331 |
collate_range_cmp(j, c2) > 0) |
332 |
continue; |
333 |
if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1")) |
334 |
FATAL("out of space for character class [%.10s...] 2", p); |
335 |
*bp++ = j; |
336 |
i++; |
337 |
} |
338 |
continue; |
339 |
} |
340 |
} |
341 |
if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2")) |
342 |
FATAL("out of space for character class [%.10s...] 3", p); |
343 |
*bp++ = c; |
344 |
i++; |
345 |
} |
346 |
*bp = 0; |
347 |
dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) ); |
348 |
xfree(op); |
349 |
return (char *) tostring((char *) buf); |
350 |
} |
351 |
|
352 |
void overflo(const char *s) |
353 |
{ |
354 |
FATAL("regular expression too big: %.30s...", s); |
355 |
} |
356 |
|
357 |
void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */ |
358 |
{ |
359 |
int i; |
360 |
int *p; |
361 |
|
362 |
switch (type(v)) { |
363 |
ELEAF |
364 |
LEAF |
365 |
f->re[info(v)].ltype = type(v); |
366 |
f->re[info(v)].lval.np = right(v); |
367 |
while (f->accept >= maxsetvec) { /* guessing here! */ |
368 |
maxsetvec *= 4; |
369 |
setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); |
370 |
tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); |
371 |
if (setvec == 0 || tmpset == 0) |
372 |
overflo("out of space in cfoll()"); |
373 |
} |
374 |
for (i = 0; i <= f->accept; i++) |
375 |
setvec[i] = 0; |
376 |
setcnt = 0; |
377 |
follow(v); /* computes setvec and setcnt */ |
378 |
if ((p = (int *) calloc(setcnt+1, sizeof(int))) == NULL) |
379 |
overflo("out of space building follow set"); |
380 |
f->re[info(v)].lfollow = p; |
381 |
*p = setcnt; |
382 |
for (i = f->accept; i >= 0; i--) |
383 |
if (setvec[i] == 1) |
384 |
*++p = i; |
385 |
break; |
386 |
UNARY |
387 |
cfoll(f,left(v)); |
388 |
break; |
389 |
case CAT: |
390 |
case OR: |
391 |
cfoll(f,left(v)); |
392 |
cfoll(f,right(v)); |
393 |
break; |
394 |
default: /* can't happen */ |
395 |
FATAL("can't happen: unknown type %d in cfoll", type(v)); |
396 |
} |
397 |
} |
398 |
|
399 |
int first(Node *p) /* collects initially active leaves of p into setvec */ |
400 |
/* returns 0 if p matches empty string */ |
401 |
{ |
402 |
int b, lp; |
403 |
|
404 |
switch (type(p)) { |
405 |
ELEAF |
406 |
LEAF |
407 |
lp = info(p); /* look for high-water mark of subscripts */ |
408 |
while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */ |
409 |
maxsetvec *= 4; |
410 |
setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); |
411 |
tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); |
412 |
if (setvec == 0 || tmpset == 0) |
413 |
overflo("out of space in first()"); |
414 |
} |
415 |
if (type(p) == EMPTYRE) { |
416 |
setvec[lp] = 0; |
417 |
return(0); |
418 |
} |
419 |
if (setvec[lp] != 1) { |
420 |
setvec[lp] = 1; |
421 |
setcnt++; |
422 |
} |
423 |
if (type(p) == CCL && (*(char *) right(p)) == '\0') |
424 |
return(0); /* empty CCL */ |
425 |
else return(1); |
426 |
case PLUS: |
427 |
if (first(left(p)) == 0) return(0); |
428 |
return(1); |
429 |
case STAR: |
430 |
case QUEST: |
431 |
first(left(p)); |
432 |
return(0); |
433 |
case CAT: |
434 |
if (first(left(p)) == 0 && first(right(p)) == 0) return(0); |
435 |
return(1); |
436 |
case OR: |
437 |
b = first(right(p)); |
438 |
if (first(left(p)) == 0 || b == 0) return(0); |
439 |
return(1); |
440 |
} |
441 |
FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */ |
442 |
return(-1); |
443 |
} |
444 |
|
445 |
void follow(Node *v) /* collects leaves that can follow v into setvec */ |
446 |
{ |
447 |
Node *p; |
448 |
|
449 |
if (type(v) == FINAL) |
450 |
return; |
451 |
p = parent(v); |
452 |
switch (type(p)) { |
453 |
case STAR: |
454 |
case PLUS: |
455 |
first(v); |
456 |
follow(p); |
457 |
return; |
458 |
|
459 |
case OR: |
460 |
case QUEST: |
461 |
follow(p); |
462 |
return; |
463 |
|
464 |
case CAT: |
465 |
if (v == left(p)) { /* v is left child of p */ |
466 |
if (first(right(p)) == 0) { |
467 |
follow(p); |
468 |
return; |
469 |
} |
470 |
} else /* v is right child */ |
471 |
follow(p); |
472 |
return; |
473 |
} |
474 |
} |
475 |
|
476 |
int member(int c, const char *sarg) /* is c in s? */ |
477 |
{ |
478 |
uschar *s = (uschar *) sarg; |
479 |
|
480 |
while (*s) |
481 |
if (c == *s++) |
482 |
return(1); |
483 |
return(0); |
484 |
} |
485 |
|
486 |
int match(fa *f, const char *p0) /* shortest match ? */ |
487 |
{ |
488 |
int s, ns; |
489 |
uschar *p = (uschar *) p0; |
490 |
|
491 |
s = f->reset ? makeinit(f,0) : f->initstat; |
492 |
if (f->out[s]) |
493 |
return(1); |
494 |
do { |
495 |
/* assert(*p < NCHARS); */ |
496 |
if ((ns = f->gototab[s][*p]) != 0) |
497 |
s = ns; |
498 |
else |
499 |
s = cgoto(f, s, *p); |
500 |
if (f->out[s]) |
501 |
return(1); |
502 |
} while (*p++ != 0); |
503 |
return(0); |
504 |
} |
505 |
|
506 |
int pmatch(fa *f, const char *p0) /* longest match, for sub */ |
507 |
{ |
508 |
int s, ns; |
509 |
uschar *p = (uschar *) p0; |
510 |
uschar *q; |
511 |
int i, k; |
512 |
|
513 |
/* s = f->reset ? makeinit(f,1) : f->initstat; */ |
514 |
if (f->reset) { |
515 |
f->initstat = s = makeinit(f,1); |
516 |
} else { |
517 |
s = f->initstat; |
518 |
} |
519 |
patbeg = (char *) p; |
520 |
patlen = -1; |
521 |
do { |
522 |
q = p; |
523 |
do { |
524 |
if (f->out[s]) /* final state */ |
525 |
patlen = q-p; |
526 |
/* assert(*q < NCHARS); */ |
527 |
if ((ns = f->gototab[s][*q]) != 0) |
528 |
s = ns; |
529 |
else |
530 |
s = cgoto(f, s, *q); |
531 |
if (s == 1) { /* no transition */ |
532 |
if (patlen >= 0) { |
533 |
patbeg = (char *) p; |
534 |
return(1); |
535 |
} |
536 |
else |
537 |
goto nextin; /* no match */ |
538 |
} |
539 |
} while (*q++ != 0); |
540 |
if (f->out[s]) |
541 |
patlen = q-p-1; /* don't count $ */ |
542 |
if (patlen >= 0) { |
543 |
patbeg = (char *) p; |
544 |
return(1); |
545 |
} |
546 |
nextin: |
547 |
s = 2; |
548 |
if (f->reset) { |
549 |
for (i = 2; i <= f->curstat; i++) |
550 |
xfree(f->posns[i]); |
551 |
k = *f->posns[0]; |
552 |
if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL) |
553 |
overflo("out of space in pmatch"); |
554 |
for (i = 0; i <= k; i++) |
555 |
(f->posns[2])[i] = (f->posns[0])[i]; |
556 |
f->initstat = f->curstat = 2; |
557 |
f->out[2] = f->out[0]; |
558 |
for (i = 0; i < NCHARS; i++) |
559 |
f->gototab[2][i] = 0; |
560 |
} |
561 |
} while (*p++ != 0); |
562 |
return (0); |
563 |
} |
564 |
|
565 |
int nematch(fa *f, const char *p0) /* non-empty match, for sub */ |
566 |
{ |
567 |
int s, ns; |
568 |
uschar *p = (uschar *) p0; |
569 |
uschar *q; |
570 |
int i, k; |
571 |
|
572 |
/* s = f->reset ? makeinit(f,1) : f->initstat; */ |
573 |
if (f->reset) { |
574 |
f->initstat = s = makeinit(f,1); |
575 |
} else { |
576 |
s = f->initstat; |
577 |
} |
578 |
patlen = -1; |
579 |
while (*p) { |
580 |
q = p; |
581 |
do { |
582 |
if (f->out[s]) /* final state */ |
583 |
patlen = q-p; |
584 |
/* assert(*q < NCHARS); */ |
585 |
if ((ns = f->gototab[s][*q]) != 0) |
586 |
s = ns; |
587 |
else |
588 |
s = cgoto(f, s, *q); |
589 |
if (s == 1) { /* no transition */ |
590 |
if (patlen > 0) { |
591 |
patbeg = (char *) p; |
592 |
return(1); |
593 |
} else |
594 |
goto nnextin; /* no nonempty match */ |
595 |
} |
596 |
} while (*q++ != 0); |
597 |
if (f->out[s]) |
598 |
patlen = q-p-1; /* don't count $ */ |
599 |
if (patlen > 0 ) { |
600 |
patbeg = (char *) p; |
601 |
return(1); |
602 |
} |
603 |
nnextin: |
604 |
s = 2; |
605 |
if (f->reset) { |
606 |
for (i = 2; i <= f->curstat; i++) |
607 |
xfree(f->posns[i]); |
608 |
k = *f->posns[0]; |
609 |
if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL) |
610 |
overflo("out of state space"); |
611 |
for (i = 0; i <= k; i++) |
612 |
(f->posns[2])[i] = (f->posns[0])[i]; |
613 |
f->initstat = f->curstat = 2; |
614 |
f->out[2] = f->out[0]; |
615 |
for (i = 0; i < NCHARS; i++) |
616 |
f->gototab[2][i] = 0; |
617 |
} |
618 |
p++; |
619 |
} |
620 |
return (0); |
621 |
} |
622 |
|
623 |
Node *reparse(const char *p) /* parses regular expression pointed to by p */ |
624 |
{ /* uses relex() to scan regular expression */ |
625 |
Node *np; |
626 |
|
627 |
dprintf( ("reparse <%s>\n", p) ); |
628 |
lastre = prestr = (uschar *) p; /* prestr points to string to be parsed */ |
629 |
rtok = relex(); |
630 |
/* GNU compatibility: an empty regexp matches anything */ |
631 |
if (rtok == '\0') { |
632 |
/* FATAL("empty regular expression"); previous */ |
633 |
return(op2(EMPTYRE, NIL, NIL)); |
634 |
} |
635 |
np = regexp(); |
636 |
if (rtok != '\0') |
637 |
FATAL("syntax error in regular expression %s at %s", lastre, prestr); |
638 |
return(np); |
639 |
} |
640 |
|
641 |
Node *regexp(void) /* top-level parse of reg expr */ |
642 |
{ |
643 |
return (alt(concat(primary()))); |
644 |
} |
645 |
|
646 |
Node *primary(void) |
647 |
{ |
648 |
Node *np; |
649 |
|
650 |
switch (rtok) { |
651 |
case CHAR: |
652 |
np = op2(CHAR, NIL, itonp(rlxval)); |
653 |
rtok = relex(); |
654 |
return (unary(np)); |
655 |
case ALL: |
656 |
rtok = relex(); |
657 |
return (unary(op2(ALL, NIL, NIL))); |
658 |
case EMPTYRE: |
659 |
rtok = relex(); |
660 |
return (unary(op2(ALL, NIL, NIL))); |
661 |
case DOT: |
662 |
rtok = relex(); |
663 |
return (unary(op2(DOT, NIL, NIL))); |
664 |
case CCL: |
665 |
np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr)); |
666 |
rtok = relex(); |
667 |
return (unary(np)); |
668 |
case NCCL: |
669 |
np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr)); |
670 |
rtok = relex(); |
671 |
return (unary(np)); |
672 |
case '^': |
673 |
rtok = relex(); |
674 |
return (unary(op2(CHAR, NIL, itonp(HAT)))); |
675 |
case '$': |
676 |
rtok = relex(); |
677 |
return (unary(op2(CHAR, NIL, NIL))); |
678 |
case '(': |
679 |
rtok = relex(); |
680 |
if (rtok == ')') { /* special pleading for () */ |
681 |
rtok = relex(); |
682 |
return unary(op2(CCL, NIL, (Node *) tostring(""))); |
683 |
} |
684 |
np = regexp(); |
685 |
if (rtok == ')') { |
686 |
rtok = relex(); |
687 |
return (unary(np)); |
688 |
} |
689 |
else |
690 |
FATAL("syntax error in regular expression %s at %s", lastre, prestr); |
691 |
default: |
692 |
FATAL("illegal primary in regular expression %s at %s", lastre, prestr); |
693 |
} |
694 |
return 0; /*NOTREACHED*/ |
695 |
} |
696 |
|
697 |
Node *concat(Node *np) |
698 |
{ |
699 |
switch (rtok) { |
700 |
case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(': |
701 |
return (concat(op2(CAT, np, primary()))); |
702 |
} |
703 |
return (np); |
704 |
} |
705 |
|
706 |
Node *alt(Node *np) |
707 |
{ |
708 |
if (rtok == OR) { |
709 |
rtok = relex(); |
710 |
return (alt(op2(OR, np, concat(primary())))); |
711 |
} |
712 |
return (np); |
713 |
} |
714 |
|
715 |
Node *unary(Node *np) |
716 |
{ |
717 |
switch (rtok) { |
718 |
case STAR: |
719 |
rtok = relex(); |
720 |
return (unary(op2(STAR, np, NIL))); |
721 |
case PLUS: |
722 |
rtok = relex(); |
723 |
return (unary(op2(PLUS, np, NIL))); |
724 |
case QUEST: |
725 |
rtok = relex(); |
726 |
return (unary(op2(QUEST, np, NIL))); |
727 |
default: |
728 |
return (np); |
729 |
} |
730 |
} |
731 |
|
732 |
/* |
733 |
* Character class definitions conformant to the POSIX locale as |
734 |
* defined in IEEE P1003.1 draft 7 of June 2001, assuming the source |
735 |
* and operating character sets are both ASCII (ISO646) or supersets |
736 |
* thereof. |
737 |
* |
738 |
* Note that to avoid overflowing the temporary buffer used in |
739 |
* relex(), the expanded character class (prior to range expansion) |
740 |
* must be less than twice the size of their full name. |
741 |
*/ |
742 |
|
743 |
/* Because isblank doesn't show up in any of the header files on any |
744 |
* system i use, it's defined here. if some other locale has a richer |
745 |
* definition of "blank", define HAS_ISBLANK and provide your own |
746 |
* version. |
747 |
* the parentheses here are an attempt to find a path through the maze |
748 |
* of macro definition and/or function and/or version provided. thanks |
749 |
* to nelson beebe for the suggestion; let's see if it works everywhere. |
750 |
*/ |
751 |
|
752 |
/* #define HAS_ISBLANK */ |
753 |
#ifndef HAS_ISBLANK |
754 |
|
755 |
int (xisblank)(int c) |
756 |
{ |
757 |
return c==' ' || c=='\t'; |
758 |
} |
759 |
|
760 |
#endif |
761 |
|
762 |
struct charclass { |
763 |
const char *cc_name; |
764 |
int cc_namelen; |
765 |
int (*cc_func)(int); |
766 |
} charclasses[] = { |
767 |
{ "alnum", 5, isalnum }, |
768 |
{ "alpha", 5, isalpha }, |
769 |
#ifndef HAS_ISBLANK |
770 |
{ "blank", 5, isspace }, /* was isblank */ |
771 |
#else |
772 |
{ "blank", 5, isblank }, |
773 |
#endif |
774 |
{ "cntrl", 5, iscntrl }, |
775 |
{ "digit", 5, isdigit }, |
776 |
{ "graph", 5, isgraph }, |
777 |
{ "lower", 5, islower }, |
778 |
{ "print", 5, isprint }, |
779 |
{ "punct", 5, ispunct }, |
780 |
{ "space", 5, isspace }, |
781 |
{ "upper", 5, isupper }, |
782 |
{ "xdigit", 6, isxdigit }, |
783 |
{ NULL, 0, NULL }, |
784 |
}; |
785 |
|
786 |
|
787 |
int relex(void) /* lexical analyzer for reparse */ |
788 |
{ |
789 |
int c, n; |
790 |
int cflag; |
791 |
static uschar *buf = 0; |
792 |
static int bufsz = 100; |
793 |
uschar *bp; |
794 |
struct charclass *cc; |
795 |
int i; |
796 |
|
797 |
switch (c = *prestr++) { |
798 |
case '|': return OR; |
799 |
case '*': return STAR; |
800 |
case '+': return PLUS; |
801 |
case '?': return QUEST; |
802 |
case '.': return DOT; |
803 |
case '\0': prestr--; return '\0'; |
804 |
case '^': |
805 |
case '$': |
806 |
case '(': |
807 |
case ')': |
808 |
return c; |
809 |
case '\\': |
810 |
rlxval = quoted(&prestr); |
811 |
return CHAR; |
812 |
default: |
813 |
rlxval = c; |
814 |
return CHAR; |
815 |
case '[': |
816 |
if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL) |
817 |
FATAL("out of space in reg expr %.10s..", lastre); |
818 |
bp = buf; |
819 |
if (*prestr == '^') { |
820 |
cflag = 1; |
821 |
prestr++; |
822 |
} |
823 |
else |
824 |
cflag = 0; |
825 |
n = 2 * strlen((const char *) prestr)+1; |
826 |
if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) |
827 |
FATAL("out of space for reg expr %.10s...", lastre); |
828 |
for (; ; ) { |
829 |
if ((c = *prestr++) == '\\') { |
830 |
*bp++ = '\\'; |
831 |
if ((c = *prestr++) == '\0') |
832 |
FATAL("nonterminated character class %.20s...", lastre); |
833 |
*bp++ = c; |
834 |
/* } else if (c == '\n') { */ |
835 |
/* FATAL("newline in character class %.20s...", lastre); */ |
836 |
} else if (c == '[' && *prestr == ':') { |
837 |
/* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */ |
838 |
for (cc = charclasses; cc->cc_name; cc++) |
839 |
if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0) |
840 |
break; |
841 |
if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' && |
842 |
prestr[2 + cc->cc_namelen] == ']') { |
843 |
prestr += cc->cc_namelen + 3; |
844 |
for (i = 1; i < NCHARS; i++) { |
845 |
if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2")) |
846 |
FATAL("out of space for reg expr %.10s...", lastre); |
847 |
if (cc->cc_func(i)) { |
848 |
*bp++ = i; |
849 |
n++; |
850 |
} |
851 |
} |
852 |
} else |
853 |
*bp++ = c; |
854 |
} else if (c == '\0') { |
855 |
FATAL("nonterminated character class %.20s", lastre); |
856 |
} else if (bp == buf) { /* 1st char is special */ |
857 |
*bp++ = c; |
858 |
} else if (c == ']') { |
859 |
*bp++ = 0; |
860 |
rlxstr = (uschar *) tostring((char *) buf); |
861 |
if (cflag == 0) |
862 |
return CCL; |
863 |
else |
864 |
return NCCL; |
865 |
} else |
866 |
*bp++ = c; |
867 |
} |
868 |
} |
869 |
} |
870 |
|
871 |
int cgoto(fa *f, int s, int c) |
872 |
{ |
873 |
int i, j, k; |
874 |
int *p, *q; |
875 |
|
876 |
assert(c == HAT || c < NCHARS); |
877 |
while (f->accept >= maxsetvec) { /* guessing here! */ |
878 |
maxsetvec *= 4; |
879 |
setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); |
880 |
tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); |
881 |
if (setvec == 0 || tmpset == 0) |
882 |
overflo("out of space in cgoto()"); |
883 |
} |
884 |
for (i = 0; i <= f->accept; i++) |
885 |
setvec[i] = 0; |
886 |
setcnt = 0; |
887 |
/* compute positions of gototab[s,c] into setvec */ |
888 |
p = f->posns[s]; |
889 |
for (i = 1; i <= *p; i++) { |
890 |
if ((k = f->re[p[i]].ltype) != FINAL) { |
891 |
if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np)) |
892 |
|| (k == DOT && c != 0 && c != HAT) |
893 |
|| (k == ALL && c != 0) |
894 |
|| (k == EMPTYRE && c != 0) |
895 |
|| (k == CCL && member(c, (char *) f->re[p[i]].lval.up)) |
896 |
|| (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) { |
897 |
q = f->re[p[i]].lfollow; |
898 |
for (j = 1; j <= *q; j++) { |
899 |
if (q[j] >= maxsetvec) { |
900 |
maxsetvec *= 4; |
901 |
setvec = (int *) realloc(setvec, maxsetvec * sizeof(int)); |
902 |
tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int)); |
903 |
if (setvec == 0 || tmpset == 0) |
904 |
overflo("cgoto overflow"); |
905 |
} |
906 |
if (setvec[q[j]] == 0) { |
907 |
setcnt++; |
908 |
setvec[q[j]] = 1; |
909 |
} |
910 |
} |
911 |
} |
912 |
} |
913 |
} |
914 |
/* determine if setvec is a previous state */ |
915 |
tmpset[0] = setcnt; |
916 |
j = 1; |
917 |
for (i = f->accept; i >= 0; i--) |
918 |
if (setvec[i]) { |
919 |
tmpset[j++] = i; |
920 |
} |
921 |
/* tmpset == previous state? */ |
922 |
for (i = 1; i <= f->curstat; i++) { |
923 |
p = f->posns[i]; |
924 |
if ((k = tmpset[0]) != p[0]) |
925 |
goto different; |
926 |
for (j = 1; j <= k; j++) |
927 |
if (tmpset[j] != p[j]) |
928 |
goto different; |
929 |
/* setvec is state i */ |
930 |
f->gototab[s][c] = i; |
931 |
return i; |
932 |
different:; |
933 |
} |
934 |
|
935 |
/* add tmpset to current set of states */ |
936 |
if (f->curstat >= NSTATES-1) { |
937 |
f->curstat = 2; |
938 |
f->reset = 1; |
939 |
for (i = 2; i < NSTATES; i++) |
940 |
xfree(f->posns[i]); |
941 |
} else |
942 |
++(f->curstat); |
943 |
for (i = 0; i < NCHARS; i++) |
944 |
f->gototab[f->curstat][i] = 0; |
945 |
xfree(f->posns[f->curstat]); |
946 |
if ((p = (int *) calloc(setcnt+1, sizeof(int))) == NULL) |
947 |
overflo("out of space in cgoto"); |
948 |
|
949 |
f->posns[f->curstat] = p; |
950 |
f->gototab[s][c] = f->curstat; |
951 |
for (i = 0; i <= setcnt; i++) |
952 |
p[i] = tmpset[i]; |
953 |
if (setvec[f->accept]) |
954 |
f->out[f->curstat] = 1; |
955 |
else |
956 |
f->out[f->curstat] = 0; |
957 |
return f->curstat; |
958 |
} |
959 |
|
960 |
|
961 |
void freefa(fa *f) /* free a finite automaton */ |
962 |
{ |
963 |
int i; |
964 |
|
965 |
if (f == NULL) |
966 |
return; |
967 |
for (i = 0; i <= f->curstat; i++) |
968 |
xfree(f->posns[i]); |
969 |
for (i = 0; i <= f->accept; i++) { |
970 |
xfree(f->re[i].lfollow); |
971 |
if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL) |
972 |
xfree((f->re[i].lval.np)); |
973 |
} |
974 |
xfree(f->restr); |
975 |
xfree(f); |
976 |
} |