1 /*	$OpenBSD: lex.c,v 1.35 2025/02/05 20:32:56 millert Exp $	*/
2 /****************************************************************
3 Copyright (C) Lucent Technologies 1997
4 All Rights Reserved
5 
6 Permission to use, copy, modify, and distribute this software and
7 its documentation for any purpose and without fee is hereby
8 granted, provided that the above copyright notice appear in all
9 copies and that both that the copyright notice and this
10 permission notice and warranty disclaimer appear in supporting
11 documentation, and that the name Lucent Technologies or any of
12 its entities not be used in advertising or publicity pertaining
13 to distribution of the software without specific, written prior
14 permission.
15 
16 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23 THIS SOFTWARE.
24 ****************************************************************/
25 
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <ctype.h>
30 #include "awk.h"
31 #include "awkgram.tab.h"
32 
33 extern YYSTYPE	yylval;
34 extern bool	infunc;
35 
36 int	lineno	= 1;
37 int	bracecnt = 0;
38 int	brackcnt  = 0;
39 int	parencnt = 0;
40 
/*
 * One reserved word or built-in name.  Field order matters: the keywords
 * table may be initialized positionally.
 */
typedef struct Keyword Keyword;
struct Keyword {
	const char *word;	/* spelling compared against the scanned identifier */
	int	sub;		/* value word() stores in yylval.i on a match */
	int	type;		/* token code word() returns on a match */
};
46 
47 const Keyword keywords[] = {	/* keep sorted: binary searched */
48 	{ "BEGIN",	XBEGIN,		XBEGIN },
49 	{ "END",	XEND,		XEND },
50 	{ "NF",		VARNF,		VARNF },
51 	{ "and",	FAND,		BLTIN },
52 	{ "atan2",	FATAN,		BLTIN },
53 	{ "break",	BREAK,		BREAK },
54 	{ "close",	CLOSE,		CLOSE },
55 	{ "compl",	FCOMPL,		BLTIN },
56 	{ "continue",	CONTINUE,	CONTINUE },
57 	{ "cos",	FCOS,		BLTIN },
58 	{ "delete",	DELETE,		DELETE },
59 	{ "do",		DO,		DO },
60 	{ "else",	ELSE,		ELSE },
61 	{ "exit",	EXIT,		EXIT },
62 	{ "exp",	FEXP,		BLTIN },
63 	{ "fflush",	FFLUSH,		BLTIN },
64 	{ "for",	FOR,		FOR },
65 	{ "func",	FUNC,		FUNC },
66 	{ "function",	FUNC,		FUNC },
67 	{ "gensub",	GENSUB,		GENSUB },
68 	{ "getline",	GETLINE,	GETLINE },
69 	{ "gsub",	GSUB,		GSUB },
70 	{ "if",		IF,		IF },
71 	{ "in",		IN,		IN },
72 	{ "index",	INDEX,		INDEX },
73 	{ "int",	FINT,		BLTIN },
74 	{ "length",	FLENGTH,	BLTIN },
75 	{ "log",	FLOG,		BLTIN },
76 	{ "lshift",	FLSHIFT,	BLTIN },
77 	{ "match",	MATCHFCN,	MATCHFCN },
78 	{ "mktime",	FMKTIME,	BLTIN },
79 	{ "next",	NEXT,		NEXT },
80 	{ "nextfile",	NEXTFILE,	NEXTFILE },
81 	{ "or",		FFOR,		BLTIN },
82 	{ "print",	PRINT,		PRINT },
83 	{ "printf",	PRINTF,		PRINTF },
84 	{ "rand",	FRAND,		BLTIN },
85 	{ "return",	RETURN,		RETURN },
86 	{ "rshift",	FRSHIFT,	BLTIN },
87 	{ "sin",	FSIN,		BLTIN },
88 	{ "split",	SPLIT,		SPLIT },
89 	{ "sprintf",	SPRINTF,	SPRINTF },
90 	{ "sqrt",	FSQRT,		BLTIN },
91 	{ "srand",	FSRAND,		BLTIN },
92 	{ "strftime",	FSTRFTIME,	BLTIN },
93 	{ "sub",	SUB,		SUB },
94 	{ "substr",	SUBSTR,		SUBSTR },
95 	{ "system",	FSYSTEM,	BLTIN },
96 	{ "systime",	FSYSTIME,	BLTIN },
97 	{ "tolower",	FTOLOWER,	BLTIN },
98 	{ "toupper",	FTOUPPER,	BLTIN },
99 	{ "while",	WHILE,		WHILE },
100 	{ "xor",	FXOR,		BLTIN },
101 };
102 
/*
 * RET(x): return token code x from the enclosing function, printing
 * "lex <name>" first when dbg is non-zero.
 *
 * The body is wrapped in do { } while (0) so the expansion is exactly one
 * statement: with a bare { } block, `if (c) RET(a); else RET(b);` would not
 * parse because the ';' after the block ends the if before the else.
 * x is evaluated twice when dbg is set, so it must have no side effects.
 */
#define	RET(x)	do { if (dbg) printf("lex %s\n", tokname(x)); return (x); } while (0)
104 
/*
 * peek: report the next input character without consuming it
 * (reads it with input() and immediately hands it back with unput()).
 */
static int peek(void)
{
	const int ahead = input();

	unput(ahead);
	return ahead;
}
111 
gettok(char ** pbuf,int * psz)112 static int gettok(char **pbuf, int *psz)	/* get next input token */
113 {
114 	int c, retc;
115 	char *buf = *pbuf;
116 	int sz = *psz;
117 	char *bp = buf;
118 
119 	c = input();
120 	if (c == 0)
121 		return 0;
122 	buf[0] = c;
123 	buf[1] = 0;
124 	if (!isalnum(c) && c != '.' && c != '_')
125 		return c;
126 
127 	*bp++ = c;
128 	if (isalpha(c) || c == '_') {	/* it's a varname */
129 		for ( ; (c = input()) != 0; ) {
130 			if (bp-buf >= sz)
131 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
132 					FATAL( "out of space for name %.10s...", buf );
133 			if (isalnum(c) || c == '_')
134 				*bp++ = c;
135 			else {
136 				*bp = 0;
137 				unput(c);
138 				break;
139 			}
140 		}
141 		*bp = 0;
142 		retc = 'a';	/* alphanumeric */
143 	} else {	/* maybe it's a number, but could be . */
144 		char *rem;
145 		/* read input until can't be a number */
146 		for ( ; (c = input()) != 0; ) {
147 			if (bp-buf >= sz)
148 				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
149 					FATAL( "out of space for number %.10s...", buf );
150 			if (isdigit(c) || c == 'e' || c == 'E'
151 			  || c == '.' || c == '+' || c == '-')
152 				*bp++ = c;
153 			else {
154 				unput(c);
155 				break;
156 			}
157 		}
158 		*bp = 0;
159 		strtod(buf, &rem);	/* parse the number */
160 		if (rem == buf) {	/* it wasn't a valid number at all */
161 			buf[1] = 0;	/* return one character as token */
162 			retc = (uschar)buf[0];	/* character is its own type */
163 			unputstr(rem+1); /* put rest back for later */
164 		} else {	/* some prefix was a number */
165 			unputstr(rem);	/* put rest back for later */
166 			rem[0] = 0;	/* truncate buf after number part */
167 			retc = '0';	/* type is number */
168 		}
169 	}
170 	*pbuf = buf;
171 	*psz = sz;
172 	return retc;
173 }
174 
/* Token-level scanners defined further down in this file. */
int	word(char *w);
int	string(void);
int	regexpr(void);

/* One-shot flags that yylex() tests and clears on entry. */
bool	reg = false;	/* true => return a REGEXPR now */
bool	sc = false;	/* true => return a } right now */
180 
yylex(void)181 int yylex(void)
182 {
183 	int c;
184 	static char *buf = NULL;
185 	static int bufsize = 5; /* BUG: setting this small causes core dump! */
186 
187 	if (buf == NULL && (buf = (char *) malloc(bufsize)) == NULL)
188 		FATAL( "out of space in yylex" );
189 	if (sc) {
190 		sc = false;
191 		RET('}');
192 	}
193 	if (reg) {
194 		reg = false;
195 		return regexpr();
196 	}
197 	for (;;) {
198 		c = gettok(&buf, &bufsize);
199 		if (c == 0)
200 			return 0;
201 		if (isalpha(c) || c == '_')
202 			return word(buf);
203 		if (isdigit(c)) {
204 			char *cp = tostring(buf);
205 			double result;
206 
207 			if (is_number(cp, & result))
208 				yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab);
209 			else
210 				yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab);
211 			free(cp);
212 			/* should this also have STR set? */
213 			RET(NUMBER);
214 		}
215 
216 		yylval.i = c;
217 		switch (c) {
218 		case '\n':	/* {EOL} */
219 			lineno++;
220 			RET(NL);
221 		case '\r':	/* assume \n is coming */
222 		case ' ':	/* {WS}+ */
223 		case '\t':
224 			break;
225 		case '#':	/* #.* strip comments */
226 			while ((c = input()) != '\n' && c != 0)
227 				;
228 			unput(c);
229 			break;
230 		case ';':
231 			RET(';');
232 		case '\\':
233 			if (peek() == '\n') {
234 				input();
235 				lineno++;
236 			} else if (peek() == '\r') {
237 				input(); input();	/* \n */
238 				lineno++;
239 			} else {
240 				RET(c);
241 			}
242 			break;
243 		case '&':
244 			if (peek() == '&') {
245 				input(); RET(AND);
246 			} else
247 				RET('&');
248 		case '|':
249 			if (peek() == '|') {
250 				input(); RET(BOR);
251 			} else
252 				RET('|');
253 		case '!':
254 			if (peek() == '=') {
255 				input(); yylval.i = NE; RET(NE);
256 			} else if (peek() == '~') {
257 				input(); yylval.i = NOTMATCH; RET(MATCHOP);
258 			} else
259 				RET(NOT);
260 		case '~':
261 			yylval.i = MATCH;
262 			RET(MATCHOP);
263 		case '<':
264 			if (peek() == '=') {
265 				input(); yylval.i = LE; RET(LE);
266 			} else {
267 				yylval.i = LT; RET(LT);
268 			}
269 		case '=':
270 			if (peek() == '=') {
271 				input(); yylval.i = EQ; RET(EQ);
272 			} else {
273 				yylval.i = ASSIGN; RET(ASGNOP);
274 			}
275 		case '>':
276 			if (peek() == '=') {
277 				input(); yylval.i = GE; RET(GE);
278 			} else if (peek() == '>') {
279 				input(); yylval.i = APPEND; RET(APPEND);
280 			} else {
281 				yylval.i = GT; RET(GT);
282 			}
283 		case '+':
284 			if (peek() == '+') {
285 				input(); yylval.i = INCR; RET(INCR);
286 			} else if (peek() == '=') {
287 				input(); yylval.i = ADDEQ; RET(ASGNOP);
288 			} else
289 				RET('+');
290 		case '-':
291 			if (peek() == '-') {
292 				input(); yylval.i = DECR; RET(DECR);
293 			} else if (peek() == '=') {
294 				input(); yylval.i = SUBEQ; RET(ASGNOP);
295 			} else
296 				RET('-');
297 		case '*':
298 			if (peek() == '=') {	/* *= */
299 				input(); yylval.i = MULTEQ; RET(ASGNOP);
300 			} else if (peek() == '*') {	/* ** or **= */
301 				input();	/* eat 2nd * */
302 				if (peek() == '=') {
303 					input(); yylval.i = POWEQ; RET(ASGNOP);
304 				} else {
305 					RET(POWER);
306 				}
307 			} else
308 				RET('*');
309 		case '/':
310 			RET('/');
311 		case '%':
312 			if (peek() == '=') {
313 				input(); yylval.i = MODEQ; RET(ASGNOP);
314 			} else
315 				RET('%');
316 		case '^':
317 			if (peek() == '=') {
318 				input(); yylval.i = POWEQ; RET(ASGNOP);
319 			} else
320 				RET(POWER);
321 
322 		case '$':
323 			/* BUG: awkward, if not wrong */
324 			c = gettok(&buf, &bufsize);
325 			if (isalpha(c)) {
326 				if (strcmp(buf, "NF") == 0) {	/* very special */
327 					unputstr("(NF)");
328 					RET(INDIRECT);
329 				}
330 				c = peek();
331 				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
332 					unputstr(buf);
333 					RET(INDIRECT);
334 				}
335 				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
336 				RET(IVAR);
337 			} else if (c == 0) {	/*  */
338 				SYNTAX( "unexpected end of input after $" );
339 				RET(';');
340 			} else {
341 				unputstr(buf);
342 				RET(INDIRECT);
343 			}
344 
345 		case '}':
346 			if (--bracecnt < 0)
347 				SYNTAX( "extra }" );
348 			sc = true;
349 			RET(';');
350 		case ']':
351 			if (--brackcnt < 0)
352 				SYNTAX( "extra ]" );
353 			RET(']');
354 		case ')':
355 			if (--parencnt < 0)
356 				SYNTAX( "extra )" );
357 			RET(')');
358 		case '{':
359 			bracecnt++;
360 			RET('{');
361 		case '[':
362 			brackcnt++;
363 			RET('[');
364 		case '(':
365 			parencnt++;
366 			RET('(');
367 
368 		case '"':
369 			return string();	/* BUG: should be like tran.c ? */
370 
371 		default:
372 			RET(c);
373 		}
374 	}
375 }
376 
string(void)377 int string(void)
378 {
379 	int c, n;
380 	char *s, *bp;
381 	static char *buf = NULL;
382 	static int bufsz = 500;
383 
384 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
385 		FATAL("out of space for strings");
386 	for (bp = buf; (c = input()) != '"'; ) {
387 		if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
388 			FATAL("out of space for string %.10s...", buf);
389 		switch (c) {
390 		case '\n':
391 		case '\r':
392 		case 0:
393 			*bp = '\0';
394 			SYNTAX( "non-terminated string %.10s...", buf );
395 			if (c == 0)	/* hopeless */
396 				FATAL( "giving up" );
397 			lineno++;
398 			break;
399 		case '\\':
400 			c = input();
401 			switch (c) {
402 			case '\n': break;
403 			case '"': *bp++ = '"'; break;
404 			case 'n': *bp++ = '\n'; break;
405 			case 't': *bp++ = '\t'; break;
406 			case 'f': *bp++ = '\f'; break;
407 			case 'r': *bp++ = '\r'; break;
408 			case 'b': *bp++ = '\b'; break;
409 			case 'v': *bp++ = '\v'; break;
410 			case 'a': *bp++ = '\a'; break;
411 			case '\\': *bp++ = '\\'; break;
412 
413 			case '0': case '1': case '2': /* octal: \d \dd \ddd */
414 			case '3': case '4': case '5': case '6': case '7':
415 				n = c - '0';
416 				if ((c = peek()) >= '0' && c < '8') {
417 					n = 8 * n + input() - '0';
418 					if ((c = peek()) >= '0' && c < '8')
419 						n = 8 * n + input() - '0';
420 				}
421 				*bp++ = n;
422 				break;
423 
424 			case 'x':	/* hex  \x0-9a-fA-F (exactly two) */
425 			    {
426 				int i;
427 
428 				if (!isxdigit(peek())) {
429 					unput(c);
430 					break;
431 				}
432 				n = 0;
433 				for (i = 0; i < 2; i++) {
434 					c = input();
435 					if (c == 0)
436 						break;
437 					if (isxdigit(c)) {
438 						c = tolower(c);
439 						n *= 16;
440 						if (isdigit(c))
441 							n += (c - '0');
442 						else
443 							n += 10 + (c - 'a');
444 					} else {
445 						unput(c);
446 						break;
447 					}
448 				}
449 				if (i)
450 					*bp++ = n;
451 				break;
452 			    }
453 
454 			case 'u':	/* utf  \u0-9a-fA-F (1..8) */
455 			    {
456 				int i;
457 
458 				n = 0;
459 				for (i = 0; i < 8; i++) {
460 					c = input();
461 					if (!isxdigit(c) || c == 0)
462 						break;
463 					c = tolower(c);
464 					n *= 16;
465 					if (isdigit(c))
466 						n += (c - '0');
467 					else
468 						n += 10 + (c - 'a');
469 				}
470 				unput(c);
471 				bp += runetochar(bp, n);
472 				break;
473 			    }
474 
475 			default:
476 				*bp++ = c;
477 				break;
478 			}
479 			break;
480 		default:
481 			*bp++ = c;
482 			break;
483 		}
484 	}
485 	*bp = 0;
486 	s = tostring(buf);
487 	*bp++ = ' '; *bp++ = '\0';
488 	yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
489 	free(s);
490 	RET(STRING);
491 }
492 
493 
binsearch(char * w,const Keyword * kp,int n)494 static int binsearch(char *w, const Keyword *kp, int n)
495 {
496 	int cond, low, mid, high;
497 
498 	low = 0;
499 	high = n - 1;
500 	while (low <= high) {
501 		mid = (low + high) / 2;
502 		if ((cond = strcmp(w, kp[mid].word)) < 0)
503 			high = mid - 1;
504 		else if (cond > 0)
505 			low = mid + 1;
506 		else
507 			return mid;
508 	}
509 	return -1;
510 }
511 
word(char * w)512 int word(char *w)
513 {
514 	const Keyword *kp;
515 	int c, n;
516 
517 	n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
518 	if (n != -1) {	/* found in table */
519 		kp = keywords + n;
520 		yylval.i = kp->sub;
521 		switch (kp->type) {	/* special handling */
522 		case BLTIN:
523 			if (kp->sub == FSYSTEM && safe)
524 				SYNTAX( "system is unsafe" );
525 			RET(kp->type);
526 		case FUNC:
527 			if (infunc)
528 				SYNTAX( "illegal nested function" );
529 			RET(kp->type);
530 		case RETURN:
531 			if (!infunc)
532 				SYNTAX( "return not in function" );
533 			RET(kp->type);
534 		case VARNF:
535 			yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
536 			RET(VARNF);
537 		default:
538 			RET(kp->type);
539 		}
540 	}
541 	c = peek();	/* look for '(' */
542 	if (c != '(' && infunc && (n=isarg(w)) >= 0) {
543 		yylval.i = n;
544 		RET(ARG);
545 	} else {
546 		yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
547 		if (c == '(') {
548 			RET(CALL);
549 		} else {
550 			RET(VAR);
551 		}
552 	}
553 }
554 
startreg(void)555 void startreg(void)	/* next call to yylex will return a regular expression */
556 {
557 	reg = true;
558 }
559 
regexpr(void)560 int regexpr(void)
561 {
562 	int c, openclass = 0;
563 	static char *buf = NULL;
564 	static int bufsz = 500;
565 	char *bp, *cstart;
566 
567 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
568 		FATAL("out of space for reg expr");
569 	bp = buf;
570 	for ( ; ((c = input()) != '/' || openclass > 0) && c != 0; ) {
571 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
572 			FATAL("out of space for reg expr %.10s...", buf);
573 		if (c == '\n') {
574 			*bp = '\0';
575 			SYNTAX( "newline in regular expression %.10s...", buf );
576 			unput('\n');
577 			break;
578 		} else if (c == '\\') {
579 			*bp++ = '\\';
580 			*bp++ = input();
581 		} else {
582 			/*
583 			 * POSIX requires a slash in a regexp to be escaped,
584 			 * other awks don't require it to be escaped inside
585 			 * a character class.
586 			 */
587 			if (!do_posix) {
588 				if (c == '[') {
589 					int nextc = peek();
590 					if (openclass == 0 || nextc == ':' ||
591 					    nextc == '.' || nextc == '=') {
592 						if (++openclass == 1)
593 							cstart = bp;
594 					}
595 				} else if (c == ']' && openclass > 0) {
596 					/*
597 					 * A ']' as the first char in a
598 					 * class is treated literally.
599 					 */
600 					if (cstart != bp - 1 &&
601 					    (cstart != bp - 2 || bp[-1] != '^'))
602 						openclass--;
603 				}
604 			}
605 			*bp++ = c;
606 		}
607 	}
608 	*bp = 0;
609 	if (c == 0)
610 		SYNTAX("non-terminated regular expression %.10s...", buf);
611 	yylval.s = tostring(buf);
612 	unput('/');
613 	RET(REGEXPR);
614 }
615 
/* low-level lexical stuff, sort of inherited from lex */

/*
 * ebuf/ep: circular record of characters read; input() stores at ep and
 *	unput() steps ep back.
 * yysbuf/yysptr: pushback stack; unput() pushes at yysptr and input()
 *	pops from it before reading anything new.
 */
char	ebuf[300], yysbuf[100];
char	*ep = ebuf, *yysptr = yysbuf;
FILE	*yyin = NULL;
623 
input(void)624 int input(void)	/* get next lexical input character */
625 {
626 	int c;
627 	extern char *lexprog;
628 
629 	if (yysptr > yysbuf)
630 		c = (uschar)*--yysptr;
631 	else if (lexprog != NULL) {	/* awk '...' */
632 		if ((c = (uschar)*lexprog) != 0)
633 			lexprog++;
634 	} else				/* awk -f ... */
635 		c = pgetc();
636 	if (c == EOF)
637 		c = 0;
638 	if (ep >= ebuf + sizeof ebuf)
639 		ep = ebuf;
640 	*ep = c;
641 	if (c != 0) {
642 		ep++;
643 	}
644 	return (c);
645 }
646 
unput(int c)647 void unput(int c)	/* put lexical character back on input */
648 {
649 	if (yysptr >= yysbuf + sizeof(yysbuf))
650 		FATAL("pushed back too much: %.20s...", yysbuf);
651 	*yysptr++ = c;
652 	if (--ep < ebuf)
653 		ep = ebuf + sizeof(ebuf) - 1;
654 }
655 
/*
 * unputstr: put a string back on input.  Characters are pushed last-first
 * so that subsequent input() calls return them in their original order.
 */
void unputstr(const char *s)
{
	const char *p = s + strlen(s);

	while (p > s)
		unput(*--p);
}
663