2 /********************************************
4 copyright 1991, Michael D. Brennan
6 This is a source file for mawk, an implementation of
7 the AWK programming language.
9 Mawk is distributed without warranty under the terms of
10 the GNU General Public License, version 2, 1991.
11 ********************************************/
14 *Revision 1.5 1996/11/08 15:39:27 mike
15 *While cleaning up block_on, I introduced a bug. Now fixed.
17 *Revision 1.4 1996/09/02 18:47:09 mike
18 *Allow []...] and [^]...] to put ] in a class.
19 *Make ^* and ^+ syntax errors.
21 * Revision 1.3 1994/12/26 16:37:52 mike
22 * 1.1.2 fix to do_str was incomplete -- fix it
24 * Revision 1.2 1993/07/23 13:21:38 mike
27 * Revision 1.1.1.1 1993/07/03 18:58:27 mike
30 * Revision 3.8 1992/04/21 20:22:38 brennan
32 * [c1-c2] now works if c2 is an escaped character
34 * Revision 3.7 1992/03/24 09:33:12 brennan
36 * When backing up in do_str, check if last character was escaped
38 * Revision 3.6 92/01/21 17:32:51 brennan
39 * added some casts so that character classes work with signed chars
41 * Revision 3.5 91/10/29 10:53:57 brennan
44 * Revision 3.4 91/08/13 09:10:05 brennan
47 * Revision 3.3 91/07/19 07:29:24 brennan
48 * backslash at end of regular expression now stands for backslash
50 * Revision 3.2 91/07/19 06:58:23 brennan
51 * removed small bozo in processing of escape characters
53 * Revision 3.1 91/06/07 10:33:20 brennan
56 * Revision 1.2 91/06/05 08:59:36 brennan
57 * changed RE_free to free
59 * Revision 1.1 91/06/03 07:10:15 brennan
68 /* static functions */
69 static int PROTO(do_str, (int, char **, MACHINE *)) ;
70 static int PROTO(do_class, (char **, MACHINE *)) ;
71 static int PROTO(escape, (char **)) ;
72 static BV *PROTO(store_bvp, (BV *)) ;
73 static int PROTO(ctohex, (int)) ;
77 /* make next array visible */
80 char RE_char2token[ '|' + 1 ] = {
81 0,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
82 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,9,13,13,13,
83 6,7,3,4,13,13,10,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
84 13,13,5,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
85 13,13,13,13,13,13,13,13,13,13,11,12,13,8,13,13,13,13,13,13,
86 13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
89 #define char2token(x) \
90 ( (unsigned char)(x) > '|' ? T_CHAR : RE_char2token[x] )
92 #define NOT_STARTED (-1)
95 static char *lp ; /* ptr to reg exp string */
96 static unsigned re_len ;
104 re_len = strlen(re) + 1 ;
106 RE_run_stack_init() ;
116 switch (c = char2token(*lp))
120 if (prev == T_START) RE_error_trap(6) ;
126 lp++ ; return prev = c ;
147 return prev = T_CAT ;
155 /* *lp is an operand, but implicit cat op is possible */
167 static int plus_is_star_flag = 0 ;
177 if (plus_is_star_flag)
181 plus_is_star_flag = 0 ;
186 plus_is_star_flag = 1 ;
189 return prev = T_ANY ;
203 prev = do_str(c, &lp, mp) ;
208 prev = do_str(c, &lp, mp) ;
212 prev = do_class(&lp, mp) ;
224 return prev = T_END ;
227 RE_panic("bad switch in RE_lex") ;
232 /* don't advance the pointer */
233 return prev = T_CAT ;
236 /* check for end character */
239 mp->start->type += END_ON ;
247 Collect a run of characters into a string machine.
248 If the run ends at *,+, or ?, then don't take the last
249 character unless the string has length one.
254 int c ; /* the first character */
255 char **pp ; /* where to put the re_char pointer on exit */
256 MACHINE *mp ; /* where to put the string machine */
258 register char *p ; /* runs thru the input */
259 char *pt ; /* trails p by one */
260 char *str ; /* collect it here */
261 register char *s ; /* runs thru the output */
262 unsigned len ; /* length collected */
266 s = str = RE_malloc(re_len) ;
273 switch (char2token(*p))
282 save = p+1 ; /* keep p in a register */
283 *s++ = escape(&save) ;
294 /* if len > 1 and we stopped on a ? + or * , need to back up */
295 if (len > 1 && (*p == '*' || *p == '+' || *p == '?'))
304 *mp = RE_str((char *) RE_realloc(str, len + 1), len) ;
309 /*--------------------------------------------
310 BUILD A CHARACTER CLASS
311 *---------------------------*/
313 #define on( b, x) ((b)[(x)>>3] |= ( 1 << ((x)&7) ))
315 static void PROTO(block_on, (BV, int, int)) ;
321 /* caller makes sure x<=y and x>0 y>0 */
330 b[lo] |= (1<<(r_hi+1)) - (1<<r_lo) ;
335 for (i = lo + 1; i < hi ; i++) b[i] = 0xff ;
336 b[lo] |= (0xff << r_lo) ;
337 b[hi] |= ~(0xff << (r_hi+1)) ;
341 /* build a BV for a character class.
342 *start points at the '['
343 on exit: *start points at the character after ']'
344 mp points at a machine that recognizes the class
348 do_class(start, mp) char **start ;
358 p = t = (*start) + 1 ;
360 /* []...] puts ] in a class
361 [^]..] negates a class with ]
364 else if (*p == '^' && *(p + 1) == ']') p += 2 ;
366 while (1) /* find the back of the class */
368 if (!(q = strchr(p, ']')))
370 /* no closing bracket */
381 /* even number of \ */
387 /* q now pts at the back of the class */
391 bvp = (BV *) RE_malloc(sizeof(BV)) ;
392 memset(bvp, 0, sizeof(BV)) ;
396 comp_flag = 1 ; p++ ;
400 prev = -1 ; /* indicates - cannot be part of a range */
416 if (prev == -1 || p + 1 == q)
427 if (*p != '\\') c = *(unsigned char *) p++ ;
437 block_on(*bvp, prev, c) ;
450 prev = *(unsigned char *) p++ ;
458 for (p = (char *) bvp; p < (char *) bvp + sizeof(BV); p++)
464 /* make sure zero is off */
467 *mp = RE_class(store_bvp(bvp)) ;
472 /* storage for bit vectors so they can be reused ,
473 stored in an unsorted linear array
474 the array grows as needed
483 static BV **bv_base, **bv_limit ;
484 static BV **bv_next ; /* next empty slot in the array */
490 if (bv_next == bv_limit)
497 bv_base = (BV **) RE_malloc(BV_GROWTH * sizeof(BV *)) ;
501 t = bv_next - bv_base ;
502 bv_base = (BV **) RE_realloc(bv_base, (t + BV_GROWTH) * sizeof(BV *)) ;
505 bv_next = bv_base + t ;
506 bv_limit = bv_next + BV_GROWTH ;
509 /* put bvp in bv_next as a sentinal */
512 while (memcmp(*p, bvp, sizeof(BV))) p++ ;
521 /* we already have it */
529 /* ---------- convert escape sequences -------------*/
531 #define isoctal(x) ((x)>='0'&&(x)<='7')
534 static char hex_val['f' - 'A' + 1] =
536 10, 11, 12, 13, 14, 15, 0, 0,
537 0, 0, 0, 0, 0, 0, 0, 0,
538 0, 0, 0, 0, 0, 0, 0, 0,
539 0, 0, 0, 0, 0, 0, 0, 0,
540 10, 11, 12, 13, 14, 15} ;
542 /* interpret 1 character as hex */
549 if (c >= '0' && c <= '9') return c - '0' ;
550 if (c >= 'A' && c <= 'f' && (t = hex_val[c - 'A'])) return t ;
560 escape_test[ET_END + 1] =
575 and move the pointer forward
576 on entry *s -> at the character after the slash
577 *-------------------*/
583 register char *p = *start_p ;
584 register unsigned x ;
589 escape_test[ET_END].in = *p ;
591 while (escape_test[i].in != *p) i++ ;
594 /* in escape_test table */
596 return escape_test[i].out ;
604 x = (x << 3) + *p++ - '0' ;
605 if (isoctal(*p)) x = (x << 3) + *p++ - '0' ;
611 if (*p == 0) return '\\' ;
615 if ((x = ctohex(*p)) == NOT_HEX)
617 *start_p = p ; return 'x' ;
620 /* look for another hex digit */
621 if ((xx = ctohex(*++p)) != NOT_HEX)
623 x = (x<<4) + xx ; p++ ;
626 *start_p = p ; return x ;
629 /* anything else \c -> c */
631 return *(unsigned char *) (p - 1) ;