2 * node.c -- routines for node management
6 * Copyright (C) 1986, 1988, 1989, 1991-2001, 2003-2011,
7 * the Free Software Foundation, Inc.
9 * This file is part of GAWK, the GNU implementation of the
10 * AWK Programming Language.
12 * GAWK is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 3 of the License, or
15 * (at your option) any later version.
17 * GAWK is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
30 static int is_ieee_magic_val(const char *val);
31 static AWKNUM get_ieee_magic_val(const char *val);
33 /* force_number --- force a value to be numeric */
36 r_force_number(NODE *n)
42 unsigned int newflags;
43 extern double strtod();
45 if (n->flags & NUMCUR)
48 /* all the conditionals are an attempt to avoid the expensive strtod */
50 /* Note: only set NUMCUR if we actually convert some digits */
61 * POSIX, by way of severe language lawyering, seems to
62 * allow things like "inf" and "nan" to mean something.
63 * So if do_posix, the user gets what he deserves.
64 * This also allows hexadecimal floating point. Ugh.
67 if (isalpha((unsigned char) *cp)) {
69 } else if (n->stlen == 4 && is_ieee_magic_val(n->stptr)) {
70 if (n->flags & MAYBE_NUM)
71 n->flags &= ~MAYBE_NUM;
72 n->flags |= NUMBER|NUMCUR;
73 n->numbr = get_ieee_magic_val(n->stptr);
83 cpend = cp + n->stlen;
84 while (cp < cpend && isspace((unsigned char) *cp))
87 if ( cp == cpend /* only spaces, or */
88 || (! do_posix /* not POSIXLY paranoid and */
89 && (isalpha((unsigned char) *cp) /* letter, or */
90 /* CANNOT do non-decimal and saw 0x */
91 || (! do_non_decimal_data && cp[0] == '0'
92 && (cp[1] == 'x' || cp[1] == 'X'))))) {
96 if (n->flags & MAYBE_NUM) {
98 n->flags &= ~MAYBE_NUM;
102 if (cpend - cp == 1) { /* only one character */
103 if (isdigit((unsigned char) *cp)) { /* it's a digit! */
104 n->numbr = (AWKNUM)(*cp - '0');
105 n->flags |= newflags;
111 if (do_non_decimal_data) { /* main.c assures false if do_posix */
113 if (! do_traditional && isnondecimal(cp, TRUE)) {
114 n->numbr = nondec2awknum(cp, cpend - cp);
124 n->numbr = (AWKNUM) strtod((const char *) cp, &ptr);
126 /* POSIX says trailing space is OK for NUMBER */
127 while (isspace((unsigned char) *ptr))
131 if (errno == 0 && ptr == cpend) {
132 n->flags |= newflags;
143 * The following lookup table is used as an optimization in force_string;
144 * (more complicated) variations on this theme didn't seem to pay off, but
145 * systematic testing might be in order at some point.
147 static const char *values[] = {
159 #define NVAL (sizeof(values)/sizeof(values[0]))
161 /* format_val --- format a numeric value based on format */
164 format_val(const char *format, int index, NODE *s)
169 char *orig, *trans, save;
171 if (! do_traditional && (s->flags & INTLSTR) != 0) {
172 save = s->stptr[s->stlen];
173 s->stptr[s->stlen] = '\0';
176 trans = dgettext(TEXTDOMAIN, orig);
178 s->stptr[s->stlen] = save;
179 return make_string(trans, strlen(trans));
183 * 2/2007: Simplify our lives here. Instead of worrying about
184 * whether or not the value will fit into a long just so we
185 * can use sprintf("%ld", val) on it, always format it ourselves.
186 * The only thing to worry about is that integral values always
187 * format as integers. %.0f does that very well.
189 * 6/2008: Would that things were so simple. Always using %.0f
190 * imposes a notable performance penalty for applications that
191 * do a lot of conversion of integers to strings. So, we reinstate
192 * the old code, but use %.0f for integral values that are outside
193 * the range of a long. This seems a reasonable compromise.
195 * 12/2009: Use <= and >= in the comparisons with LONG_xxx instead of
196 * < and > so that things work correctly on systems with 64 bit integers.
199 /* not an integral value, or out of range */
200 if ((val = double_to_int(s->numbr)) != s->numbr
201 || val <= LONG_MIN || val >= LONG_MAX) {
203 * Once upon a time, we just blindly did this:
204 * sprintf(sp, format, s->numbr);
205 * s->stlen = strlen(sp);
206 * s->stfmt = (char) index;
207 * but that's no good if, e.g., OFMT is %s. So we punt,
208 * and just always format the value ourselves.
212 unsigned short oflags;
213 extern NODE **fmt_list; /* declared in eval.c */
215 /* create dummy node for a sole use of format_tree */
218 if (val == s->numbr) {
219 /* integral value, but outside range of %ld, use %.0f */
220 r = format_tree("%.0f", 4, dummy, 2);
223 r = format_tree(format, fmt_list[index]->stlen, dummy, 2);
225 s->stfmt = (char) index;
229 if ((s->flags & STRCUR) != 0)
232 freenode(r); /* Do not unref(r)! We want to keep s->stptr == r->stptr. */
238 * force conversion to long only once
240 long num = (long) val;
242 if (num < NVAL && num >= 0) {
243 sp = (char *) values[num];
246 (void) sprintf(sp, "%ld", num);
247 s->stlen = strlen(sp);
251 if (s->stptr != NULL)
253 emalloc(s->stptr, char *, s->stlen + 2, "format_val");
254 memcpy(s->stptr, sp, s->stlen+1);
261 /* force_string --- force a value to be a string */
264 r_force_string(NODE *s)
266 if ((s->flags & STRCUR) != 0
267 && (s->stfmt == -1 || s->stfmt == CONVFMTidx)
270 return format_val(CONVFMT, CONVFMTidx, s);
273 /* dupnode --- duplicate a node */
280 if (n->type == Node_ahash) {
285 assert(n->type == Node_val);
287 if ((n->flags & PERM) != 0)
290 if ((n->flags & MALLOC) != 0) {
302 * DON'T call free_wstr(r) here!
303 * r->wstptr still points at n->wstptr's value, and we
304 * don't want to free it!
308 #endif /* MBS_SUPPORT */
310 if ((n->flags & STRCUR) != 0) {
311 emalloc(r->stptr, char *, n->stlen + 2, "dupnode");
312 memcpy(r->stptr, n->stptr, n->stlen);
313 r->stptr[n->stlen] = '\0';
315 if ((n->flags & WSTRCUR) != 0) {
316 r->wstlen = n->wstlen;
317 emalloc(r->wstptr, wchar_t *, sizeof(wchar_t) * (n->wstlen + 2), "dupnode");
318 memcpy(r->wstptr, n->wstptr, n->wstlen * sizeof(wchar_t));
319 r->wstptr[n->wstlen] = L'\0';
322 #endif /* MBS_SUPPORT */
328 /* mk_number --- allocate a node with defined number */
331 mk_number(AWKNUM x, unsigned int flags)
346 /* make_str_node --- make a string node */
349 r_make_str_node(const char *s, unsigned long len, int flags)
356 r->flags = (STRING|STRCUR|MALLOC);
360 #endif /* MBS_SUPPORT */
362 if (flags & ALREADY_MALLOCED)
363 r->stptr = (char *) s;
365 emalloc(r->stptr, char *, len + 2, "make_str_node");
366 memcpy(r->stptr, s, len);
368 r->stptr[len] = '\0';
370 if ((flags & SCAN) != 0) { /* scan for escape sequences */
378 memset(& cur_state, 0, sizeof(cur_state));
381 end = &(r->stptr[len]);
382 for (pf = ptm = r->stptr; pf < end;) {
385 * Keep multibyte characters together. This avoids
386 * problems if a subsequent byte of a multibyte
387 * character happens to be a backslash.
389 if (gawk_mb_cur_max > 1) {
390 int mblen = mbrlen(pf, end-pf, &cur_state);
395 for (i = 0; i < mblen; i++)
403 c = parse_escape(&pf);
406 lintwarn(_("backslash at end of string"));
413 len = ptm - r->stptr;
414 erealloc(r->stptr, char *, len + 1, "make_str_node");
415 r->stptr[len] = '\0';
426 /* more_nodes --- allocate more nodes */
428 #define NODECHUNK 100
430 NODE *nextfree = NULL;
437 /* get more nodes and initialize list */
438 emalloc(nextfree, NODE *, NODECHUNK * sizeof(NODE), "more_nodes");
439 memset(nextfree, 0, NODECHUNK * sizeof(NODE));
440 for (np = nextfree; np <= &nextfree[NODECHUNK - 1]; np++) {
446 nextfree = nextfree->nextp;
450 /* unref --- remove reference to a particular node */
457 if ((tmp->flags & PERM) != 0)
460 if (tmp->type == Node_ahash) {
461 if (tmp->ahname_ref > 1)
464 efree(tmp->ahname_str);
470 if ((tmp->flags & MALLOC) != 0) {
471 if (tmp->valref > 1) {
475 if (tmp->flags & STRCUR)
486 * Parse a C escape sequence. STRING_PTR points to a variable containing a
487 * pointer to the string to parse. That pointer is updated past the
488 * characters we use. The value of the escape sequence is returned.
490 * A negative value means the sequence \ newline was seen, which is supposed to
491 * be equivalent to nothing at all.
493 * If \ is followed by a null character, we return a negative value and leave
494 * the string pointer pointing at the null character.
496 * If \ is followed by 000, we return 0 and leave the string pointer after the
497 * zeros. A value of 0 does not mean end of string.
499 * POSIX doesn't allow \x.
503 parse_escape(const char **string_ptr)
505 int c = *(*string_ptr)++;
517 warning(_("old awk does not support the `\\%c' escape sequence"), c);
552 while (++count < 3) {
553 if ((c = *(*string_ptr)++) >= '0' && c <= '7') {
564 static short warned = FALSE;
568 lintwarn(_("POSIX does not allow `\\x' escapes"));
573 if (! isxdigit((unsigned char) (*string_ptr)[0])) {
574 warning(_("no hex digits in `\\x' escape sequence"));
580 /* do outside test to avoid multiple side effects */
581 c = *(*string_ptr)++;
595 if (do_lint && j > 2)
596 lintwarn(_("hex escape \\x%.*s of %d characters probably not interpreted the way you expect"), j, start, j);
603 static short warned[256];
604 unsigned char uc = (unsigned char) c;
606 /* N.B.: use unsigned char here to avoid Latin-1 problems */
611 warning(_("escape sequence `\\%c' treated as plain `%c'"), uc, uc);
618 /* isnondecimal --- return true if number is not a decimal number */
621 isnondecimal(const char *str, int use_locale)
624 #if defined(HAVE_LOCALE_H)
626 * loc.decimal_point may not have been initialized yet,
627 * so double check it before using it.
629 if (use_locale && loc.decimal_point != NULL && loc.decimal_point[0] != '\0')
630 dec_point = loc.decimal_point[0]; /* XXX --- assumes one char */
636 /* leading 0x or 0X */
637 if (str[1] == 'x' || str[1] == 'X')
641 * Numbers with '.', 'e', or 'E' are decimal.
642 * Have to check so that things like 00.34 are handled right.
644 * These beasts can have trailing whitespace. Deal with that too.
646 for (; *str != '\0'; str++) {
647 if (*str == 'e' || *str == 'E' || *str == dec_point)
649 else if (! isdigit((unsigned char) *str))
657 /* str2wstr --- convert a multibyte string to a wide string */
660 str2wstr(NODE *n, size_t **ptr)
662 size_t i, count, src_count;
666 static short warned = FALSE;
668 assert((n->flags & (STRING|STRCUR)) != 0);
671 * Don't convert global null string or global null field
672 * variables to a wide string. They are both zero-length anyway.
673 * This also avoids future double-free errors while releasing
674 * shallow copies, eg. *tmp = *Null_field; free_wstr(tmp);
676 if (n == Nnull_string || n == Null_field)
679 if ((n->flags & WSTRCUR) != 0) {
683 fall through and recompute to fill in the array */
688 * After consideration and consultation, this
689 * code trades space for time. We allocate
690 * an array of wchar_t that is n->stlen long.
691 * This is needed in the worst case anyway, where
692 * each input byte maps to one wchar_t. The
693 * advantage is that we only have to convert the string
694 * once, instead of twice, once to find out how many
695 * wide characters, and then again to actually fill in
696 * the info. If there's a lot left over, we can
697 * realloc the wide string down in size.
700 emalloc(n->wstptr, wchar_t *, sizeof(wchar_t) * (n->stlen + 2), "str2wstr");
704 * For use by do_match, create and fill in an array.
705 * For each byte `i' in n->stptr (the original string),
706 * a[i] is equal to `j', where `j' is the corresponding wchar_t
707 * in the converted wide string.
712 emalloc(*ptr, size_t *, sizeof(size_t) * n->stlen, "str2wstr");
713 memset(*ptr, 0, sizeof(size_t) * n->stlen);
717 src_count = n->stlen;
718 memset(& mbs, 0, sizeof(mbs));
719 for (i = 0; src_count > 0; i++) {
721 * 9/2010: Check the current byte; if it's a valid character,
722 * then it doesn't start a multibyte sequence. This brings a
723 * big speed up. Thanks to Ulrich Drepper for the tip.
724 * 11/2010: Thanks to Paolo Bonzini for some even faster code.
726 if (is_valid_character(*sp)) {
728 wc = btowc_cache(*sp);
730 count = mbrtowc(& wc, sp, src_count, & mbs);
735 * Just skip the bad byte and keep going, so that
736 * we get a more-or-less full string, instead of
737 * stopping early. This is particularly important
738 * for match() where we need to build the indices.
743 * mbrtowc(3) says the state of mbs becomes undefined
744 * after a bad character, so reset it.
746 memset(& mbs, 0, sizeof(mbs));
747 /* And warn the user something's wrong */
748 if (do_lint && ! warned) {
750 lintwarn(_("Invalid multibyte data detected. There may be a mismatch between your data and your locale."));
762 (*ptr)[sp - n->stptr] = i;
770 n->wstlen = wsp - n->wstptr;
772 #define ARBITRARY_AMOUNT_TO_GIVE_BACK 100
773 if (n->stlen - n->wstlen > ARBITRARY_AMOUNT_TO_GIVE_BACK)
774 erealloc(n->wstptr, wchar_t *, sizeof(wchar_t) * (n->wstlen + 2), "str2wstr");
779 /* wstr2str --- convert a wide string back into multibyte one */
790 assert(n->valref == 1);
791 assert((n->flags & WSTRCUR) != 0);
794 * Convert the wide chars in n->wstptr back into m.b. chars.
795 * This is pretty grotty, but it's the most straightforward
798 memset(& mbs, 0, sizeof(mbs));
801 emalloc(newval, char *, (length * gawk_mb_cur_max) + 2, "wstr2str");
804 for (cp = newval; length > 0; length--) {
805 result = wcrtomb(cp, *wp, & mbs);
806 if (result == (size_t) -1) /* what to do? break seems best */
815 n->stlen = cp - newval;
820 /* free_wstr --- release the wide string part of a node */
825 assert(n->type == Node_val);
827 if ((n->flags & WSTRCUR) != 0) {
828 assert(n->wstptr != NULL);
833 n->flags &= ~WSTRCUR;
836 static void __attribute__ ((unused))
837 dump_wstr(FILE *fp, const wchar_t *str, size_t len)
839 if (str == NULL || len == 0)
846 /* wstrstr --- walk haystack, looking for needle, wide char version */
849 wstrstr(const wchar_t *haystack, size_t hs_len,
850 const wchar_t *needle, size_t needle_len)
854 if (haystack == NULL || needle == NULL || needle_len > hs_len)
857 for (i = 0; i < hs_len; i++) {
858 if (haystack[i] == needle[0]
859 && i+needle_len-1 < hs_len
860 && haystack[i+needle_len-1] == needle[needle_len-1]) {
861 /* first & last chars match, check string */
862 if (memcmp(haystack+i, needle, sizeof(wchar_t) * needle_len) == 0) {
871 /* wcasestrstr --- walk haystack, nocase look for needle, wide char version */
874 wcasestrstr(const wchar_t *haystack, size_t hs_len,
875 const wchar_t *needle, size_t needle_len)
879 if (haystack == NULL || needle == NULL || needle_len > hs_len)
882 for (i = 0; i < hs_len; i++) {
883 if (towlower(haystack[i]) == towlower(needle[0])
884 && i+needle_len-1 < hs_len
885 && towlower(haystack[i+needle_len-1]) == towlower(needle[needle_len-1])) {
886 /* first & last chars match, check string */
887 const wchar_t *start;
890 for (j = 0; j < needle_len; j++, start++) {
893 h = towlower(*start);
894 n = towlower(needle[j]);
905 #endif /* MBS_SUPPORT */
/* is_ieee_magic_val --- return true for +inf, -inf, +nan, -nan */

static int
is_ieee_magic_val(const char *val)
{
	/*
	 * The comparison is spelled out byte by byte on purpose:
	 * strncasecmp mishandles ASCII bytes in some locales.
	 * The caller has already checked that the length is 4, so
	 * val[0] through val[3] are looked at without a length test.
	 */
	int matches_inf;
	int matches_nan;

	/* A magic value always carries an explicit sign. */
	if (val[0] != '+' && val[0] != '-')
		return 0;

	matches_inf = (val[1] == 'i' || val[1] == 'I')
			&& (val[2] == 'n' || val[2] == 'N')
			&& (val[3] == 'f' || val[3] == 'F');
	if (matches_inf)
		return 1;

	matches_nan = (val[1] == 'n' || val[1] == 'N')
			&& (val[2] == 'a' || val[2] == 'A')
			&& (val[3] == 'n' || val[3] == 'N');

	return matches_nan;
}
925 /* get_ieee_magic_val --- return magic value for string */
928 get_ieee_magic_val(const char *val)
930 static short first = TRUE;
935 AWKNUM v = strtod(val, &ptr);
937 if (val == ptr) { /* Older strtod implementations don't support inf or nan. */
944 v = ((val[1] == 'i' || val[1] == 'I') ? inf : nan);
/* btowc_cache --- btowc() result for each possible byte value, indexed by byte */
wint_t btowc_cache[256];

/* init_btowc_cache --- initialize the cache */

void init_btowc_cache()
{
	int i;
	/*
	 * Take the bound from the array itself so that every slot,
	 * including the last one (byte 0xFF), gets filled in.  The
	 * previous loop stopped at `i < 255' and left entry 255 at its
	 * zero-initialized value instead of btowc(255).
	 */
	const int nentries = (int) (sizeof(btowc_cache) / sizeof(btowc_cache[0]));

	for (i = 0; i < nentries; i++)
		btowc_cache[i] = btowc(i);
}