node.c

   1 /*
   2  * node.c -- routines for node management
   3  */
   4
   5 /*
   6  * Copyright (C) 1986, 1988, 1989, 1991-2001, 2003-2011,
   7  * the Free Software Foundation, Inc.
   8  *
   9  * This file is part of GAWK, the GNU implementation of the
  10  * AWK Programming Language.
  11  *
  12  * GAWK is free software; you can redistribute it and/or modify
  13  * it under the terms of the GNU General Public License as published by
  14  * the Free Software Foundation; either version 3 of the License, or
  15  * (at your option) any later version.
  16  *
  17  * GAWK is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20  * GNU General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU General Public License
  23  * along with this program; if not, write to the Free Software
  24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
  25  */
  26
  27 #include "awk.h"
  28 #include "math.h"
  29
  30 static int is_ieee_magic_val(const char *val);
  31 static AWKNUM get_ieee_magic_val(const char *val);
  32
  33 /* force_number --- force a value to be numeric */
  34
  35 AWKNUM
  36 r_force_number(NODE *n)
  37 {
  38         char *cp;
  39         char *cpend;
  40         char save;
  41         char *ptr;
  42         unsigned int newflags;
  43         extern double strtod();
  44
  45         if (n->flags & NUMCUR)
  46                 return n->numbr;
  47
  48         /* all the conditionals are an attempt to avoid the expensive strtod */
  49
  50         /* Note: only set NUMCUR if we actually convert some digits */
  51
  52         n->numbr = 0.0;
  53
  54         if (n->stlen == 0) {
  55                 return 0.0;
  56         }
  57
  58         cp = n->stptr;
  59         /*
  60          * 2/2007:
  61          * POSIX, by way of severe language lawyering, seems to
  62          * allow things like "inf" and "nan" to mean something.
  63          * So if do_posix, the user gets what he deserves.
  64          * This also allows hexadecimal floating point. Ugh.
  65          */
  66         if (! do_posix) {
  67                 if (isalpha((unsigned char) *cp)) {
  68                         return 0.0;
  69                 } else if (n->stlen == 4 && is_ieee_magic_val(n->stptr)) {
  70                         if (n->flags & MAYBE_NUM)
  71                                 n->flags &= ~MAYBE_NUM;
  72                         n->flags |= NUMBER|NUMCUR;
  73                         n->numbr = get_ieee_magic_val(n->stptr);
  74
  75                         return n->numbr;
  76                 }
  77                 /* else
  78                         fall through */
  79         }
  80         /* else not POSIX, so
  81                 fall through */
  82
  83         cpend = cp + n->stlen;
  84         while (cp < cpend && isspace((unsigned char) *cp))
  85                 cp++;
  86
  87         if (   cp == cpend              /* only spaces, or */
  88             || (! do_posix              /* not POSIXLY paranoid and */
  89                 && (isalpha((unsigned char) *cp)        /* letter, or */
  90                                         /* CANNOT do non-decimal and saw 0x */
  91                     || (! do_non_decimal_data && cp[0] == '0'
  92                         && (cp[1] == 'x' || cp[1] == 'X'))))) {
  93                 return 0.0;
  94         }
  95
  96         if (n->flags & MAYBE_NUM) {
  97                 newflags = NUMBER;
  98                 n->flags &= ~MAYBE_NUM;
  99         } else
 100                 newflags = 0;
 101
 102         if (cpend - cp == 1) {          /* only one character */
 103                 if (isdigit((unsigned char) *cp)) {     /* it's a digit! */
 104                         n->numbr = (AWKNUM)(*cp - '0');
 105                         n->flags |= newflags;
 106                         n->flags |= NUMCUR;
 107                 }
 108                 return n->numbr;
 109         }
 110
 111         if (do_non_decimal_data) {      /* main.c assures false if do_posix */
 112                 errno = 0;
 113                 if (! do_traditional && isnondecimal(cp, TRUE)) {
 114                         n->numbr = nondec2awknum(cp, cpend - cp);
 115                         n->flags |= NUMCUR;
 116                         ptr = cpend;
 117                         goto finish;
 118                 }
 119         }
 120
 121         errno = 0;
 122         save = *cpend;
 123         *cpend = '\0';
 124         n->numbr = (AWKNUM) strtod((const char *) cp, &ptr);
 125
 126         /* POSIX says trailing space is OK for NUMBER */
 127         while (isspace((unsigned char) *ptr))
 128                 ptr++;
 129         *cpend = save;
 130 finish:
 131         if (errno == 0 && ptr == cpend) {
 132                 n->flags |= newflags;
 133                 n->flags |= NUMCUR;
 134         } else {
 135                 errno = 0;
 136         }
 137
 138         return n->numbr;
 139 }
 140
 141
 142 /*
 143  * The following lookup table is used as an optimization in force_string;
 144  * (more complicated) variations on this theme didn't seem to pay off, but
 145  * systematic testing might be in order at some point.
 146  */
 147 static const char *values[] = {
 148         "0",
 149         "1",
 150         "2",
 151         "3",
 152         "4",
 153         "5",
 154         "6",
 155         "7",
 156         "8",
 157         "9",
 158 };
 159 #define NVAL    (sizeof(values)/sizeof(values[0]))
 160
 161 /* format_val --- format a numeric value based on format */
 162
 163 NODE *
 164 format_val(const char *format, int index, NODE *s)
 165 {
 166         char buf[BUFSIZ];
 167         char *sp = buf;
 168         double val;
 169         char *orig, *trans, save;
 170
 171         if (! do_traditional && (s->flags & INTLSTR) != 0) {
 172                 save = s->stptr[s->stlen];
 173                 s->stptr[s->stlen] = '\0';
 174
 175                 orig = s->stptr;
 176                 trans = dgettext(TEXTDOMAIN, orig);
 177
 178                 s->stptr[s->stlen] = save;
 179                 return make_string(trans, strlen(trans));
 180         }
 181
 182         /*
 183          * 2/2007: Simplify our lives here. Instead of worrying about
 184          * whether or not the value will fit into a long just so we
 185          * can use sprintf("%ld", val) on it, always format it ourselves.
 186          * The only thing to worry about is that integral values always
 187          * format as integers. %.0f does that very well.
 188          *
 189          * 6/2008: Would that things were so simple. Always using %.0f
 190          * imposes a notable performance penalty for applications that
 191          * do a lot of conversion of integers to strings. So, we reinstate
 192          * the old code, but use %.0f for integral values that are outside
 193          * the range of a long.  This seems a reasonable compromise.
 194          *
 195          * 12/2009: Use <= and >= in the comparisons with LONG_xxx instead of
 196          * < and > so that things work correctly on systems with 64 bit integers.
 197          */
 198
 199         /* not an integral value, or out of range */
 200         if ((val = double_to_int(s->numbr)) != s->numbr
 201             || val <= LONG_MIN || val >= LONG_MAX) {
 202                 /*
 203                  * Once upon a time, we just blindly did this:
 204                  *      sprintf(sp, format, s->numbr);
 205                  *      s->stlen = strlen(sp);
 206                  *      s->stfmt = (char) index;
 207                  * but that's no good if, e.g., OFMT is %s. So we punt,
 208                  * and just always format the value ourselves.
 209                  */
 210
 211                 NODE *dummy[2], *r;
 212                 unsigned short oflags;
 213                 extern NODE **fmt_list;          /* declared in eval.c */
 214
 215                 /* create dummy node for a sole use of format_tree */
 216                 dummy[1] = s;
 217                 oflags = s->flags;
 218                 if (val == s->numbr) {
 219                         /* integral value, but outside range of %ld, use %.0f */
 220                         r = format_tree("%.0f", 4, dummy, 2);
 221                         s->stfmt = -1;
 222                 } else {
 223                         r = format_tree(format, fmt_list[index]->stlen, dummy, 2);
 224                         assert(r != NULL);
 225                         s->stfmt = (char) index;
 226                 }
 227                 s->flags = oflags;
 228                 s->stlen = r->stlen;
 229                 if ((s->flags & STRCUR) != 0)
 230                         efree(s->stptr);
 231                 s->stptr = r->stptr;
 232                 freenode(r);    /* Do not unref(r)! We want to keep s->stptr == r->stpr.  */
 233
 234                 goto no_malloc;
 235         } else {
 236                 /*
 237                  * integral value
 238                  * force conversion to long only once
 239                  */
 240                 long num = (long) val;
 241
 242                 if (num < NVAL && num >= 0) {
 243                         sp = (char *) values[num];
 244                         s->stlen = 1;
 245                 } else {
 246                         (void) sprintf(sp, "%ld", num);
 247                         s->stlen = strlen(sp);
 248                 }
 249                 s->stfmt = -1;
 250         }
 251         if (s->stptr != NULL)
 252                 efree(s->stptr);
 253         emalloc(s->stptr, char *, s->stlen + 2, "format_val");
 254         memcpy(s->stptr, sp, s->stlen+1);
 255 no_malloc:
 256         s->flags |= STRCUR;
 257         free_wstr(s);
 258         return s;
 259 }
 260
 261 /* force_string --- force a value to be a string */
 262
 263 NODE *
 264 r_force_string(NODE *s)
 265 {
 266         if ((s->flags & STRCUR) != 0
 267                     && (s->stfmt == -1 || s->stfmt == CONVFMTidx)
 268         )
 269                 return s;
 270         return format_val(CONVFMT, CONVFMTidx, s);
 271 }
 272
 273 /* dupnode --- duplicate a node */
 274
 275 NODE *
 276 dupnode(NODE *n)
 277 {
 278         NODE *r;
 279
 280         if (n->type == Node_ahash) {
 281                 n->ahname_ref++;
 282                 return n;
 283         }
 284
 285         assert(n->type == Node_val);
 286
 287         if ((n->flags & PERM) != 0)
 288                 return n;
 289
 290         if ((n->flags & MALLOC) != 0) {
 291                 n->valref++;
 292                 return n;
 293         }
 294
 295         getnode(r);
 296         *r = *n;
 297         r->flags &= ~FIELD;
 298         r->flags |= MALLOC;
 299         r->valref = 1;
 300 #if MBS_SUPPORT
 301         /*
 302          * DON'T call free_wstr(r) here!
 303          * r->wstptr still points at n->wstptr's value, and we
 304          * don't want to free it!
 305          */
 306         r->wstptr = NULL;
 307         r->wstlen = 0;
 308 #endif /* MBS_SUPPORT */
 309
 310         if ((n->flags & STRCUR) != 0) {
 311                 emalloc(r->stptr, char *, n->stlen + 2, "dupnode");
 312                 memcpy(r->stptr, n->stptr, n->stlen);
 313                 r->stptr[n->stlen] = '\0';
 314 #if MBS_SUPPORT
 315                 if ((n->flags & WSTRCUR) != 0) {
 316                         r->wstlen = n->wstlen;
 317                         emalloc(r->wstptr, wchar_t *, sizeof(wchar_t) * (n->wstlen + 2), "dupnode");
 318                         memcpy(r->wstptr, n->wstptr, n->wstlen * sizeof(wchar_t));
 319                         r->wstptr[n->wstlen] = L'\0';
 320                         r->flags |= WSTRCUR;
 321                 }
 322 #endif /* MBS_SUPPORT */
 323         }
 324
 325         return r;
 326 }
 327
 328 /* mk_number --- allocate a node with defined number */
 329
 330 NODE *
 331 mk_number(AWKNUM x, unsigned int flags)
 332 {
 333         NODE *r;
 334
 335         getnode(r);
 336         r->type = Node_val;
 337         r->numbr = x;
 338         r->valref = 1;
 339         r->flags = flags;
 340         r->stptr = NULL;
 341         r->stlen = 0;
 342         free_wstr(r);
 343         return r;
 344 }
 345
 346 /* make_str_node --- make a string node */
 347
 348 NODE *
 349 r_make_str_node(const char *s, unsigned long len, int flags)
 350 {
 351         NODE *r;
 352
 353         getnode(r);
 354         r->type = Node_val;
 355         r->numbr = 0;
 356         r->flags = (STRING|STRCUR|MALLOC);
 357 #if MBS_SUPPORT
 358         r->wstptr = NULL;
 359         r->wstlen = 0;
 360 #endif /* MBS_SUPPORT */
 361
 362         if (flags & ALREADY_MALLOCED)
 363                 r->stptr = (char *) s;
 364         else {
 365                 emalloc(r->stptr, char *, len + 2, "make_str_node");
 366                 memcpy(r->stptr, s, len);
 367         }
 368         r->stptr[len] = '\0';
 369
 370         if ((flags & SCAN) != 0) {      /* scan for escape sequences */
 371                 const char *pf;
 372                 char *ptm;
 373                 int c;
 374                 const char *end;
 375 #if MBS_SUPPORT
 376                 mbstate_t cur_state;
 377
 378                 memset(& cur_state, 0, sizeof(cur_state));
 379 #endif
 380
 381                 end = &(r->stptr[len]);
 382                 for (pf = ptm = r->stptr; pf < end;) {
 383 #if MBS_SUPPORT
 384                         /*
 385                          * Keep multibyte characters together. This avoids
 386                          * problems if a subsequent byte of a multibyte
 387                          * character happens to be a backslash.
 388                          */
 389                         if (gawk_mb_cur_max > 1) {
 390                                 int mblen = mbrlen(pf, end-pf, &cur_state);
 391
 392                                 if (mblen > 1) {
 393                                         int i;
 394
 395                                         for (i = 0; i < mblen; i++)
 396                                                 *ptm++ = *pf++;
 397                                         continue;
 398                                 }
 399                         }
 400 #endif
 401                         c = *pf++;
 402                         if (c == '\\') {
 403                                 c = parse_escape(&pf);
 404                                 if (c < 0) {
 405                                         if (do_lint)
 406                                                 lintwarn(_("backslash at end of string"));
 407                                         c = '\\';
 408                                 }
 409                                 *ptm++ = c;
 410                         } else
 411                                 *ptm++ = c;
 412                 }
 413                 len = ptm - r->stptr;
 414                 erealloc(r->stptr, char *, len + 1, "make_str_node");
 415                 r->stptr[len] = '\0';
 416                 r->flags &= ~MALLOC;
 417                 r->flags |= PERM;
 418         }
 419         r->stlen = len;
 420         r->valref = 1;
 421         r->stfmt = -1;
 422
 423         return r;
 424 }
 425
 426 /* more_nodes --- allocate more nodes */
 427
 428 #define NODECHUNK       100
 429
 430 NODE *nextfree = NULL;
 431
 432 NODE *
 433 more_nodes()
 434 {
 435         NODE *np;
 436
 437         /* get more nodes and initialize list */
 438         emalloc(nextfree, NODE *, NODECHUNK * sizeof(NODE), "more_nodes");
 439         memset(nextfree, 0, NODECHUNK * sizeof(NODE));
 440         for (np = nextfree; np <= &nextfree[NODECHUNK - 1]; np++) {
 441                 np->nextp = np + 1;
 442         }
 443         --np;
 444         np->nextp = NULL;
 445         np = nextfree;
 446         nextfree = nextfree->nextp;
 447         return np;
 448 }
 449
 450 /* unref --- remove reference to a particular node */
 451
 452 void
 453 unref(NODE *tmp)
 454 {
 455         if (tmp == NULL)
 456                 return;
 457         if ((tmp->flags & PERM) != 0)
 458                 return;
 459
 460         if (tmp->type == Node_ahash) {
 461                 if (tmp->ahname_ref > 1)
 462                         tmp->ahname_ref--;
 463                 else {
 464                         efree(tmp->ahname_str);
 465                         freenode(tmp);
 466                 }
 467                 return;
 468         }
 469
 470         if ((tmp->flags & MALLOC) != 0) {
 471                 if (tmp->valref > 1) {
 472                         tmp->valref--;
 473                         return;
 474                 }
 475                 if (tmp->flags & STRCUR)
 476                         efree(tmp->stptr);
 477         }
 478         free_wstr(tmp);
 479         freenode(tmp);
 480 }
 481
 482
 483 /*
 484  * parse_escape:
 485  *
 486  * Parse a C escape sequence.  STRING_PTR points to a variable containing a
 487  * pointer to the string to parse.  That pointer is updated past the
 488  * characters we use.  The value of the escape sequence is returned.
 489  *
 490  * A negative value means the sequence \ newline was seen, which is supposed to
 491  * be equivalent to nothing at all.
 492  *
 493  * If \ is followed by a null character, we return a negative value and leave
 494  * the string pointer pointing at the null character.
 495  *
 496  * If \ is followed by 000, we return 0 and leave the string pointer after the
 497  * zeros.  A value of 0 does not mean end of string.
 498  *
 499  * POSIX doesn't allow \x.
 500  */
 501
 502 int
 503 parse_escape(const char **string_ptr)
 504 {
 505         int c = *(*string_ptr)++;
 506         int i;
 507         int count;
 508         int j;
 509         const char *start;
 510
 511         if (do_lint_old) {
 512                 switch (c) {
 513                 case 'a':
 514                 case 'b':
 515                 case 'f':
 516                 case 'r':
 517                         warning(_("old awk does not support the `\\%c' escape sequence"), c);
 518                         break;
 519                 }
 520         }
 521
 522         switch (c) {
 523         case 'a':
 524                 return '\a';
 525         case 'b':
 526                 return '\b';
 527         case 'f':
 528                 return '\f';
 529         case 'n':
 530                 return '\n';
 531         case 'r':
 532                 return '\r';
 533         case 't':
 534                 return '\t';
 535         case 'v':
 536                 return '\v';
 537         case '\n':
 538                 return -2;
 539         case 0:
 540                 (*string_ptr)--;
 541                 return -1;
 542         case '0':
 543         case '1':
 544         case '2':
 545         case '3':
 546         case '4':
 547         case '5':
 548         case '6':
 549         case '7':
 550                 i = c - '0';
 551                 count = 0;
 552                 while (++count < 3) {
 553                         if ((c = *(*string_ptr)++) >= '0' && c <= '7') {
 554                                 i *= 8;
 555                                 i += c - '0';
 556                         } else {
 557                                 (*string_ptr)--;
 558                                 break;
 559                         }
 560                 }
 561                 return i;
 562         case 'x':
 563                 if (do_lint) {
 564                         static short warned = FALSE;
 565
 566                         if (! warned) {
 567                                 warned = TRUE;
 568                                 lintwarn(_("POSIX does not allow `\\x' escapes"));
 569                         }
 570                 }
 571                 if (do_posix)
 572                         return ('x');
 573                 if (! isxdigit((unsigned char) (*string_ptr)[0])) {
 574                         warning(_("no hex digits in `\\x' escape sequence"));
 575                         return ('x');
 576                 }
 577                 i = j = 0;
 578                 start = *string_ptr;
 579                 for (;; j++) {
 580                         /* do outside test to avoid multiple side effects */
 581                         c = *(*string_ptr)++;
 582                         if (isxdigit(c)) {
 583                                 i *= 16;
 584                                 if (isdigit(c))
 585                                         i += c - '0';
 586                                 else if (isupper(c))
 587                                         i += c - 'A' + 10;
 588                                 else
 589                                         i += c - 'a' + 10;
 590                         } else {
 591                                 (*string_ptr)--;
 592                                 break;
 593                         }
 594                 }
 595                 if (do_lint && j > 2)
 596                         lintwarn(_("hex escape \\x%.*s of %d characters probably not interpreted the way you expect"), j, start, j);
 597                 return i;
 598         case '\\':
 599         case '"':
 600                 return c;
 601         default:
 602         {
 603                 static short warned[256];
 604                 unsigned char uc = (unsigned char) c;
 605
 606                 /* N.B.: use unsigned char here to avoid Latin-1 problems */
 607
 608                 if (! warned[uc]) {
 609                         warned[uc] = TRUE;
 610
 611                         warning(_("escape sequence `\\%c' treated as plain `%c'"), uc, uc);
 612                 }
 613         }
 614                 return c;
 615         }
 616 }
 617
 618 /* isnondecimal --- return true if number is not a decimal number */
 619
 620 int
 621 isnondecimal(const char *str, int use_locale)
 622 {
 623         int dec_point = '.';
 624 #if defined(HAVE_LOCALE_H)
 625         /*
 626          * loc.decimal_point may not have been initialized yet,
 627          * so double check it before using it.
 628          */
 629         if (use_locale && loc.decimal_point != NULL && loc.decimal_point[0] != '\0')
 630                 dec_point = loc.decimal_point[0];       /* XXX --- assumes one char */
 631 #endif
 632
 633         if (str[0] != '0')
 634                 return FALSE;
 635
 636         /* leading 0x or 0X */
 637         if (str[1] == 'x' || str[1] == 'X')
 638                 return TRUE;
 639
 640         /*
 641          * Numbers with '.', 'e', or 'E' are decimal.
 642          * Have to check so that things like 00.34 are handled right.
 643          *
 644          * These beasts can have trailing whitespace. Deal with that too.
 645          */
 646         for (; *str != '\0'; str++) {
 647                 if (*str == 'e' || *str == 'E' || *str == dec_point)
 648                         return FALSE;
 649                 else if (! isdigit((unsigned char) *str))
 650                         break;
 651         }
 652
 653         return TRUE;
 654 }
 655
 656 #if MBS_SUPPORT
 657 /* str2wstr --- convert a multibyte string to a wide string */
 658
 659 NODE *
 660 str2wstr(NODE *n, size_t **ptr)
 661 {
 662         size_t i, count, src_count;
 663         char *sp;
 664         mbstate_t mbs;
 665         wchar_t wc, *wsp;
 666         static short warned = FALSE;
 667
 668         assert((n->flags & (STRING|STRCUR)) != 0);
 669
 670         /*
 671          * Don't convert global null string or global null field
 672          * variables to a wide string. They are both zero-length anyway.
 673          * This also avoids future double-free errors while releasing
 674          * shallow copies, eg. *tmp = *Null_field; free_wstr(tmp);
 675          */
 676         if (n == Nnull_string || n == Null_field)
 677                 return n;
 678
 679         if ((n->flags & WSTRCUR) != 0) {
 680                 if (ptr == NULL)
 681                         return n;
 682                 /* otherwise
 683                         fall through and recompute to fill in the array */
 684                 free_wstr(n);
 685         }
 686
 687         /*
 688          * After consideration and consultation, this
 689          * code trades space for time. We allocate
 690          * an array of wchar_t that is n->stlen long.
 691          * This is needed in the worst case anyway, where
 692          * each input byte maps to one wchar_t.  The
 693          * advantage is that we only have to convert the string
 694          * once, instead of twice, once to find out how many
 695          * wide characters, and then again to actually fill in
 696          * the info.  If there's a lot left over, we can
 697          * realloc the wide string down in size.
 698          */
 699
 700         emalloc(n->wstptr, wchar_t *, sizeof(wchar_t) * (n->stlen + 2), "str2wstr");
 701         wsp = n->wstptr;
 702
 703         /*
 704          * For use by do_match, create and fill in an array.
 705          * For each byte `i' in n->stptr (the original string),
 706          * a[i] is equal to `j', where `j' is the corresponding wchar_t
 707          * in the converted wide string.
 708          *
 709          * Create the array.
 710          */
 711         if (ptr != NULL) {
 712                 emalloc(*ptr, size_t *, sizeof(size_t) * n->stlen, "str2wstr");
 713                 memset(*ptr, 0, sizeof(size_t) * n->stlen);
 714         }
 715
 716         sp = n->stptr;
 717         src_count = n->stlen;
 718         memset(& mbs, 0, sizeof(mbs));
 719         for (i = 0; src_count > 0; i++) {
 720                 /*
 721                  * 9/2010: Check the current byte; if it's a valid character,
 722                  * then it doesn't start a multibyte sequence. This brings a
 723                  * big speed up. Thanks to Ulrich Drepper for the tip.
 724                  * 11/2010: Thanks to Paolo Bonzini for some even faster code.
 725                  */
 726                 if (is_valid_character(*sp)) {
 727                         count = 1;
 728                         wc = btowc_cache(*sp);
 729                 } else
 730                         count = mbrtowc(& wc, sp, src_count, & mbs);
 731                 switch (count) {
 732                 case (size_t) -2:
 733                 case (size_t) -1:
 734                         /*
 735                          * Just skip the bad byte and keep going, so that
 736                          * we get a more-or-less full string, instead of
 737                          * stopping early. This is particularly important
 738                          * for match() where we need to build the indices.
 739                          */
 740                         sp++;
 741                         src_count--;
 742                         /*
 743                          * mbrtowc(3) says the state of mbs becomes undefined
 744                          * after a bad character, so reset it.
 745                          */
 746                         memset(& mbs, 0, sizeof(mbs));
 747                         /* And warn the user something's wrong */
 748                         if (do_lint && ! warned) {
 749                                 warned = TRUE;
 750                                 lintwarn(_("Invalid multibyte data detected. There may be a mismatch between your data and your locale."));
 751                         }
 752                         break;
 753
 754                 case 0:
 755                         count = 1;
 756                         /* fall through */
 757                 default:
 758                         *wsp++ = wc;
 759                         src_count -= count;
 760                         while (count--)  {
 761                                 if (ptr != NULL)
 762                                         (*ptr)[sp - n->stptr] = i;
 763                                 sp++;
 764                         }
 765                         break;
 766                 }
 767         }
 768
 769         *wsp = L'\0';
 770         n->wstlen = wsp - n->wstptr;
 771         n->flags |= WSTRCUR;
 772 #define ARBITRARY_AMOUNT_TO_GIVE_BACK 100
 773         if (n->stlen - n->wstlen > ARBITRARY_AMOUNT_TO_GIVE_BACK)
 774                 erealloc(n->wstptr, wchar_t *, sizeof(wchar_t) * (n->wstlen + 2), "str2wstr");
 775
 776         return n;
 777 }
 778
 779 /* wstr2str --- convert a wide string back into multibyte one */
 780
 781 NODE *
 782 wstr2str(NODE *n)
 783 {
 784         size_t result;
 785         size_t length;
 786         wchar_t *wp;
 787         mbstate_t mbs;
 788         char *newval, *cp;
 789
 790         assert(n->valref == 1);
 791         assert((n->flags & WSTRCUR) != 0);
 792
 793         /*
 794          * Convert the wide chars in t1->wstptr back into m.b. chars.
 795          * This is pretty grotty, but it's the most straightforward
 796          * way to do things.
 797          */
 798         memset(& mbs, 0, sizeof(mbs));
 799
 800         length = n->wstlen;
 801         emalloc(newval, char *, (length * gawk_mb_cur_max) + 2, "wstr2str");
 802
 803         wp = n->wstptr;
 804         for (cp = newval; length > 0; length--) {
 805                 result = wcrtomb(cp, *wp, & mbs);
 806                 if (result == (size_t) -1)      /* what to do? break seems best */
 807                         break;
 808                 cp += result;
 809                 wp++;
 810         }
 811         *cp = '\0';
 812
 813         efree(n->stptr);
 814         n->stptr = newval;
 815         n->stlen = cp - newval;
 816
 817         return n;
 818 }
 819
 820 /* free_wstr --- release the wide string part of a node */
 821
 822 void
 823 free_wstr(NODE *n)
 824 {
 825         assert(n->type == Node_val);
 826
 827         if ((n->flags & WSTRCUR) != 0) {
 828                 assert(n->wstptr != NULL);
 829                 efree(n->wstptr);
 830         }
 831         n->wstptr = NULL;
 832         n->wstlen = 0;
 833         n->flags &= ~WSTRCUR;
 834 }
 835
 836 static void __attribute__ ((unused))
 837 dump_wstr(FILE *fp, const wchar_t *str, size_t len)
 838 {
 839         if (str == NULL || len == 0)
 840                 return;
 841
 842         for (; len--; str++)
 843                 putwc(*str, fp);
 844 }
 845
 846 /* wstrstr --- walk haystack, looking for needle, wide char version */
 847
 848 const wchar_t *
 849 wstrstr(const wchar_t *haystack, size_t hs_len,
 850         const wchar_t *needle, size_t needle_len)
 851 {
 852         size_t i;
 853
 854         if (haystack == NULL || needle == NULL || needle_len > hs_len)
 855                 return NULL;
 856
 857         for (i = 0; i < hs_len; i++) {
 858                 if (haystack[i] == needle[0]
 859                     && i+needle_len-1 < hs_len
 860                     && haystack[i+needle_len-1] == needle[needle_len-1]) {
 861                         /* first & last chars match, check string */
 862                         if (memcmp(haystack+i, needle, sizeof(wchar_t) * needle_len) == 0) {
 863                                 return haystack + i;
 864                         }
 865                 }
 866         }
 867
 868         return NULL;
 869 }
 870
 871 /* wcasestrstr --- walk haystack, nocase look for needle, wide char version */
 872
 873 const wchar_t *
 874 wcasestrstr(const wchar_t *haystack, size_t hs_len,
 875         const wchar_t *needle, size_t needle_len)
 876 {
 877         size_t i, j;
 878
 879         if (haystack == NULL || needle == NULL || needle_len > hs_len)
 880                 return NULL;
 881
 882         for (i = 0; i < hs_len; i++) {
 883                 if (towlower(haystack[i]) == towlower(needle[0])
 884                     && i+needle_len-1 < hs_len
 885                     && towlower(haystack[i+needle_len-1]) == towlower(needle[needle_len-1])) {
 886                         /* first & last chars match, check string */
 887                         const wchar_t *start;
 888
 889                         start = haystack+i;
 890                         for (j = 0; j < needle_len; j++, start++) {
 891                                 wchar_t h, n;
 892
 893                                 h = towlower(*start);
 894                                 n = towlower(needle[j]);
 895                                 if (h != n)
 896                                         goto out;
 897                         }
 898                         return haystack + i;
 899                 }
 900 out:    ;
 901         }
 902
 903         return NULL;
 904 }
 905 #endif /* MBS_SUPPORT */
 906
 907 /* is_ieee_magic_val --- return true for +inf, -inf, +nan, -nan */
 908
 909 static int
 910 is_ieee_magic_val(const char *val)
 911 {
 912         /*
 913          * Avoid strncasecmp: it mishandles ASCII bytes in some locales.
 914          * Assume the length is 4, as the caller checks this.
 915          */
 916         return (   (val[0] == '+' || val[0] == '-')
 917                 && (   (   (val[1] == 'i' || val[1] == 'I')
 918                         && (val[2] == 'n' || val[2] == 'N')
 919                         && (val[3] == 'f' || val[3] == 'F'))
 920                     || (   (val[1] == 'n' || val[1] == 'N')
 921                         && (val[2] == 'a' || val[2] == 'A')
 922                         && (val[3] == 'n' || val[3] == 'N'))));
 923 }
 924
 925 /* get_ieee_magic_val --- return magic value for string */
 926
 927 static AWKNUM
 928 get_ieee_magic_val(const char *val)
 929 {
 930         static short first = TRUE;
 931         static AWKNUM inf;
 932         static AWKNUM nan;
 933
 934         char *ptr;
 935         AWKNUM v = strtod(val, &ptr);
 936
 937         if (val == ptr) { /* Older strtod implementations don't support inf or nan. */
 938                 if (first) {
 939                         first = FALSE;
 940                         nan = sqrt(-1.0);
 941                         inf = -log(0.0);
 942                 }
 943
 944                 v = ((val[1] == 'i' || val[1] == 'I') ? inf : nan);
 945                 if (val[0] == '-')
 946                         v = -v;
 947         }
 948
 949         return v;
 950 }
 951
 952 #if MBS_SUPPORT
 953 wint_t btowc_cache[256];
 954
 955 /* init_btowc_cache --- initialize the cache */
 956
 957 void init_btowc_cache()
 958 {
 959         int i;
 960
 961         for (i = 0; i < 255; i++) {
 962                 btowc_cache[i] = btowc(i);
 963         }
 964 }
 965 #endif