field.c

   1 /*
   2  * field.c - routines for dealing with fields and record parsing
   3  */
   4
   5 /*
   6  * Copyright (C) 1986, 1988, 1989, 1991-2011 the Free Software Foundation, Inc.
   7  *
   8  * This file is part of GAWK, the GNU implementation of the
   9  * AWK Programming Language.
  10  *
  11  * GAWK is free software; you can redistribute it and/or modify
  12  * it under the terms of the GNU General Public License as published by
  13  * the Free Software Foundation; either version 3 of the License, or
  14  * (at your option) any later version.
  15  *
  16  * GAWK is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19  * GNU General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU General Public License
  22  * along with this program; if not, write to the Free Software
  23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
  24  */
  25
  26 #include "awk.h"
  27
  28 /*
  29  * In case that the system doesn't have isblank().
  30  * Don't bother with autoconf ifdef junk, just force it.
  31  * See dfa.c and regex_internal.h and regcomp.c. Bleah.
  32  */
  33 static int
  34 is_blank(int c)
  35 {
  36         return c == ' ' || c == '\t';
  37 }
  38
  39 typedef void (* Setfunc)(long, char *, long, NODE *);
  40
  41 static long (*parse_field)(long, char **, int, NODE *,
  42                              Regexp *, Setfunc, NODE *, NODE *, int);
  43 static void rebuild_record(void);
  44 static long re_parse_field(long, char **, int, NODE *,
  45                              Regexp *, Setfunc, NODE *, NODE *, int);
  46 static long def_parse_field(long, char **, int, NODE *,
  47                               Regexp *, Setfunc, NODE *, NODE *, int);
  48 static long posix_def_parse_field(long, char **, int, NODE *,
  49                               Regexp *, Setfunc, NODE *, NODE *, int);
  50 static long null_parse_field(long, char **, int, NODE *,
  51                              Regexp *, Setfunc, NODE *, NODE *, int);
  52 static long sc_parse_field(long, char **, int, NODE *,
  53                              Regexp *, Setfunc, NODE *, NODE *, int);
  54 static long fw_parse_field(long, char **, int, NODE *,
  55                              Regexp *, Setfunc, NODE *, NODE *, int);
  56 static long fpat_parse_field(long, char **, int, NODE *,
  57                              Regexp *, Setfunc, NODE *, NODE *, int);
  58 static void set_element(long num, char * str, long len, NODE *arr);
  59 static void grow_fields_arr(long num);
  60 static void set_field(long num, char *str, long len, NODE *dummy);
  61
  62 static char *parse_extent;      /* marks where to restart parse of record */
  63 static long parse_high_water = 0; /* field number that we have parsed so far */
  64 static long nf_high_water = 0;  /* size of fields_arr */
  65 static int resave_fs;
  66 static NODE *save_FS;           /* save current value of FS when line is read,
  67                                  * to be used in deferred parsing
  68                                  */
  69 static int *FIELDWIDTHS = NULL;
  70
  71 NODE **fields_arr;              /* array of pointers to the field nodes */
  72 int field0_valid;               /* $(>0) has not been changed yet */
  73 int default_FS;                 /* TRUE when FS == " " */
  74 Regexp *FS_re_yes_case = NULL;
  75 Regexp *FS_re_no_case = NULL;
  76 Regexp *FS_regexp = NULL;
  77 Regexp *FPAT_re_yes_case = NULL;
  78 Regexp *FPAT_re_no_case = NULL;
  79 Regexp *FPAT_regexp = NULL;
  80 NODE *Null_field = NULL;
  81
  82 /* init_fields --- set up the fields array to start with */
  83
  84 void
  85 init_fields()
  86 {
  87         emalloc(fields_arr, NODE **, sizeof(NODE *), "init_fields");
  88         fields_arr[0] = Nnull_string;
  89         parse_extent = fields_arr[0]->stptr;
  90         save_FS = dupnode(FS_node->var_value);
  91         getnode(Null_field);
  92         *Null_field = *Nnull_string;
  93         Null_field->flags |= FIELD;
  94         Null_field->flags &= ~(NUMCUR|NUMBER|MAYBE_NUM|PERM|MALLOC);
  95         field0_valid = TRUE;
  96 }
  97
  98 /* grow_fields --- acquire new fields as needed */
  99
 100 static void
 101 grow_fields_arr(long num)
 102 {
 103         int t;
 104         NODE *n;
 105
 106         erealloc(fields_arr, NODE **, (num + 1) * sizeof(NODE *), "grow_fields_arr");
 107         for (t = nf_high_water + 1; t <= num; t++) {
 108                 getnode(n);
 109                 *n = *Null_field;
 110                 fields_arr[t] = n;
 111         }
 112         nf_high_water = num;
 113 }
 114
 115 /* set_field --- set the value of a particular field */
 116
 117 /*ARGSUSED*/
 118 static void
 119 set_field(long num,
 120         char *str,
 121         long len,
 122         NODE *dummy ATTRIBUTE_UNUSED)   /* just to make interface same as set_element */
 123 {
 124         NODE *n;
 125
 126         if (num > nf_high_water)
 127                 grow_fields_arr(num);
 128         n = fields_arr[num];
 129         n->stptr = str;
 130         n->stlen = len;
 131         n->flags = (STRCUR|STRING|MAYBE_NUM|FIELD);
 132 }
 133
 134 /* rebuild_record --- Someone assigned a value to $(something).
 135                         Fix up $0 to be right */
 136
 137 static void
 138 rebuild_record()
 139 {
 140         /*
 141          * use explicit unsigned longs for lengths, in case
 142          * a size_t isn't big enough.
 143          */
 144         unsigned long tlen;
 145         unsigned long ofslen;
 146         NODE *tmp;
 147         NODE *ofs;
 148         char *ops;
 149         char *cops;
 150         long i;
 151
 152         assert(NF != -1);
 153
 154         tlen = 0;
 155         ofs = force_string(OFS_node->var_value);
 156         ofslen = ofs->stlen;
 157         for (i = NF; i > 0; i--) {
 158                 tmp = fields_arr[i];
 159                 tmp = force_string(tmp);
 160                 tlen += tmp->stlen;
 161         }
 162         tlen += (NF - 1) * ofslen;
 163         if ((long) tlen < 0)
 164                 tlen = 0;
 165         emalloc(ops, char *, tlen + 2, "rebuild_record");
 166         cops = ops;
 167         ops[0] = '\0';
 168         for (i = 1;  i <= NF; i++) {
 169                 free_wstr(fields_arr[i]);
 170                 tmp = fields_arr[i];
 171                 /* copy field */
 172                 if (tmp->stlen == 1)
 173                         *cops++ = tmp->stptr[0];
 174                 else if (tmp->stlen != 0) {
 175                         memcpy(cops, tmp->stptr, tmp->stlen);
 176                         cops += tmp->stlen;
 177                 }
 178                 /* copy OFS */
 179                 if (i != NF) {
 180                         if (ofslen == 1)
 181                                 *cops++ = ofs->stptr[0];
 182                         else if (ofslen != 0) {
 183                                 memcpy(cops, ofs->stptr, ofslen);
 184                                 cops += ofslen;
 185                         }
 186                 }
 187         }
 188         tmp = make_str_node(ops, tlen, ALREADY_MALLOCED);
 189
 190         /*
 191          * Since we are about to unref fields_arr[0], we want to find
 192          * any fields that still point into it, and have them point
 193          * into the new field zero.  This has to be done intelligently,
 194          * so that unrefing a field doesn't try to unref into the old $0.
 195          */
 196         for (cops = ops, i = 1; i <= NF; i++) {
 197                 if (fields_arr[i]->stlen > 0) {
 198                         NODE *n;
 199                         getnode(n);
 200
 201                         if ((fields_arr[i]->flags & FIELD) == 0) {
 202                                 *n = *Null_field;
 203                                 n->stlen = fields_arr[i]->stlen;
 204                                 if ((fields_arr[i]->flags & (NUMCUR|NUMBER)) != 0) {
 205                                         n->flags |= (fields_arr[i]->flags & (NUMCUR|NUMBER));
 206                                         n->numbr = fields_arr[i]->numbr;
 207                                 }
 208                         } else {
 209                                 *n = *(fields_arr[i]);
 210                                 n->flags &= ~(MALLOC|PERM|STRING);
 211                         }
 212
 213                         n->stptr = cops;
 214                         unref(fields_arr[i]);
 215                         fields_arr[i] = n;
 216                         assert((n->flags & WSTRCUR) == 0);
 217                 }
 218                 cops += fields_arr[i]->stlen + ofslen;
 219         }
 220
 221         unref(fields_arr[0]);
 222
 223         fields_arr[0] = tmp;
 224         field0_valid = TRUE;
 225 }
 226
 227 /*
 228  * set_record:
 229  * setup $0, but defer parsing rest of line until reference is made to $(>0)
 230  * or to NF.  At that point, parse only as much as necessary.
 231  *
 232  * Manage a private buffer for the contents of $0.  Doing so keeps us safe
 233  * if `getline var' decides to rearrange the contents of the IOBUF that
 234  * $0 might have been pointing into.  The cost is the copying of the buffer;
 235  * but better correct than fast.
 236  */
 237 void
 238 set_record(const char *buf, int cnt)
 239 {
 240         NODE *n;
 241         static char *databuf;
 242         static unsigned long databuf_size;
 243 #define INITIAL_SIZE    512
 244 #define MAX_SIZE        ((unsigned long) ~0)    /* maximally portable ... */
 245
 246         reset_record();
 247
 248         /* buffer management: */
 249         if (databuf_size == 0) {        /* first time */
 250                 emalloc(databuf, char *, INITIAL_SIZE, "set_record");
 251                 databuf_size = INITIAL_SIZE;
 252                 memset(databuf, '\0', INITIAL_SIZE);
 253
 254         }
 255         /*
 256          * Make sure there's enough room. Since we sometimes need
 257          * to place a sentinel at the end, we make sure
 258          * databuf_size is > cnt after allocation.
 259          */
 260         if (cnt >= databuf_size) {
 261                 while (cnt >= databuf_size && databuf_size <= MAX_SIZE)
 262                         databuf_size *= 2;
 263                 erealloc(databuf, char *, databuf_size, "set_record");
 264                 memset(databuf, '\0', databuf_size);
 265         }
 266         /* copy the data */
 267         memcpy(databuf, buf, cnt);
 268
 269         /* manage field 0: */
 270         unref(fields_arr[0]);
 271         getnode(n);
 272         n->stptr = databuf;
 273         n->stlen = cnt;
 274         n->valref = 1;
 275         n->type = Node_val;
 276         n->stfmt = -1;
 277         n->flags = (STRING|STRCUR|MAYBE_NUM|FIELD);
 278         fields_arr[0] = n;
 279
 280 #undef INITIAL_SIZE
 281 #undef MAX_SIZE
 282 }
 283
 284 /* reset_record --- start over again with current $0 */
 285
 286 void
 287 reset_record()
 288 {
 289         int i;
 290         NODE *n;
 291
 292         (void) force_string(fields_arr[0]);
 293
 294         NF = -1;
 295         for (i = 1; i <= parse_high_water; i++) {
 296                 unref(fields_arr[i]);
 297                 getnode(n);
 298                 *n = *Null_field;
 299                 fields_arr[i] = n;
 300         }
 301
 302         parse_high_water = 0;
 303         /*
 304          * $0 = $0 should resplit using the current value of FS.
 305          */
 306         if (resave_fs) {
 307                 resave_fs = FALSE;
 308                 unref(save_FS);
 309                 save_FS = dupnode(FS_node->var_value);
 310         }
 311
 312         field0_valid = TRUE;
 313 }
 314
 315 /* set_NF --- handle what happens to $0 and fields when NF is changed */
 316
 317 void
 318 set_NF()
 319 {
 320         int i;
 321         long nf;
 322         NODE *n;
 323
 324         assert(NF != -1);
 325
 326         nf = (long) force_number(NF_node->var_value);
 327         if (nf < 0)
 328                 fatal(_("NF set to negative value"));
 329         NF = nf;
 330
 331         if (NF > nf_high_water)
 332                 grow_fields_arr(NF);
 333         if (parse_high_water < NF) {
 334                 for (i = parse_high_water + 1; i >= 0 && i <= NF; i++) {
 335                         unref(fields_arr[i]);
 336                         getnode(n);
 337                         *n = *Null_field;
 338                         fields_arr[i] = n;
 339                 }
 340         } else if (parse_high_water > 0) {
 341                 for (i = NF + 1; i >= 0 && i <= parse_high_water; i++) {
 342                         unref(fields_arr[i]);
 343                         getnode(n);
 344                         *n = *Null_field;
 345                         fields_arr[i] = n;
 346                 }
 347                 parse_high_water = NF;
 348         }
 349         field0_valid = FALSE;
 350 }
 351
 352 /*
 353  * re_parse_field --- parse fields using a regexp.
 354  *
 355  * This is called both from get_field() and from do_split()
 356  * via (*parse_field)().  This variation is for when FS is a regular
 357  * expression -- either user-defined or because RS=="" and FS==" "
 358  */
 359 static long
 360 re_parse_field(long up_to,      /* parse only up to this field number */
 361         char **buf,     /* on input: string to parse; on output: point to start next */
 362         int len,
 363         NODE *fs ATTRIBUTE_UNUSED,
 364         Regexp *rp,
 365         Setfunc set,    /* routine to set the value of the parsed field */
 366         NODE *n,
 367         NODE *sep_arr,  /* array of field separators (maybe NULL) */
 368         int in_middle)
 369 {
 370         char *scan = *buf;
 371         long nf = parse_high_water;
 372         char *field;
 373         char *end = scan + len;
 374         int regex_flags = RE_NEED_START;
 375         char *sep;
 376 #if MBS_SUPPORT
 377         size_t mbclen = 0;
 378         mbstate_t mbs;
 379         if (gawk_mb_cur_max > 1)
 380                 memset(&mbs, 0, sizeof(mbstate_t));
 381 #endif
 382
 383         if (in_middle)
 384                 regex_flags |= RE_NO_BOL;
 385
 386         if (up_to == UNLIMITED)
 387                 nf = 0;
 388         if (len == 0)
 389                 return nf;
 390
 391         if (RS_is_null && default_FS) {
 392                 sep = scan;
 393                 while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
 394                         scan++;
 395                 if (sep_arr != NULL && sep < scan)
 396                         set_element(nf, sep, (long)(scan - sep), sep_arr);
 397         }
 398
 399         if (rp == NULL) /* use FS */
 400                 rp = FS_regexp;
 401
 402         field = scan;
 403         while (scan < end
 404                && research(rp, scan, 0, (end - scan), regex_flags) != -1
 405                && nf < up_to) {
 406                 regex_flags |= RE_NO_BOL;
 407                 if (REEND(rp, scan) == RESTART(rp, scan)) {   /* null match */
 408 #if MBS_SUPPORT
 409                         if (gawk_mb_cur_max > 1)        {
 410                                 mbclen = mbrlen(scan, end-scan, &mbs);
 411                                 if ((mbclen == 1) || (mbclen == (size_t) -1)
 412                                         || (mbclen == (size_t) -2) || (mbclen == 0)) {
 413                                         /* We treat it as a singlebyte character.  */
 414                                         mbclen = 1;
 415                                 }
 416                                 scan += mbclen;
 417                         } else
 418 #endif
 419                         scan++;
 420                         if (scan == end) {
 421                                 (*set)(++nf, field, (long)(scan - field), n);
 422                                 up_to = nf;
 423                                 break;
 424                         }
 425                         continue;
 426                 }
 427                 (*set)(++nf, field,
 428                        (long)(scan + RESTART(rp, scan) - field), n);
 429                 if (sep_arr != NULL)
 430                         set_element(nf, scan + RESTART(rp, scan),
 431                                 (long) (REEND(rp, scan) - RESTART(rp, scan)), sep_arr);
 432                 scan += REEND(rp, scan);
 433                 field = scan;
 434                 if (scan == end)        /* FS at end of record */
 435                         (*set)(++nf, field, 0L, n);
 436         }
 437         if (nf != up_to && scan < end) {
 438                 (*set)(++nf, scan, (long)(end - scan), n);
 439                 scan = end;
 440         }
 441         *buf = scan;
 442         return nf;
 443 }
 444
 445 /*
 446  * def_parse_field --- default field parsing.
 447  *
 448  * This is called both from get_field() and from do_split()
 449  * via (*parse_field)().  This variation is for when FS is a single space
 450  * character.
 451  */
 452
 453 static long
 454 def_parse_field(long up_to,     /* parse only up to this field number */
 455         char **buf,     /* on input: string to parse; on output: point to start next */
 456         int len,
 457         NODE *fs,
 458         Regexp *rp ATTRIBUTE_UNUSED,
 459         Setfunc set,    /* routine to set the value of the parsed field */
 460         NODE *n,
 461         NODE *sep_arr,  /* array of field separators (maybe NULL) */
 462         int in_middle ATTRIBUTE_UNUSED)
 463 {
 464         char *scan = *buf;
 465         long nf = parse_high_water;
 466         char *field;
 467         char *end = scan + len;
 468         char sav;
 469         char *sep;
 470
 471         if (up_to == UNLIMITED)
 472                 nf = 0;
 473         if (len == 0)
 474                 return nf;
 475
 476         /*
 477          * Nasty special case. If FS set to "", return whole record
 478          * as first field. This is not worth a separate function.
 479          */
 480         if (fs->stlen == 0) {
 481                 (*set)(++nf, *buf, len, n);
 482                 *buf += len;
 483                 return nf;
 484         }
 485
 486         /* before doing anything save the char at *end */
 487         sav = *end;
 488         /* because it will be destroyed now: */
 489
 490         *end = ' ';     /* sentinel character */
 491         sep = scan;
 492         for (; nf < up_to; scan++) {
 493                 /*
 494                  * special case:  fs is single space, strip leading whitespace
 495                  */
 496                 while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
 497                         scan++;
 498
 499                 if (sep_arr != NULL && scan > sep)
 500                         set_element(nf, sep, (long) (scan - sep), sep_arr);
 501
 502                 if (scan >= end)
 503                         break;
 504
 505                 field = scan;
 506
 507                 while (*scan != ' ' && *scan != '\t' && *scan != '\n')
 508                         scan++;
 509
 510                 (*set)(++nf, field, (long)(scan - field), n);
 511
 512                 if (scan == end)
 513                         break;
 514
 515                 sep = scan;
 516         }
 517
 518         /* everything done, restore original char at *end */
 519         *end = sav;
 520
 521         *buf = scan;
 522         return nf;
 523 }
 524
 525 /*
 526  * posix_def_parse_field --- default field parsing.
 527  *
 528  * This is called both from get_field() and from do_split()
 529  * via (*parse_field)().  This variation is for when FS is a single space
 530  * character.  The only difference between this and def_parse_field()
 531  * is that this one does not allow newlines to separate fields.
 532  */
 533
 534 static long
 535 posix_def_parse_field(long up_to,       /* parse only up to this field number */
 536         char **buf,     /* on input: string to parse; on output: point to start next */
 537         int len,
 538         NODE *fs,
 539         Regexp *rp ATTRIBUTE_UNUSED,
 540         Setfunc set,    /* routine to set the value of the parsed field */
 541         NODE *n,
 542         NODE *dummy ATTRIBUTE_UNUSED, /* sep_arr not needed here: hence dummy */
 543         int in_middle ATTRIBUTE_UNUSED)
 544 {
 545         char *scan = *buf;
 546         long nf = parse_high_water;
 547         char *field;
 548         char *end = scan + len;
 549         char sav;
 550
 551         if (up_to == UNLIMITED)
 552                 nf = 0;
 553         if (len == 0)
 554                 return nf;
 555
 556         /*
 557          * Nasty special case. If FS set to "", return whole record
 558          * as first field. This is not worth a separate function.
 559          */
 560         if (fs->stlen == 0) {
 561                 (*set)(++nf, *buf, len, n);
 562                 *buf += len;
 563                 return nf;
 564         }
 565
 566         /* before doing anything save the char at *end */
 567         sav = *end;
 568         /* because it will be destroyed now: */
 569
 570         *end = ' ';     /* sentinel character */
 571         for (; nf < up_to; scan++) {
 572                 /*
 573                  * special case:  fs is single space, strip leading whitespace
 574                  */
 575                 while (scan < end && (*scan == ' ' || *scan == '\t'))
 576                         scan++;
 577                 if (scan >= end)
 578                         break;
 579                 field = scan;
 580                 while (*scan != ' ' && *scan != '\t')
 581                         scan++;
 582                 (*set)(++nf, field, (long)(scan - field), n);
 583                 if (scan == end)
 584                         break;
 585         }
 586
 587         /* everything done, restore original char at *end */
 588         *end = sav;
 589
 590         *buf = scan;
 591         return nf;
 592 }
 593
 594 /*
 595  * null_parse_field --- each character is a separate field
 596  *
 597  * This is called both from get_field() and from do_split()
 598  * via (*parse_field)().  This variation is for when FS is the null string.
 599  */
 600 static long
 601 null_parse_field(long up_to,    /* parse only up to this field number */
 602         char **buf,     /* on input: string to parse; on output: point to start next */
 603         int len,
 604         NODE *fs ATTRIBUTE_UNUSED,
 605         Regexp *rp ATTRIBUTE_UNUSED,
 606         Setfunc set,    /* routine to set the value of the parsed field */
 607         NODE *n,
 608         NODE *sep_arr,  /* array of field separators (maybe NULL) */
 609         int in_middle ATTRIBUTE_UNUSED)
 610 {
 611         char *scan = *buf;
 612         long nf = parse_high_water;
 613         char *end = scan + len;
 614
 615         if (up_to == UNLIMITED)
 616                 nf = 0;
 617         if (len == 0)
 618                 return nf;
 619
 620 #if MBS_SUPPORT
 621         if (gawk_mb_cur_max > 1) {
 622                 mbstate_t mbs;
 623                 memset(&mbs, 0, sizeof(mbstate_t));
 624                 for (; nf < up_to && scan < end;) {
 625                         size_t mbclen = mbrlen(scan, end-scan, &mbs);
 626                         if ((mbclen == 1) || (mbclen == (size_t) -1)
 627                                 || (mbclen == (size_t) -2) || (mbclen == 0)) {
 628                                 /* We treat it as a singlebyte character.  */
 629                                 mbclen = 1;
 630                         }
 631                         if (sep_arr != NULL && nf > 0)
 632                                 set_element(nf, scan, 0L, sep_arr);
 633                         (*set)(++nf, scan, mbclen, n);
 634                         scan += mbclen;
 635                 }
 636         } else
 637 #endif
 638         for (; nf < up_to && scan < end; scan++) {
 639                 if (sep_arr != NULL && nf > 0)
 640                         set_element(nf, scan, 0L, sep_arr);
 641                 (*set)(++nf, scan, 1L, n);
 642         }
 643
 644         *buf = scan;
 645         return nf;
 646 }
 647
 648 /*
 649  * sc_parse_field --- single character field separator
 650  *
 651  * This is called both from get_field() and from do_split()
 652  * via (*parse_field)().  This variation is for when FS is a single character
 653  * other than space.
 654  */
 655 static long
 656 sc_parse_field(long up_to,      /* parse only up to this field number */
 657         char **buf,     /* on input: string to parse; on output: point to start next */
 658         int len,
 659         NODE *fs,
 660         Regexp *rp ATTRIBUTE_UNUSED,
 661         Setfunc set,    /* routine to set the value of the parsed field */
 662         NODE *n,
 663         NODE *sep_arr,  /* array of field separators (maybe NULL) */
 664         int in_middle ATTRIBUTE_UNUSED)
 665 {
 666         char *scan = *buf;
 667         char fschar;
 668         long nf = parse_high_water;
 669         char *field;
 670         char *end = scan + len;
 671         char sav;
 672 #if MBS_SUPPORT
 673         size_t mbclen = 0;
 674         mbstate_t mbs;
 675         if (gawk_mb_cur_max > 1)
 676                 memset(&mbs, 0, sizeof(mbstate_t));
 677 #endif
 678
 679         if (up_to == UNLIMITED)
 680                 nf = 0;
 681         if (len == 0)
 682                 return nf;
 683
 684         if (RS_is_null && fs->stlen == 0)
 685                 fschar = '\n';
 686         else
 687                 fschar = fs->stptr[0];
 688
 689         /* before doing anything save the char at *end */
 690         sav = *end;
 691         /* because it will be destroyed now: */
 692         *end = fschar;  /* sentinel character */
 693
 694         for (; nf < up_to;) {
 695                 field = scan;
 696 #if MBS_SUPPORT
 697                 if (gawk_mb_cur_max > 1) {
 698                         while (*scan != fschar) {
 699                                 mbclen = mbrlen(scan, end-scan, &mbs);
 700                                 if ((mbclen == 1) || (mbclen == (size_t) -1)
 701                                         || (mbclen == (size_t) -2) || (mbclen == 0)) {
 702                                         /* We treat it as a singlebyte character.  */
 703                                         mbclen = 1;
 704                                 }
 705                                 scan += mbclen;
 706                         }
 707                 } else
 708 #endif
 709                 while (*scan != fschar)
 710                         scan++;
 711                 (*set)(++nf, field, (long)(scan - field), n);
 712                 if (scan == end)
 713                         break;
 714                 if (sep_arr != NULL)
 715                         set_element(nf, scan, 1L, sep_arr);
 716                 scan++;
 717                 if (scan == end) {      /* FS at end of record */
 718                         (*set)(++nf, field, 0L, n);
 719                         break;
 720                 }
 721         }
 722
 723         /* everything done, restore original char at *end */
 724         *end = sav;
 725
 726         *buf = scan;
 727         return nf;
 728 }
 729
 730 /*
 731  * fw_parse_field --- field parsing using FIELDWIDTHS spec
 732  *
 733  * This is called from get_field() via (*parse_field)().
 734  * This variation is for fields are fixed widths.
 735  */
 736 static long
 737 fw_parse_field(long up_to,      /* parse only up to this field number */
 738         char **buf,     /* on input: string to parse; on output: point to start next */
 739         int len,
 740         NODE *fs ATTRIBUTE_UNUSED,
 741         Regexp *rp ATTRIBUTE_UNUSED,
 742         Setfunc set,    /* routine to set the value of the parsed field */
 743         NODE *n,
 744         NODE *dummy ATTRIBUTE_UNUSED, /* sep_arr not needed here: hence dummy */
 745         int in_middle ATTRIBUTE_UNUSED)
 746 {
 747         char *scan = *buf;
 748         long nf = parse_high_water;
 749         char *end = scan + len;
 750 #if MBS_SUPPORT
 751         int nmbc;
 752         size_t mbclen;
 753         size_t mbslen;
 754         size_t lenrest;
 755         char *mbscan;
 756         mbstate_t mbs;
 757
 758         memset(&mbs, 0, sizeof(mbstate_t));
 759 #endif
 760
 761         if (up_to == UNLIMITED)
 762                 nf = 0;
 763         if (len == 0)
 764                 return nf;
 765         for (; nf < up_to && (len = FIELDWIDTHS[nf+1]) != -1; ) {
 766 #if MBS_SUPPORT
 767                 if (gawk_mb_cur_max > 1) {
 768                         nmbc = 0;
 769                         mbslen = 0;
 770                         mbscan = scan;
 771                         lenrest = end - scan;
 772                         while (nmbc < len && mbslen < lenrest) {
 773                                 mbclen = mbrlen(mbscan, end - mbscan, &mbs);
 774                                 if (   mbclen == 1
 775                                     || mbclen == (size_t) -1
 776                                     || mbclen == (size_t) -2
 777                                     || mbclen == 0) {
 778                                         /* We treat it as a singlebyte character.  */
 779                                         mbclen = 1;
 780                                 }
 781                                 if (mbclen <= end - mbscan) {
 782                                         mbscan += mbclen;
 783                                         mbslen += mbclen;
 784                                         ++nmbc;
 785                                 }
 786                         }
 787                         (*set)(++nf, scan, (long) mbslen, n);
 788                         scan += mbslen;
 789                 }
 790                 else
 791 #endif
 792                 {
 793                         if (len > end - scan)
 794                                 len = end - scan;
 795                         (*set)(++nf, scan, (long) len, n);
 796                         scan += len;
 797                 }
 798         }
 799         if (len == -1)
 800                 *buf = end;
 801         else
 802                 *buf = scan;
 803         return nf;
 804 }
 805
 806 /* invalidate_field0 --- $0 needs reconstruction */
 807
 808 void
 809 invalidate_field0()
 810 {
 811         field0_valid = FALSE;
 812 }
 813
 814 /* get_field --- return a particular $n */
 815
 816 /* assign is not NULL if this field is on the LHS of an assign */
 817
 818 NODE **
 819 get_field(long requested, Func_ptr *assign)
 820 {
 821         int in_middle = FALSE;
 822         /*
 823          * if requesting whole line but some other field has been altered,
 824          * then the whole line must be rebuilt
 825          */
 826         if (requested == 0) {
 827                 if (! field0_valid) {
 828                         /* first, parse remainder of input record */
 829                         if (NF == -1) {
 830                                 NF = (*parse_field)(UNLIMITED - 1, &parse_extent,
 831                                         fields_arr[0]->stlen -
 832                                         (parse_extent - fields_arr[0]->stptr),
 833                                         save_FS, FS_regexp, set_field,
 834                                         (NODE *) NULL,
 835                                         (NODE *) NULL,
 836                                         in_middle);
 837                                 parse_high_water = NF;
 838                         }
 839                         rebuild_record();
 840                 }
 841                 if (assign != NULL)
 842                         *assign = reset_record;
 843                 return &fields_arr[0];
 844         }
 845
 846         /* assert(requested > 0); */
 847
 848 #if 0
 849         if (assign != NULL)
 850                 field0_valid = FALSE;           /* $0 needs reconstruction */
 851 #else
 852         /*
 853          * Keep things uniform. Also, mere intention of assigning something
 854          * to $n should not make $0 invalid. Makes sense to invalidate $0
 855          * after the actual assignment is performed. Not a real issue in
 856          * the interpreter otherwise, but causes problem in the
 857          * debugger when watching or printing fields.
 858          */
 859
 860         if (assign != NULL)
 861                 *assign = invalidate_field0;    /* $0 needs reconstruction */
 862 #endif
 863
 864         if (requested <= parse_high_water)      /* already parsed this field */
 865                 return &fields_arr[requested];
 866
 867         if (NF == -1) { /* have not yet parsed to end of record */
 868                 /*
 869                  * parse up to requested fields, calling set_field() for each,
 870                  * saving in parse_extent the point where the parse left off
 871                  */
 872                 if (parse_high_water == 0)      /* starting at the beginning */
 873                         parse_extent = fields_arr[0]->stptr;
 874                 else
 875                         in_middle = TRUE;
 876                 parse_high_water = (*parse_field)(requested, &parse_extent,
 877                      fields_arr[0]->stlen - (parse_extent - fields_arr[0]->stptr),
 878                      save_FS, NULL, set_field, (NODE *) NULL, (NODE *) NULL, in_middle);
 879
 880                 /*
 881                  * if we reached the end of the record, set NF to the number of
 882                  * fields so far.  Note that requested might actually refer to
 883                  * a field that is beyond the end of the record, but we won't
 884                  * set NF to that value at this point, since this is only a
 885                  * reference to the field and NF only gets set if the field
 886                  * is assigned to -- this case is handled below
 887                  */
 888                 if (parse_extent == fields_arr[0]->stptr + fields_arr[0]->stlen)
 889                         NF = parse_high_water;
 890                 else if (parse_field == fpat_parse_field) {
 891                         /* FPAT parsing is wierd, isolate the special cases */
 892                         char *rec_start = fields_arr[0]->stptr;
 893                         char *rec_end = fields_arr[0]->stptr + fields_arr[0]->stlen;
 894
 895                         if (    parse_extent > rec_end
 896                             || (parse_extent > rec_start && parse_extent < rec_end && requested == UNLIMITED-1))
 897                                 NF = parse_high_water;
 898                         else if (parse_extent == rec_start) /* could be no match for FPAT */
 899                                 NF = 0;
 900                 }
 901                 if (requested == UNLIMITED - 1) /* UNLIMITED-1 means set NF */
 902                         requested = parse_high_water;
 903         }
 904         if (parse_high_water < requested) { /* requested beyond end of record */
 905                 if (assign != NULL) {   /* expand record */
 906                         if (requested > nf_high_water)
 907                                 grow_fields_arr(requested);
 908
 909                         NF = requested;
 910                         parse_high_water = requested;
 911                 } else
 912                         return &Null_field;
 913         }
 914
 915         return &fields_arr[requested];
 916 }
 917
 918 /* set_element --- set an array element, used by do_split() */
 919
 920 static void
 921 set_element(long num, char *s, long len, NODE *n)
 922 {
 923         NODE *it;
 924         NODE **lhs;
 925         NODE *sub;
 926
 927         it = make_string(s, len);
 928         it->flags |= MAYBE_NUM;
 929         sub = make_number((AWKNUM) (num));
 930         lhs = assoc_lookup(n, sub, FALSE);
 931         unref(sub);
 932         unref(*lhs);
 933         *lhs = it;
 934 }
 935
 936 /* do_split --- implement split(), semantics are same as for field splitting */
 937
 938 NODE *
 939 do_split(int nargs)
 940 {
 941         NODE *src, *arr, *sep, *fs, *tmp, *sep_arr = NULL;
 942         char *s;
 943         long (*parseit)(long, char **, int, NODE *,
 944                          Regexp *, Setfunc, NODE *, NODE *, int);
 945         Regexp *rp = NULL;
 946
 947         if (nargs == 4) {
 948                 static short warned1 = FALSE, warned2 = FALSE;
 949
 950                 if (do_traditional || do_posix) {
 951                         fatal(_("split: fourth argument is a gawk extension"));
 952                 }
 953                 sep_arr = POP_PARAM();
 954                 if (sep_arr->type != Node_var_array)
 955                         fatal(_("split: fourth argument is not an array"));
 956                 if (do_lint && ! warned1) {
 957                         warned1 = TRUE;
 958                         lintwarn(_("split: fourth argument is a gawk extension"));
 959                 }
 960                 if (do_lint_old && ! warned2) {
 961                         warned2 = TRUE;
 962                         warning(_("split: fourth argument is a gawk extension"));
 963                 }
 964         }
 965
 966         sep = POP();
 967         arr = POP_PARAM();
 968         if (arr->type != Node_var_array)
 969                 fatal(_("split: second argument is not an array"));
 970
 971         if (sep_arr != NULL) {
 972                 if (sep_arr == arr)
 973                         fatal(_("split: cannot use the same array for second and fourth args"));
 974
 975                 /* This checks need to be done before clearing any of the arrays */
 976                 for (tmp = sep_arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
 977                         if (tmp == arr)
 978                                 fatal(_("split: cannot use a subarray of second arg for fourth arg"));
 979                 for (tmp = arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
 980                         if (tmp == sep_arr)
 981                                 fatal(_("split: cannot use a subarray of fourth arg for second arg"));
 982                 assoc_clear(sep_arr);
 983         }
 984         assoc_clear(arr);
 985
 986         src = TOP_STRING();
 987         if (src->stlen == 0) {
 988                 /*
 989                  * Skip the work if first arg is the null string.
 990                  */
 991                 decr_sp();
 992                 DEREF(src);
 993                 return make_number((AWKNUM) 0);
 994         }
 995
 996         if ((sep->re_flags & FS_DFLT) != 0 && current_field_sep() != Using_FIELDWIDTHS && ! RS_is_null) {
 997                 parseit = parse_field;
 998                 fs = force_string(FS_node->var_value);
 999                 rp = FS_regexp;
1000         } else {
1001                 fs = sep->re_exp;
1002
1003                 if (fs->stlen == 0) {
1004                         static short warned = FALSE;
1005
1006                         parseit = null_parse_field;
1007
1008                         if (do_lint && ! warned) {
1009                                 warned = TRUE;
1010                                 lintwarn(_("split: null string for third arg is a gawk extension"));
1011                         }
1012                 } else if (fs->stlen == 1 && (sep->re_flags & CONSTANT) == 0) {
1013                         if (fs->stptr[0] == ' ') {
1014                                 if (do_posix)
1015                                         parseit = posix_def_parse_field;
1016                                 else
1017                                         parseit = def_parse_field;
1018                         } else
1019                                 parseit = sc_parse_field;
1020                 } else {
1021                         parseit = re_parse_field;
1022                         rp = re_update(sep);
1023                 }
1024         }
1025
1026         s = src->stptr;
1027         tmp = make_number((AWKNUM) (*parseit)(UNLIMITED, &s, (int) src->stlen,
1028                                              fs, rp, set_element, arr, sep_arr, FALSE));
1029
1030         decr_sp();
1031         DEREF(src);
1032         return tmp;
1033 }
1034
1035 /*
1036  * do_patsplit --- implement patsplit(), semantics are same as for field
1037  *                 splitting with FPAT.
1038  */
1039
1040 NODE *
1041 do_patsplit(int nargs)
1042 {
1043         NODE *src, *arr, *sep, *fpat, *tmp, *sep_arr = NULL;
1044         char *s;
1045         Regexp *rp = NULL;
1046
1047         if (nargs == 4) {
1048                 sep_arr = POP_PARAM();
1049                 if (sep_arr->type != Node_var_array)
1050                         fatal(_("patsplit: fourth argument is not an array"));
1051         }
1052         sep = POP();
1053         arr = POP_PARAM();
1054         if (arr->type != Node_var_array)
1055                 fatal(_("patsplit: second argument is not an array"));
1056
1057         src = TOP_STRING();
1058
1059         fpat = sep->re_exp;
1060         if (fpat->stlen == 0)
1061                 fatal(_("patsplit: third argument must be non-null"));
1062
1063         if (sep_arr != NULL) {
1064                 if (sep_arr == arr)
1065                         fatal(_("patsplit: cannot use the same array for second and fourth args"));
1066
1067                 /* This checks need to be done before clearing any of the arrays */
1068                 for (tmp = sep_arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
1069                         if (tmp == arr)
1070                                 fatal(_("patsplit: cannot use a subarray of second arg for fourth arg"));
1071                 for (tmp = arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
1072                         if (tmp == sep_arr)
1073                                 fatal(_("patsplit: cannot use a subarray of fourth arg for second arg"));
1074                 assoc_clear(sep_arr);
1075         }
1076         assoc_clear(arr);
1077
1078         if (src->stlen == 0) {
1079                 /*
1080                  * Skip the work if first arg is the null string.
1081                  */
1082                 tmp =  make_number((AWKNUM) 0);
1083         } else {
1084                 rp = re_update(sep);
1085                 s = src->stptr;
1086                 tmp = make_number((AWKNUM) fpat_parse_field(UNLIMITED, &s,
1087                                 (int) src->stlen, fpat, rp,
1088                                 set_element, arr, sep_arr, FALSE));
1089         }
1090
1091         decr_sp();      /* 1st argument not POP-ed */
1092         DEREF(src);
1093         return tmp;
1094 }
1095
1096 /* set_FIELDWIDTHS --- handle an assignment to FIELDWIDTHS */
1097
1098 void
1099 set_FIELDWIDTHS()
1100 {
1101         char *scan;
1102         char *end;
1103         int i;
1104         static int fw_alloc = 4;
1105         static short warned = FALSE;
1106         int fatal_error = FALSE;
1107
1108         if (do_lint && ! warned) {
1109                 warned = TRUE;
1110                 lintwarn(_("`FIELDWIDTHS' is a gawk extension"));
1111         }
1112         if (do_traditional)     /* quick and dirty, does the trick */
1113                 return;
1114
1115         /*
1116          * If changing the way fields are split, obey least-suprise
1117          * semantics, and force $0 to be split totally.
1118          */
1119         if (fields_arr != NULL)
1120                 (void) get_field(UNLIMITED - 1, 0);
1121
1122         parse_field = fw_parse_field;
1123         scan = force_string(FIELDWIDTHS_node->var_value)->stptr;
1124
1125         if (FIELDWIDTHS == NULL)
1126                 emalloc(FIELDWIDTHS, int *, fw_alloc * sizeof(int), "set_FIELDWIDTHS");
1127         FIELDWIDTHS[0] = 0;
1128         for (i = 1; ; i++) {
1129                 unsigned long int tmp;
1130                 if (i + 2 >= fw_alloc) {
1131                         fw_alloc *= 2;
1132                         erealloc(FIELDWIDTHS, int *, fw_alloc * sizeof(int), "set_FIELDWIDTHS");
1133                 }
1134                 /* Initialize value to be end of list */
1135                 FIELDWIDTHS[i] = -1;
1136                 /* Ensure that there is no leading `-' sign.  Otherwise,
1137                    strtoul would accept it and return a bogus result.  */
1138                 while (is_blank(*scan)) {
1139                         ++scan;
1140                 }
1141                 if (*scan == '-') {
1142                         fatal_error = TRUE;
1143                         break;
1144                 }
1145                 if (*scan == '\0')
1146                         break;
1147
1148                 /* Detect an invalid base-10 integer, a valid value that
1149                    is followed by something other than a blank or '\0',
1150                    or a value that is not in the range [1..INT_MAX].  */
1151                 errno = 0;
1152                 tmp = strtoul(scan, &end, 10);
1153                 if (errno != 0
1154                         || (*end != '\0' && ! is_blank(*end))
1155                                 || !(0 < tmp && tmp <= INT_MAX)
1156                 ) {
1157                         fatal_error = TRUE;
1158                         break;
1159                 }
1160                 FIELDWIDTHS[i] = tmp;
1161                 scan = end;
1162                 /* Skip past any trailing blanks.  */
1163                 while (is_blank(*scan)) {
1164                         ++scan;
1165                 }
1166                 if (*scan == '\0')
1167                         break;
1168         }
1169         FIELDWIDTHS[i+1] = -1;
1170
1171         update_PROCINFO_str("FS", "FIELDWIDTHS");
1172         if (fatal_error)
1173                 fatal(_("invalid FIELDWIDTHS value, near `%s'"),
1174                               scan);
1175 }
1176
1177 /* set_FS --- handle things when FS is assigned to */
1178
1179 void
1180 set_FS()
1181 {
1182         char buf[10];
1183         NODE *fs;
1184         static NODE *save_fs = NULL;
1185         static NODE *save_rs = NULL;
1186         int remake_re = TRUE;
1187
1188         /*
1189          * If changing the way fields are split, obey least-surprise
1190          * semantics, and force $0 to be split totally.
1191          */
1192         if (fields_arr != NULL)
1193                 (void) get_field(UNLIMITED - 1, 0);
1194
1195         /* It's possible that only IGNORECASE changed, or FS = FS */
1196         /*
1197          * This comparison can't use cmp_nodes(), which pays attention
1198          * to IGNORECASE, and that's not what we want.
1199          */
1200         if (save_fs
1201                 && FS_node->var_value->stlen == save_fs->stlen
1202                 && memcmp(FS_node->var_value->stptr, save_fs->stptr, save_fs->stlen) == 0
1203                 && save_rs
1204                 && RS_node->var_value->stlen == save_rs->stlen
1205                 && memcmp(RS_node->var_value->stptr, save_rs->stptr, save_rs->stlen) == 0) {
1206                 if (FS_regexp != NULL)
1207                         FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
1208
1209                 /* FS = FS */
1210                 if (current_field_sep() == Using_FS) {
1211                         return;
1212                 } else {
1213                         remake_re = FALSE;
1214                         goto choose_fs_function;
1215                 }
1216         }
1217
1218         unref(save_fs);
1219         save_fs = dupnode(FS_node->var_value);
1220         unref(save_rs);
1221         save_rs = dupnode(RS_node->var_value);
1222         resave_fs = TRUE;
1223
1224         /* If FS_re_no_case assignment is fatal (make_regexp in remake_re)
1225          * FS_regexp will be NULL with a non-null FS_re_yes_case.
1226          * refree() handles null argument; no need for `if (FS_regexp != NULL)' below.
1227          * Please do not remerge.
1228          */
1229         refree(FS_re_yes_case);
1230         refree(FS_re_no_case);
1231         FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
1232
1233
1234 choose_fs_function:
1235         buf[0] = '\0';
1236         default_FS = FALSE;
1237         fs = force_string(FS_node->var_value);
1238
1239         if (! do_traditional && fs->stlen == 0) {
1240                 static short warned = FALSE;
1241
1242                 parse_field = null_parse_field;
1243
1244                 if (do_lint && ! warned) {
1245                         warned = TRUE;
1246                         lintwarn(_("null string for `FS' is a gawk extension"));
1247                 }
1248         } else if (fs->stlen > 1) {
1249                 if (do_lint_old)
1250                         warning(_("old awk does not support regexps as value of `FS'"));
1251                 parse_field = re_parse_field;
1252         } else if (RS_is_null) {
1253                 /* we know that fs->stlen <= 1 */
1254                 parse_field = sc_parse_field;
1255                 if (fs->stlen == 1) {
1256                         if (fs->stptr[0] == ' ') {
1257                                 default_FS = TRUE;
1258                                 strcpy(buf, "[ \t\n]+");
1259                         } else if (fs->stptr[0] == '\\') {
1260                                 /* yet another special case */
1261                                 strcpy(buf, "[\\\\\n]");
1262                         } else if (fs->stptr[0] != '\n')
1263                                 sprintf(buf, "[%c\n]", fs->stptr[0]);
1264                 }
1265         } else {
1266                 if (do_posix)
1267                         parse_field = posix_def_parse_field;
1268                 else
1269                         parse_field = def_parse_field;
1270
1271                 if (fs->stlen == 1) {
1272                         if (fs->stptr[0] == ' ')
1273                                 default_FS = TRUE;
1274                         else if (fs->stptr[0] == '\\')
1275                                 /* same special case */
1276                                 strcpy(buf, "[\\\\]");
1277                         else
1278                                 parse_field = sc_parse_field;
1279                 }
1280         }
1281         if (remake_re) {
1282                 refree(FS_re_yes_case);
1283                 refree(FS_re_no_case);
1284                 FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
1285
1286                 if (buf[0] != '\0') {
1287                         FS_re_yes_case = make_regexp(buf, strlen(buf), FALSE, TRUE, TRUE);
1288                         FS_re_no_case = make_regexp(buf, strlen(buf), TRUE, TRUE, TRUE);
1289                         FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
1290                         parse_field = re_parse_field;
1291                 } else if (parse_field == re_parse_field) {
1292                         FS_re_yes_case = make_regexp(fs->stptr, fs->stlen, FALSE, TRUE, TRUE);
1293                         FS_re_no_case = make_regexp(fs->stptr, fs->stlen, TRUE, TRUE, TRUE);
1294                         FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
1295                 } else
1296                         FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
1297         }
1298
1299         /*
1300          * For FS = "c", we don't use IGNORECASE. But we must use
1301          * re_parse_field to get the character and the newline as
1302          * field separators.
1303          */
1304         if (fs->stlen == 1 && parse_field == re_parse_field)
1305                 FS_regexp = FS_re_yes_case;
1306
1307         update_PROCINFO_str("FS", "FS");
1308 }
1309
1310 /* current_field_sep --- return what field separator is */
1311
1312 field_sep_type
1313 current_field_sep()
1314 {
1315         if (parse_field == fw_parse_field)
1316                 return Using_FIELDWIDTHS;
1317         else if (parse_field == fpat_parse_field)
1318                 return Using_FPAT;
1319         else
1320                 return Using_FS;
1321 }
1322
1323 /* update_PROCINFO_str --- update PROCINFO[sub] with string value */
1324
1325 void
1326 update_PROCINFO_str(const char *subscript, const char *str)
1327 {
1328         NODE **aptr;
1329         NODE *tmp;
1330
1331         if (PROCINFO_node == NULL)
1332                 return;
1333         tmp = make_string(subscript, strlen(subscript));
1334         aptr = assoc_lookup(PROCINFO_node, tmp, FALSE);
1335         unref(tmp);
1336         unref(*aptr);
1337         *aptr = make_string(str, strlen(str));
1338 }
1339
1340 /* update_PROCINFO_num --- update PROCINFO[sub] with numeric value */
1341
1342 void
1343 update_PROCINFO_num(const char *subscript, AWKNUM val)
1344 {
1345         NODE **aptr;
1346         NODE *tmp;
1347
1348         if (PROCINFO_node == NULL)
1349                 return;
1350         tmp = make_string(subscript, strlen(subscript));
1351         aptr = assoc_lookup(PROCINFO_node, tmp, FALSE);
1352         unref(tmp);
1353         unref(*aptr);
1354         *aptr = make_number(val);
1355 }
1356
1357 /* set_FPAT --- handle an assignment to FPAT */
1358
1359 void
1360 set_FPAT()
1361 {
1362         static short warned = FALSE;
1363         static NODE *save_fpat = NULL;
1364         int remake_re = TRUE;
1365         NODE *fpat;
1366
1367         if (do_lint && ! warned) {
1368                 warned = TRUE;
1369                 lintwarn(_("`FPAT' is a gawk extension"));
1370         }
1371         if (do_traditional)     /* quick and dirty, does the trick */
1372                 return;
1373
1374         /*
1375          * If changing the way fields are split, obey least-suprise
1376          * semantics, and force $0 to be split totally.
1377          */
1378         if (fields_arr != NULL)
1379                 (void) get_field(UNLIMITED - 1, 0);
1380
1381         /* It's possible that only IGNORECASE changed, or FPAT = FPAT */
1382         /*
1383          * This comparison can't use cmp_nodes(), which pays attention
1384          * to IGNORECASE, and that's not what we want.
1385          */
1386         if (save_fpat
1387                 && FPAT_node->var_value->stlen == save_fpat->stlen
1388                 && memcmp(FPAT_node->var_value->stptr, save_fpat->stptr, save_fpat->stlen) == 0) {
1389                 if (FPAT_regexp != NULL)
1390                         FPAT_regexp = (IGNORECASE ? FPAT_re_no_case : FPAT_re_yes_case);
1391
1392                 /* FPAT = FPAT */
1393                 if (current_field_sep() == Using_FPAT) {
1394                         return;
1395                 } else {
1396                         remake_re = FALSE;
1397                         goto set_fpat_function;
1398                 }
1399         }
1400
1401         unref(save_fpat);
1402         save_fpat = dupnode(FPAT_node->var_value);
1403         refree(FPAT_re_yes_case);
1404         refree(FPAT_re_no_case);
1405         FPAT_re_yes_case = FPAT_re_no_case = FPAT_regexp = NULL;
1406
1407 set_fpat_function:
1408         fpat = force_string(FPAT_node->var_value);
1409         parse_field = fpat_parse_field;
1410
1411         if (remake_re) {
1412                 refree(FPAT_re_yes_case);
1413                 refree(FPAT_re_no_case);
1414                 FPAT_re_yes_case = FPAT_re_no_case = FPAT_regexp = NULL;
1415
1416                 FPAT_re_yes_case = make_regexp(fpat->stptr, fpat->stlen, FALSE, TRUE, TRUE);
1417                 FPAT_re_no_case = make_regexp(fpat->stptr, fpat->stlen, TRUE, TRUE, TRUE);
1418                 FPAT_regexp = (IGNORECASE ? FPAT_re_no_case : FPAT_re_yes_case);
1419         }
1420
1421         update_PROCINFO_str("FS", "FPAT");
1422 }
1423
1424 /*
1425  * increment_scan --- macro to move scan pointer ahead by one character.
1426  *                      Implementation varies if doing MBS or not.
1427  */
1428
1429 #if MBS_SUPPORT
1430 #define increment_scan(scanp, len) incr_scan(scanp, len, & mbs)
1431 #else
1432 #define increment_scan(scanp, len) ((*scanp)++)
1433 #endif
1434
1435 #if MBS_SUPPORT
1436 /* incr_scan --- MBS version of increment_scan() */
1437
1438 static void
1439 incr_scan(char **scanp, size_t len, mbstate_t *mbs)
1440 {
1441         size_t mbclen = 0;
1442
1443         if (gawk_mb_cur_max > 1) {
1444                 mbclen = mbrlen(*scanp, len, mbs);
1445                 if (   (mbclen == 1)
1446                     || (mbclen == (size_t) -1)
1447                     || (mbclen == (size_t) -2)
1448                     || (mbclen == 0)) {
1449                         /* We treat it as a singlebyte character.  */
1450                         mbclen = 1;
1451                 }
1452                 *scanp += mbclen;
1453         } else
1454                 (*scanp)++;
1455 }
1456 #endif
1457
1458 /*
1459  * fpat_parse_field --- parse fields using a regexp.
1460  *
1461  * This is called both from get_field() and from do_patsplit()
1462  * via (*parse_field)().  This variation is for when FPAT is a regular
1463  * expression -- use the value to find field contents.
1464  *
1465  * This was really hard to get right.  It happens to bear many resemblances
1466  * to issues I had with getting gsub right with null matches. When dealing
1467  * with that I prototyped in awk and had the foresight to save the awk code
1468  * over in the C file.  Starting with that as a base, I finally got to this
1469  * awk code to do what I needed, and then translated it into C. Fortunately
1470  * the C code bears a closer correspondance to the awk code here than over
1471  * by gsub.
1472  *
1473  * BEGIN {
1474  *      FALSE = 0
1475  *      TRUE = 1
1476  *
1477  *      fpat[1] = "([^,]*)|(\"[^\"]+\")"
1478  *      fpat[2] = fpat[1]
1479  *      fpat[3] = fpat[1]
1480  *      fpat[4] = "aa+"
1481  *      fpat[5] = fpat[4]
1482  *
1483  *      data[1] = "Robbins,,Arnold,"
1484  *      data[2] = "Smith,,\"1234 A Pretty Place, NE\",Sometown,NY,12345-6789,USA"
1485  *      data[3] = "Robbins,Arnold,\"1234 A Pretty Place, NE\",Sometown,NY,12345-6789,USA"
1486  *      data[4] = "bbbaaacccdddaaaaaqqqq"
1487  *      data[5] = "bbbaaacccdddaaaaaqqqqa" # should get trailing qqqa
1488  *
1489  *      for (i = 1; i in data; i++) {
1490  *              printf("Splitting: <%s>\n", data[i])
1491  *              n = mypatsplit(data[i], fields, fpat[i], seps)
1492  *              print "n =", n
1493  *              for (j = 1; j <= n; j++)
1494  *                      printf("fields[%d] = <%s>\n", j, fields[j])
1495  *              for (j = 0; j in seps; j++)
1496  *                      printf("seps[%s] = <%s>\n", j, seps[j])
1497  *      }
1498  * }
1499  *
1500  * function mypatsplit(string, array, pattern, seps,
1501  *                      eosflag, non_empty, nf) # locals
1502  * {
1503  *      delete array
1504  *      delete seps
1505  *      if (length(string) == 0)
1506  *              return 0
1507  *
1508  *      eosflag = non_empty = FALSE
1509  *      nf = 0
1510  *      while (match(string, pattern)) {
1511  *              if (RLENGTH > 0) {      # easy case
1512  *                      non_empty = TRUE
1513  *                      if (! (nf in seps)) {
1514  *                              if (RSTART == 1)        # match at front of string
1515  *                                      seps[nf] = ""
1516  *                              else
1517  *                                      seps[nf] = substr(string, 1, RSTART - 1)
1518  *                      }
1519  *                      array[++nf] = substr(string, RSTART, RLENGTH)
1520  *                      string = substr(string, RSTART+RLENGTH)
1521  *                      if (length(string) == 0)
1522  *                              break
1523  *              } else if (non_empty) {
1524  *                      # last match was non-empty, and at the
1525  *                      # current character we get a zero length match,
1526  *                      # which we don't want, so skip over it
1527  *                      non_empty = FALSE
1528  *                      seps[nf] = substr(string, 1, 1)
1529  *                      string = substr(string, 2)
1530  *              } else {
1531  *                      # 0 length match
1532  *                      if (! (nf in seps)) {
1533  *                              if (RSTART == 1)
1534  *                                      seps[nf] = ""
1535  *                              else
1536  *                                      seps[nf] = substr(string, 1, RSTART - 1)
1537  *                      }
1538  *                      array[++nf] = ""
1539  *                      if (! non_empty && ! eosflag) { # prev was empty
1540  *                              seps[nf] = substr(string, 1, 1)
1541  *                      }
1542  *                      if (RSTART == 1) {
1543  *                              string = substr(string, 2)
1544  *                      } else {
1545  *                              string = substr(string, RSTART + 1)
1546  *                      }
1547  *                      non_empty = FALSE
1548  *              }
1549  *              if (length(string) == 0) {
1550  *                      if (eosflag)
1551  *                              break
1552  *                      else
1553  *                              eosflag = TRUE
1554  *              }
1555  *      }
1556  *      if (length(string) > 0)
1557  *              seps[nf] = string
1558  *
1559  *      return length(array)
1560  * }
1561  */
1562 static long
1563 fpat_parse_field(long up_to,    /* parse only up to this field number */
1564         char **buf,     /* on input: string to parse; on output: point to start next */
1565         int len,
1566         NODE *fs ATTRIBUTE_UNUSED,
1567         Regexp *rp,
1568         Setfunc set,    /* routine to set the value of the parsed field */
1569         NODE *n,
1570         NODE *sep_arr,  /* array of field separators (may be NULL) */
1571         int in_middle)
1572 {
1573         char *scan = *buf;
1574         long nf = parse_high_water;
1575         char *start;
1576         char *end = scan + len;
1577         int regex_flags = RE_NEED_START;
1578         int need_to_set_sep;
1579         int non_empty;
1580         int eosflag;
1581 #if MBS_SUPPORT
1582         mbstate_t mbs;
1583
1584         if (gawk_mb_cur_max > 1)
1585                 memset(&mbs, 0, sizeof(mbstate_t));
1586 #endif
1587
1588         if (up_to == UNLIMITED)
1589                 nf = 0;
1590
1591         if (len == 0)
1592                 return nf;
1593
1594         if (rp == NULL) /* use FPAT */
1595                 rp = FPAT_regexp;
1596
1597         if (in_middle) {
1598                 regex_flags |= RE_NO_BOL;
1599                 non_empty = rp->non_empty;
1600         } else
1601                 non_empty = FALSE;
1602
1603         eosflag = FALSE;
1604         need_to_set_sep = TRUE;
1605         start = scan;
1606         while (research(rp, scan, 0, (end - scan), regex_flags) != -1
1607                && nf < up_to) {
1608
1609                 if (REEND(rp, scan) > RESTART(rp, scan)) { /* if (RLENGTH > 0) */
1610                         non_empty = TRUE;
1611                         if (sep_arr != NULL && need_to_set_sep) {
1612                                 if (RESTART(rp, scan) == 0) /* match at front */
1613                                         set_element(nf, start, 0L, sep_arr);
1614                                 else
1615                                         set_element(nf,
1616                                                 start,
1617                                                 (long) RESTART(rp, scan),
1618                                                 sep_arr);
1619                         }
1620                         /* field is text that matched */
1621                         (*set)(++nf,
1622                                 scan + RESTART(rp, scan),
1623                                 (long)(REEND(rp, scan) - RESTART(rp, scan)),
1624                                 n);
1625
1626                         scan += REEND(rp, scan);
1627                         if (scan >= end)
1628                                 break;
1629                         need_to_set_sep = TRUE;
1630                 } else if (non_empty) { /* else if non_empty */
1631                         /*
1632                          * last match was non-empty, and at the
1633                          * current character we get a zero length match,
1634                          * which we don't want, so skip over it
1635                          */
1636                         non_empty = FALSE;
1637                         if (sep_arr != NULL) {
1638                                 need_to_set_sep = FALSE;
1639                                 set_element(nf, start, 1L, sep_arr);
1640                         }
1641                         increment_scan(& scan, end - scan);
1642                 } else {
1643                         /* 0 length match */
1644                         if (sep_arr != NULL && need_to_set_sep) {
1645                                 if (RESTART(rp, scan) == 0) /* RSTART == 1 */
1646                                         set_element(nf, start, 0L, sep_arr);
1647                                 else
1648                                         set_element(nf, start,
1649                                                         (long) RESTART(rp, scan),
1650                                                         sep_arr);
1651                         }
1652                         need_to_set_sep = TRUE;
1653                         (*set)(++nf, scan, 0L, n);
1654                         if (! non_empty && ! eosflag) { /* prev was empty */
1655                                 if (sep_arr != NULL) {
1656                                         set_element(nf, start, 1L, sep_arr);
1657                                         need_to_set_sep = FALSE;
1658                                 }
1659                         }
1660                         if (RESTART(rp, scan) == 0)
1661                                 increment_scan(& scan, end - scan);
1662                         else {
1663                                 scan += RESTART(rp, scan);
1664                         }
1665                         non_empty = FALSE;
1666                 }
1667                 if (scan >= end) { /* length(string) == 0 */
1668                         if (eosflag)
1669                                 break;
1670                         else
1671                                 eosflag = TRUE;
1672                 }
1673
1674                 start = scan;
1675         }
1676         if (scan < end) {
1677                 if (sep_arr != NULL)
1678                         set_element(nf, scan, (long) (end - scan), sep_arr);
1679         }
1680
1681         *buf = scan;
1682         rp->non_empty = non_empty;
1683         return nf;
1684 }