field.c

   1 /*
   2  * field.c - routines for dealing with fields and record parsing
   3  */
   4
   5 /*
   6  * Copyright (C) 1986, 1988, 1989, 1991-2014 the Free Software Foundation, Inc.
   7  *
   8  * This file is part of GAWK, the GNU implementation of the
   9  * AWK Programming Language.
  10  *
  11  * GAWK is free software; you can redistribute it and/or modify
  12  * it under the terms of the GNU General Public License as published by
  13  * the Free Software Foundation; either version 3 of the License, or
  14  * (at your option) any later version.
  15  *
  16  * GAWK is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19  * GNU General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU General Public License
  22  * along with this program; if not, write to the Free Software
  23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
  24  */
  25
  26 #include "awk.h"
  27
  28 /*
  29  * In case that the system doesn't have isblank().
  30  * Don't bother with autoconf ifdef junk, just force it.
  31  * See dfa.c and regex_internal.h and regcomp.c. Bleah.
  32  */
  33 static int
  34 is_blank(int c)
  35 {
  36         return c == ' ' || c == '\t';
  37 }
  38
  39 typedef void (* Setfunc)(long, char *, long, NODE *);
  40
  41 static long (*parse_field)(long, char **, int, NODE *,
  42                              Regexp *, Setfunc, NODE *, NODE *, bool);
  43 static void rebuild_record(void);
  44 static long re_parse_field(long, char **, int, NODE *,
  45                              Regexp *, Setfunc, NODE *, NODE *, bool);
  46 static long def_parse_field(long, char **, int, NODE *,
  47                               Regexp *, Setfunc, NODE *, NODE *, bool);
  48 static long posix_def_parse_field(long, char **, int, NODE *,
  49                               Regexp *, Setfunc, NODE *, NODE *, bool);
  50 static long null_parse_field(long, char **, int, NODE *,
  51                              Regexp *, Setfunc, NODE *, NODE *, bool);
  52 static long sc_parse_field(long, char **, int, NODE *,
  53                              Regexp *, Setfunc, NODE *, NODE *, bool);
  54 static long fw_parse_field(long, char **, int, NODE *,
  55                              Regexp *, Setfunc, NODE *, NODE *, bool);
  56 static long fpat_parse_field(long, char **, int, NODE *,
  57                              Regexp *, Setfunc, NODE *, NODE *, bool);
  58 static void set_element(long num, char * str, long len, NODE *arr);
  59 static void grow_fields_arr(long num);
  60 static void set_field(long num, char *str, long len, NODE *dummy);
  61
  62 static char *parse_extent;      /* marks where to restart parse of record */
  63 static long parse_high_water = 0; /* field number that we have parsed so far */
  64 static long nf_high_water = 0;  /* size of fields_arr */
  65 static bool resave_fs;
  66 static NODE *save_FS;           /* save current value of FS when line is read,
  67                                  * to be used in deferred parsing
  68                                  */
  69 static int *FIELDWIDTHS = NULL;
  70
  71 NODE **fields_arr;              /* array of pointers to the field nodes */
  72 bool field0_valid;              /* $(>0) has not been changed yet */
  73 int default_FS;                 /* true when FS == " " */
  74 Regexp *FS_re_yes_case = NULL;
  75 Regexp *FS_re_no_case = NULL;
  76 Regexp *FS_regexp = NULL;
  77 Regexp *FPAT_re_yes_case = NULL;
  78 Regexp *FPAT_re_no_case = NULL;
  79 Regexp *FPAT_regexp = NULL;
  80 NODE *Null_field = NULL;
  81
  82 /* init_fields --- set up the fields array to start with */
  83
  84 void
  85 init_fields()
  86 {
  87         emalloc(fields_arr, NODE **, sizeof(NODE *), "init_fields");
  88
  89         getnode(fields_arr[0]);
  90         *fields_arr[0] = *Nnull_string;
  91         fields_arr[0]->flags |= NULL_FIELD;
  92
  93         parse_extent = fields_arr[0]->stptr;
  94         save_FS = dupnode(FS_node->var_value);
  95
  96         getnode(Null_field);
  97         *Null_field = *Nnull_string;
  98         Null_field->valref = 1;
  99         Null_field->flags = (FIELD|STRCUR|STRING|NULL_FIELD);
 100
 101         field0_valid = true;
 102 }
 103
 104 /* grow_fields --- acquire new fields as needed */
 105
 106 static void
 107 grow_fields_arr(long num)
 108 {
 109         int t;
 110         NODE *n;
 111
 112         erealloc(fields_arr, NODE **, (num + 1) * sizeof(NODE *), "grow_fields_arr");
 113         for (t = nf_high_water + 1; t <= num; t++) {
 114                 getnode(n);
 115                 *n = *Null_field;
 116                 fields_arr[t] = n;
 117         }
 118         nf_high_water = num;
 119 }
 120
 121 /* set_field --- set the value of a particular field */
 122
 123 /*ARGSUSED*/
 124 static void
 125 set_field(long num,
 126         char *str,
 127         long len,
 128         NODE *dummy ATTRIBUTE_UNUSED)   /* just to make interface same as set_element */
 129 {
 130         NODE *n;
 131
 132         if (num > nf_high_water)
 133                 grow_fields_arr(num);
 134         n = fields_arr[num];
 135         n->stptr = str;
 136         n->stlen = len;
 137         n->flags = (STRCUR|STRING|MAYBE_NUM|FIELD);
 138 }
 139
 140 /* rebuild_record --- Someone assigned a value to $(something).
 141                         Fix up $0 to be right */
 142
 143 static void
 144 rebuild_record()
 145 {
 146         /*
 147          * use explicit unsigned longs for lengths, in case
 148          * a size_t isn't big enough.
 149          */
 150         unsigned long tlen;
 151         unsigned long ofslen;
 152         NODE *tmp;
 153         NODE *ofs;
 154         char *ops;
 155         char *cops;
 156         long i;
 157
 158         assert(NF != -1);
 159
 160         tlen = 0;
 161         ofs = force_string(OFS_node->var_value);
 162         ofslen = ofs->stlen;
 163         for (i = NF; i > 0; i--) {
 164                 tmp = fields_arr[i];
 165                 tmp = force_string(tmp);
 166                 tlen += tmp->stlen;
 167         }
 168         tlen += (NF - 1) * ofslen;
 169         if ((long) tlen < 0)
 170                 tlen = 0;
 171         emalloc(ops, char *, tlen + 2, "rebuild_record");
 172         cops = ops;
 173         ops[0] = '\0';
 174         for (i = 1;  i <= NF; i++) {
 175                 free_wstr(fields_arr[i]);
 176                 tmp = fields_arr[i];
 177                 /* copy field */
 178                 if (tmp->stlen == 1)
 179                         *cops++ = tmp->stptr[0];
 180                 else if (tmp->stlen != 0) {
 181                         memcpy(cops, tmp->stptr, tmp->stlen);
 182                         cops += tmp->stlen;
 183                 }
 184                 /* copy OFS */
 185                 if (i != NF) {
 186                         if (ofslen == 1)
 187                                 *cops++ = ofs->stptr[0];
 188                         else if (ofslen != 0) {
 189                                 memcpy(cops, ofs->stptr, ofslen);
 190                                 cops += ofslen;
 191                         }
 192                 }
 193         }
 194         tmp = make_str_node(ops, tlen, ALREADY_MALLOCED);
 195
 196         /*
 197          * Since we are about to unref fields_arr[0], we want to find
 198          * any fields that still point into it, and have them point
 199          * into the new field zero.  This has to be done intelligently,
 200          * so that unrefing a field doesn't try to unref into the old $0.
 201          */
 202         for (cops = ops, i = 1; i <= NF; i++) {
 203                 NODE *r = fields_arr[i];
 204                 if (r->stlen > 0) {
 205                         NODE *n;
 206                         getnode(n);
 207
 208                         if ((r->flags & FIELD) == 0) {
 209                                 *n = *Null_field;
 210                                 n->stlen = r->stlen;
 211                                 if ((r->flags & (NUMCUR|NUMBER)) != 0) {
 212                                         n->flags |= (r->flags & (MPFN|MPZN|NUMCUR|NUMBER));
 213 #ifdef HAVE_MPFR
 214                                         if (is_mpg_float(r)) {
 215                                                 mpfr_init(n->mpg_numbr);
 216                                                 mpfr_set(n->mpg_numbr, r->mpg_numbr, ROUND_MODE);
 217                                         } else if (is_mpg_integer(r)) {
 218                                                 mpz_init(n->mpg_i);
 219                                                 mpz_set(n->mpg_i, r->mpg_i);
 220                                         } else
 221 #endif
 222                                         n->numbr = r->numbr;
 223                                 }
 224                         } else {
 225                                 *n = *r;
 226                                 n->flags &= ~(MALLOC|STRING);
 227                         }
 228
 229                         n->stptr = cops;
 230                         unref(r);
 231                         fields_arr[i] = n;
 232                         assert((n->flags & WSTRCUR) == 0);
 233                 }
 234                 cops += fields_arr[i]->stlen + ofslen;
 235         }
 236
 237         unref(fields_arr[0]);
 238
 239         fields_arr[0] = tmp;
 240         field0_valid = true;
 241 }
 242
 243 /*
 244  * set_record:
 245  * setup $0, but defer parsing rest of line until reference is made to $(>0)
 246  * or to NF.  At that point, parse only as much as necessary.
 247  *
 248  * Manage a private buffer for the contents of $0.  Doing so keeps us safe
 249  * if `getline var' decides to rearrange the contents of the IOBUF that
 250  * $0 might have been pointing into.  The cost is the copying of the buffer;
 251  * but better correct than fast.
 252  */
 253 void
 254 set_record(const char *buf, int cnt)
 255 {
 256         NODE *n;
 257         static char *databuf;
 258         static unsigned long databuf_size;
 259 #define INITIAL_SIZE    512
 260 #define MAX_SIZE        ((unsigned long) ~0)    /* maximally portable ... */
 261
 262         reset_record();
 263
 264         /* buffer management: */
 265         if (databuf_size == 0) {        /* first time */
 266                 emalloc(databuf, char *, INITIAL_SIZE, "set_record");
 267                 databuf_size = INITIAL_SIZE;
 268                 memset(databuf, '\0', INITIAL_SIZE);
 269
 270         }
 271         /*
 272          * Make sure there's enough room. Since we sometimes need
 273          * to place a sentinel at the end, we make sure
 274          * databuf_size is > cnt after allocation.
 275          */
 276         if (cnt >= databuf_size) {
 277                 while (cnt >= databuf_size && databuf_size <= MAX_SIZE)
 278                         databuf_size *= 2;
 279                 erealloc(databuf, char *, databuf_size, "set_record");
 280                 memset(databuf, '\0', databuf_size);
 281         }
 282         /* copy the data */
 283         memcpy(databuf, buf, cnt);
 284
 285         /* manage field 0: */
 286         unref(fields_arr[0]);
 287         getnode(n);
 288         n->stptr = databuf;
 289         n->stlen = cnt;
 290         n->valref = 1;
 291         n->type = Node_val;
 292         n->stfmt = -1;
 293         n->flags = (STRING|STRCUR|MAYBE_NUM|FIELD);
 294         fields_arr[0] = n;
 295
 296 #undef INITIAL_SIZE
 297 #undef MAX_SIZE
 298 }
 299
 300 /* reset_record --- start over again with current $0 */
 301
 302 void
 303 reset_record()
 304 {
 305         int i;
 306         NODE *n;
 307
 308         fields_arr[0] = force_string(fields_arr[0]);
 309
 310         NF = -1;
 311         for (i = 1; i <= parse_high_water; i++) {
 312                 unref(fields_arr[i]);
 313                 getnode(n);
 314                 *n = *Null_field;
 315                 fields_arr[i] = n;
 316         }
 317
 318         parse_high_water = 0;
 319         /*
 320          * $0 = $0 should resplit using the current value of FS.
 321          */
 322         if (resave_fs) {
 323                 resave_fs = false;
 324                 unref(save_FS);
 325                 save_FS = dupnode(FS_node->var_value);
 326         }
 327
 328         field0_valid = true;
 329 }
 330
 331 /* set_NF --- handle what happens to $0 and fields when NF is changed */
 332
 333 void
 334 set_NF()
 335 {
 336         int i;
 337         long nf;
 338         NODE *n;
 339
 340         assert(NF != -1);
 341
 342         (void) force_number(NF_node->var_value);
 343         nf = get_number_si(NF_node->var_value);
 344         if (nf < 0)
 345                 fatal(_("NF set to negative value"));
 346         NF = nf;
 347
 348         if (NF > nf_high_water)
 349                 grow_fields_arr(NF);
 350         if (parse_high_water < NF) {
 351                 for (i = parse_high_water + 1; i >= 0 && i <= NF; i++) {
 352                         unref(fields_arr[i]);
 353                         getnode(n);
 354                         *n = *Null_field;
 355                         fields_arr[i] = n;
 356                 }
 357                 parse_high_water = NF;
 358         } else if (parse_high_water > 0) {
 359                 for (i = NF + 1; i >= 0 && i <= parse_high_water; i++) {
 360                         unref(fields_arr[i]);
 361                         getnode(n);
 362                         *n = *Null_field;
 363                         fields_arr[i] = n;
 364                 }
 365                 parse_high_water = NF;
 366         }
 367         field0_valid = false;
 368 }
 369
 370 /*
 371  * re_parse_field --- parse fields using a regexp.
 372  *
 373  * This is called both from get_field() and from do_split()
 374  * via (*parse_field)().  This variation is for when FS is a regular
 375  * expression -- either user-defined or because RS=="" and FS==" "
 376  */
 377 static long
 378 re_parse_field(long up_to,      /* parse only up to this field number */
 379         char **buf,     /* on input: string to parse; on output: point to start next */
 380         int len,
 381         NODE *fs ATTRIBUTE_UNUSED,
 382         Regexp *rp,
 383         Setfunc set,    /* routine to set the value of the parsed field */
 384         NODE *n,
 385         NODE *sep_arr,  /* array of field separators (maybe NULL) */
 386         bool in_middle)
 387 {
 388         char *scan = *buf;
 389         long nf = parse_high_water;
 390         char *field;
 391         char *end = scan + len;
 392         int regex_flags = RE_NEED_START;
 393         char *sep;
 394 #if MBS_SUPPORT
 395         size_t mbclen = 0;
 396         mbstate_t mbs;
 397         if (gawk_mb_cur_max > 1)
 398                 memset(&mbs, 0, sizeof(mbstate_t));
 399 #endif
 400
 401         if (in_middle)
 402                 regex_flags |= RE_NO_BOL;
 403
 404         if (up_to == UNLIMITED)
 405                 nf = 0;
 406         if (len == 0)
 407                 return nf;
 408
 409         if (RS_is_null && default_FS) {
 410                 sep = scan;
 411                 while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
 412                         scan++;
 413                 if (sep_arr != NULL && sep < scan)
 414                         set_element(nf, sep, (long)(scan - sep), sep_arr);
 415         }
 416
 417         if (rp == NULL) /* use FS */
 418                 rp = FS_regexp;
 419
 420         field = scan;
 421         while (scan < end
 422                && research(rp, scan, 0, (end - scan), regex_flags) != -1
 423                && nf < up_to) {
 424                 regex_flags |= RE_NO_BOL;
 425                 if (REEND(rp, scan) == RESTART(rp, scan)) {   /* null match */
 426 #if MBS_SUPPORT
 427                         if (gawk_mb_cur_max > 1)        {
 428                                 mbclen = mbrlen(scan, end-scan, &mbs);
 429                                 if ((mbclen == 1) || (mbclen == (size_t) -1)
 430                                         || (mbclen == (size_t) -2) || (mbclen == 0)) {
 431                                         /* We treat it as a singlebyte character.  */
 432                                         mbclen = 1;
 433                                 }
 434                                 scan += mbclen;
 435                         } else
 436 #endif
 437                         scan++;
 438                         if (scan == end) {
 439                                 (*set)(++nf, field, (long)(scan - field), n);
 440                                 up_to = nf;
 441                                 break;
 442                         }
 443                         continue;
 444                 }
 445                 (*set)(++nf, field,
 446                        (long)(scan + RESTART(rp, scan) - field), n);
 447                 if (sep_arr != NULL)
 448                         set_element(nf, scan + RESTART(rp, scan),
 449                                 (long) (REEND(rp, scan) - RESTART(rp, scan)), sep_arr);
 450                 scan += REEND(rp, scan);
 451                 field = scan;
 452                 if (scan == end)        /* FS at end of record */
 453                         (*set)(++nf, field, 0L, n);
 454         }
 455         if (nf != up_to && scan < end) {
 456                 (*set)(++nf, scan, (long)(end - scan), n);
 457                 scan = end;
 458         }
 459         *buf = scan;
 460         return nf;
 461 }
 462
 463 /*
 464  * def_parse_field --- default field parsing.
 465  *
 466  * This is called both from get_field() and from do_split()
 467  * via (*parse_field)().  This variation is for when FS is a single space
 468  * character.
 469  */
 470
 471 static long
 472 def_parse_field(long up_to,     /* parse only up to this field number */
 473         char **buf,     /* on input: string to parse; on output: point to start next */
 474         int len,
 475         NODE *fs,
 476         Regexp *rp ATTRIBUTE_UNUSED,
 477         Setfunc set,    /* routine to set the value of the parsed field */
 478         NODE *n,
 479         NODE *sep_arr,  /* array of field separators (maybe NULL) */
 480         bool in_middle ATTRIBUTE_UNUSED)
 481 {
 482         char *scan = *buf;
 483         long nf = parse_high_water;
 484         char *field;
 485         char *end = scan + len;
 486         char sav;
 487         char *sep;
 488
 489         if (up_to == UNLIMITED)
 490                 nf = 0;
 491         if (len == 0)
 492                 return nf;
 493
 494         /*
 495          * Nasty special case. If FS set to "", return whole record
 496          * as first field. This is not worth a separate function.
 497          */
 498         if (fs->stlen == 0) {
 499                 (*set)(++nf, *buf, len, n);
 500                 *buf += len;
 501                 return nf;
 502         }
 503
 504         /* before doing anything save the char at *end */
 505         sav = *end;
 506         /* because it will be destroyed now: */
 507
 508         *end = ' ';     /* sentinel character */
 509         sep = scan;
 510         for (; nf < up_to; scan++) {
 511                 /*
 512                  * special case:  fs is single space, strip leading whitespace
 513                  */
 514                 while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
 515                         scan++;
 516
 517                 if (sep_arr != NULL && scan > sep)
 518                         set_element(nf, sep, (long) (scan - sep), sep_arr);
 519
 520                 if (scan >= end)
 521                         break;
 522
 523                 field = scan;
 524
 525                 while (*scan != ' ' && *scan != '\t' && *scan != '\n')
 526                         scan++;
 527
 528                 (*set)(++nf, field, (long)(scan - field), n);
 529
 530                 if (scan == end)
 531                         break;
 532
 533                 sep = scan;
 534         }
 535
 536         /* everything done, restore original char at *end */
 537         *end = sav;
 538
 539         *buf = scan;
 540         return nf;
 541 }
 542
 543 /*
 544  * posix_def_parse_field --- default field parsing.
 545  *
 546  * This is called both from get_field() and from do_split()
 547  * via (*parse_field)().  This variation is for when FS is a single space
 548  * character.  The only difference between this and def_parse_field()
 549  * is that this one does not allow newlines to separate fields.
 550  */
 551
 552 static long
 553 posix_def_parse_field(long up_to,       /* parse only up to this field number */
 554         char **buf,     /* on input: string to parse; on output: point to start next */
 555         int len,
 556         NODE *fs,
 557         Regexp *rp ATTRIBUTE_UNUSED,
 558         Setfunc set,    /* routine to set the value of the parsed field */
 559         NODE *n,
 560         NODE *dummy ATTRIBUTE_UNUSED, /* sep_arr not needed here: hence dummy */
 561         bool in_middle ATTRIBUTE_UNUSED)
 562 {
 563         char *scan = *buf;
 564         long nf = parse_high_water;
 565         char *field;
 566         char *end = scan + len;
 567         char sav;
 568
 569         if (up_to == UNLIMITED)
 570                 nf = 0;
 571         if (len == 0)
 572                 return nf;
 573
 574         /*
 575          * Nasty special case. If FS set to "", return whole record
 576          * as first field. This is not worth a separate function.
 577          */
 578         if (fs->stlen == 0) {
 579                 (*set)(++nf, *buf, len, n);
 580                 *buf += len;
 581                 return nf;
 582         }
 583
 584         /* before doing anything save the char at *end */
 585         sav = *end;
 586         /* because it will be destroyed now: */
 587
 588         *end = ' ';     /* sentinel character */
 589         for (; nf < up_to; scan++) {
 590                 /*
 591                  * special case:  fs is single space, strip leading whitespace
 592                  */
 593                 while (scan < end && (*scan == ' ' || *scan == '\t'))
 594                         scan++;
 595                 if (scan >= end)
 596                         break;
 597                 field = scan;
 598                 while (*scan != ' ' && *scan != '\t')
 599                         scan++;
 600                 (*set)(++nf, field, (long)(scan - field), n);
 601                 if (scan == end)
 602                         break;
 603         }
 604
 605         /* everything done, restore original char at *end */
 606         *end = sav;
 607
 608         *buf = scan;
 609         return nf;
 610 }
 611
 612 /*
 613  * null_parse_field --- each character is a separate field
 614  *
 615  * This is called both from get_field() and from do_split()
 616  * via (*parse_field)().  This variation is for when FS is the null string.
 617  */
 618 static long
 619 null_parse_field(long up_to,    /* parse only up to this field number */
 620         char **buf,     /* on input: string to parse; on output: point to start next */
 621         int len,
 622         NODE *fs ATTRIBUTE_UNUSED,
 623         Regexp *rp ATTRIBUTE_UNUSED,
 624         Setfunc set,    /* routine to set the value of the parsed field */
 625         NODE *n,
 626         NODE *sep_arr,  /* array of field separators (maybe NULL) */
 627         bool in_middle ATTRIBUTE_UNUSED)
 628 {
 629         char *scan = *buf;
 630         long nf = parse_high_water;
 631         char *end = scan + len;
 632
 633         if (up_to == UNLIMITED)
 634                 nf = 0;
 635         if (len == 0)
 636                 return nf;
 637
 638 #if MBS_SUPPORT
 639         if (gawk_mb_cur_max > 1) {
 640                 mbstate_t mbs;
 641                 memset(&mbs, 0, sizeof(mbstate_t));
 642                 for (; nf < up_to && scan < end;) {
 643                         size_t mbclen = mbrlen(scan, end-scan, &mbs);
 644                         if ((mbclen == 1) || (mbclen == (size_t) -1)
 645                                 || (mbclen == (size_t) -2) || (mbclen == 0)) {
 646                                 /* We treat it as a singlebyte character.  */
 647                                 mbclen = 1;
 648                         }
 649                         if (sep_arr != NULL && nf > 0)
 650                                 set_element(nf, scan, 0L, sep_arr);
 651                         (*set)(++nf, scan, mbclen, n);
 652                         scan += mbclen;
 653                 }
 654         } else
 655 #endif
 656         for (; nf < up_to && scan < end; scan++) {
 657                 if (sep_arr != NULL && nf > 0)
 658                         set_element(nf, scan, 0L, sep_arr);
 659                 (*set)(++nf, scan, 1L, n);
 660         }
 661
 662         *buf = scan;
 663         return nf;
 664 }
 665
 666 /*
 667  * sc_parse_field --- single character field separator
 668  *
 669  * This is called both from get_field() and from do_split()
 670  * via (*parse_field)().  This variation is for when FS is a single character
 671  * other than space.
 672  */
 673 static long
 674 sc_parse_field(long up_to,      /* parse only up to this field number */
 675         char **buf,     /* on input: string to parse; on output: point to start next */
 676         int len,
 677         NODE *fs,
 678         Regexp *rp ATTRIBUTE_UNUSED,
 679         Setfunc set,    /* routine to set the value of the parsed field */
 680         NODE *n,
 681         NODE *sep_arr,  /* array of field separators (maybe NULL) */
 682         bool in_middle ATTRIBUTE_UNUSED)
 683 {
 684         char *scan = *buf;
 685         char fschar;
 686         long nf = parse_high_water;
 687         char *field;
 688         char *end = scan + len;
 689         char sav;
 690 #if MBS_SUPPORT
 691         size_t mbclen = 0;
 692         mbstate_t mbs;
 693         if (gawk_mb_cur_max > 1)
 694                 memset(&mbs, 0, sizeof(mbstate_t));
 695 #endif
 696
 697         if (up_to == UNLIMITED)
 698                 nf = 0;
 699         if (len == 0)
 700                 return nf;
 701
 702         if (RS_is_null && fs->stlen == 0)
 703                 fschar = '\n';
 704         else
 705                 fschar = fs->stptr[0];
 706
 707         /* before doing anything save the char at *end */
 708         sav = *end;
 709         /* because it will be destroyed now: */
 710         *end = fschar;  /* sentinel character */
 711
 712         for (; nf < up_to;) {
 713                 field = scan;
 714 #if MBS_SUPPORT
 715                 if (gawk_mb_cur_max > 1) {
 716                         while (*scan != fschar) {
 717                                 mbclen = mbrlen(scan, end-scan, &mbs);
 718                                 if ((mbclen == 1) || (mbclen == (size_t) -1)
 719                                         || (mbclen == (size_t) -2) || (mbclen == 0)) {
 720                                         /* We treat it as a singlebyte character.  */
 721                                         mbclen = 1;
 722                                 }
 723                                 scan += mbclen;
 724                         }
 725                 } else
 726 #endif
 727                 while (*scan != fschar)
 728                         scan++;
 729                 (*set)(++nf, field, (long)(scan - field), n);
 730                 if (scan == end)
 731                         break;
 732                 if (sep_arr != NULL)
 733                         set_element(nf, scan, 1L, sep_arr);
 734                 scan++;
 735                 if (scan == end) {      /* FS at end of record */
 736                         (*set)(++nf, field, 0L, n);
 737                         break;
 738                 }
 739         }
 740
 741         /* everything done, restore original char at *end */
 742         *end = sav;
 743
 744         *buf = scan;
 745         return nf;
 746 }
 747
 748 /*
 749  * fw_parse_field --- field parsing using FIELDWIDTHS spec
 750  *
 751  * This is called from get_field() via (*parse_field)().
 752  * This variation is for fields are fixed widths.
 753  */
 754 static long
 755 fw_parse_field(long up_to,      /* parse only up to this field number */
 756         char **buf,     /* on input: string to parse; on output: point to start next */
 757         int len,
 758         NODE *fs ATTRIBUTE_UNUSED,
 759         Regexp *rp ATTRIBUTE_UNUSED,
 760         Setfunc set,    /* routine to set the value of the parsed field */
 761         NODE *n,
 762         NODE *dummy ATTRIBUTE_UNUSED, /* sep_arr not needed here: hence dummy */
 763         bool in_middle ATTRIBUTE_UNUSED)
 764 {
 765         char *scan = *buf;
 766         long nf = parse_high_water;
 767         char *end = scan + len;
 768 #if MBS_SUPPORT
 769         int nmbc;
 770         size_t mbclen;
 771         size_t mbslen;
 772         size_t lenrest;
 773         char *mbscan;
 774         mbstate_t mbs;
 775
 776         memset(&mbs, 0, sizeof(mbstate_t));
 777 #endif
 778
 779         if (up_to == UNLIMITED)
 780                 nf = 0;
 781         if (len == 0)
 782                 return nf;
 783         for (; nf < up_to && (len = FIELDWIDTHS[nf+1]) != -1; ) {
 784 #if MBS_SUPPORT
 785                 if (gawk_mb_cur_max > 1) {
 786                         nmbc = 0;
 787                         mbslen = 0;
 788                         mbscan = scan;
 789                         lenrest = end - scan;
 790                         while (nmbc < len && mbslen < lenrest) {
 791                                 mbclen = mbrlen(mbscan, end - mbscan, &mbs);
 792                                 if (   mbclen == 1
 793                                     || mbclen == (size_t) -1
 794                                     || mbclen == (size_t) -2
 795                                     || mbclen == 0) {
 796                                         /* We treat it as a singlebyte character.  */
 797                                         mbclen = 1;
 798                                 }
 799                                 if (mbclen <= end - mbscan) {
 800                                         mbscan += mbclen;
 801                                         mbslen += mbclen;
 802                                         ++nmbc;
 803                                 }
 804                         }
 805                         (*set)(++nf, scan, (long) mbslen, n);
 806                         scan += mbslen;
 807                 }
 808                 else
 809 #endif
 810                 {
 811                         if (len > end - scan)
 812                                 len = end - scan;
 813                         (*set)(++nf, scan, (long) len, n);
 814                         scan += len;
 815                 }
 816         }
 817         if (len == -1)
 818                 *buf = end;
 819         else
 820                 *buf = scan;
 821         return nf;
 822 }
 823
 824 /* invalidate_field0 --- $0 needs reconstruction */
 825
 826 void
 827 invalidate_field0()
 828 {
 829         field0_valid = false;
 830 }
 831
 832 /* get_field --- return a particular $n */
 833
 834 /* assign is not NULL if this field is on the LHS of an assign */
 835
 836 NODE **
 837 get_field(long requested, Func_ptr *assign)
 838 {
 839         bool in_middle = false;
 840         /*
 841          * if requesting whole line but some other field has been altered,
 842          * then the whole line must be rebuilt
 843          */
 844         if (requested == 0) {
 845                 if (! field0_valid) {
 846                         /* first, parse remainder of input record */
 847                         if (NF == -1) {
 848                                 NF = (*parse_field)(UNLIMITED - 1, &parse_extent,
 849                                         fields_arr[0]->stlen -
 850                                         (parse_extent - fields_arr[0]->stptr),
 851                                         save_FS, FS_regexp, set_field,
 852                                         (NODE *) NULL,
 853                                         (NODE *) NULL,
 854                                         in_middle);
 855                                 parse_high_water = NF;
 856                         }
 857                         rebuild_record();
 858                 }
 859                 if (assign != NULL)
 860                         *assign = reset_record;
 861                 return &fields_arr[0];
 862         }
 863
 864         /* assert(requested > 0); */
 865
 866 #if 0
 867         if (assign != NULL)
 868                 field0_valid = false;           /* $0 needs reconstruction */
 869 #else
 870         /*
 871          * Keep things uniform. Also, mere intention of assigning something
 872          * to $n should not make $0 invalid. Makes sense to invalidate $0
 873          * after the actual assignment is performed. Not a real issue in
 874          * the interpreter otherwise, but causes problem in the
 875          * debugger when watching or printing fields.
 876          */
 877
 878         if (assign != NULL)
 879                 *assign = invalidate_field0;    /* $0 needs reconstruction */
 880 #endif
 881
 882         if (requested <= parse_high_water)      /* already parsed this field */
 883                 return &fields_arr[requested];
 884
 885         if (NF == -1) { /* have not yet parsed to end of record */
 886                 /*
 887                  * parse up to requested fields, calling set_field() for each,
 888                  * saving in parse_extent the point where the parse left off
 889                  */
 890                 if (parse_high_water == 0)      /* starting at the beginning */
 891                         parse_extent = fields_arr[0]->stptr;
 892                 else
 893                         in_middle = true;
 894                 parse_high_water = (*parse_field)(requested, &parse_extent,
 895                      fields_arr[0]->stlen - (parse_extent - fields_arr[0]->stptr),
 896                      save_FS, NULL, set_field, (NODE *) NULL, (NODE *) NULL, in_middle);
 897
 898                 /*
 899                  * if we reached the end of the record, set NF to the number of
 900                  * fields so far.  Note that requested might actually refer to
 901                  * a field that is beyond the end of the record, but we won't
 902                  * set NF to that value at this point, since this is only a
 903                  * reference to the field and NF only gets set if the field
 904                  * is assigned to -- this case is handled below
 905                  */
 906                 if (parse_extent == fields_arr[0]->stptr + fields_arr[0]->stlen)
 907                         NF = parse_high_water;
 908                 else if (parse_field == fpat_parse_field) {
 909                         /* FPAT parsing is wierd, isolate the special cases */
 910                         char *rec_start = fields_arr[0]->stptr;
 911                         char *rec_end = fields_arr[0]->stptr + fields_arr[0]->stlen;
 912
 913                         if (    parse_extent > rec_end
 914                             || (parse_extent > rec_start && parse_extent < rec_end && requested == UNLIMITED-1))
 915                                 NF = parse_high_water;
 916                         else if (parse_extent == rec_start) /* could be no match for FPAT */
 917                                 NF = 0;
 918                 }
 919                 if (requested == UNLIMITED - 1) /* UNLIMITED-1 means set NF */
 920                         requested = parse_high_water;
 921         }
 922         if (parse_high_water < requested) { /* requested beyond end of record */
 923                 if (assign != NULL) {   /* expand record */
 924                         if (requested > nf_high_water)
 925                                 grow_fields_arr(requested);
 926
 927                         NF = requested;
 928                         parse_high_water = requested;
 929                 } else
 930                         return &Null_field;
 931         }
 932
 933         return &fields_arr[requested];
 934 }
 935
 936 /* set_element --- set an array element, used by do_split() */
 937
 938 static void
 939 set_element(long num, char *s, long len, NODE *n)
 940 {
 941         NODE *it;
 942         NODE **lhs;
 943         NODE *sub;
 944
 945         it = make_string(s, len);
 946         it->flags |= MAYBE_NUM;
 947         sub = make_number((AWKNUM) (num));
 948         lhs = assoc_lookup(n, sub);
 949         unref(*lhs);
 950         *lhs = it;
 951         if (n->astore != NULL)
 952                 (*n->astore)(n, sub);
 953         unref(sub);
 954 }
 955
 956 /* do_split --- implement split(), semantics are same as for field splitting */
 957
 958 NODE *
 959 do_split(int nargs)
 960 {
 961         NODE *src, *arr, *sep, *fs, *tmp, *sep_arr = NULL;
 962         char *s;
 963         long (*parseit)(long, char **, int, NODE *,
 964                          Regexp *, Setfunc, NODE *, NODE *, bool);
 965         Regexp *rp = NULL;
 966
 967         if (nargs == 4) {
 968                 static bool warned1 = false, warned2 = false;
 969
 970                 if (do_traditional || do_posix) {
 971                         fatal(_("split: fourth argument is a gawk extension"));
 972                 }
 973                 sep_arr = POP_PARAM();
 974                 if (sep_arr->type != Node_var_array)
 975                         fatal(_("split: fourth argument is not an array"));
 976                 if (do_lint && ! warned1) {
 977                         warned1 = true;
 978                         lintwarn(_("split: fourth argument is a gawk extension"));
 979                 }
 980                 if (do_lint_old && ! warned2) {
 981                         warned2 = true;
 982                         warning(_("split: fourth argument is a gawk extension"));
 983                 }
 984         }
 985
 986         sep = POP();
 987         arr = POP_PARAM();
 988         if (arr->type != Node_var_array)
 989                 fatal(_("split: second argument is not an array"));
 990
 991         if (sep_arr != NULL) {
 992                 if (sep_arr == arr)
 993                         fatal(_("split: cannot use the same array for second and fourth args"));
 994
 995                 /* This checks need to be done before clearing any of the arrays */
 996                 for (tmp = sep_arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
 997                         if (tmp == arr)
 998                                 fatal(_("split: cannot use a subarray of second arg for fourth arg"));
 999                 for (tmp = arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
1000                         if (tmp == sep_arr)
1001                                 fatal(_("split: cannot use a subarray of fourth arg for second arg"));
1002                 assoc_clear(sep_arr);
1003         }
1004         assoc_clear(arr);
1005
1006         src = TOP_STRING();
1007         if (src->stlen == 0) {
1008                 /*
1009                  * Skip the work if first arg is the null string.
1010                  */
1011                 tmp = POP_SCALAR();
1012                 DEREF(tmp);
1013                 return make_number((AWKNUM) 0);
1014         }
1015
1016         if (   (sep->re_flags & FS_DFLT) != 0
1017             && current_field_sep() == Using_FS
1018             && ! RS_is_null) {
1019                 parseit = parse_field;
1020                 fs = force_string(FS_node->var_value);
1021                 rp = FS_regexp;
1022         } else {
1023                 fs = sep->re_exp;
1024
1025                 if (fs->stlen == 0) {
1026                         static bool warned = false;
1027
1028                         parseit = null_parse_field;
1029
1030                         if (do_lint && ! warned) {
1031                                 warned = true;
1032                                 lintwarn(_("split: null string for third arg is a gawk extension"));
1033                         }
1034                 } else if (fs->stlen == 1 && (sep->re_flags & CONSTANT) == 0) {
1035                         if (fs->stptr[0] == ' ') {
1036                                 if (do_posix)
1037                                         parseit = posix_def_parse_field;
1038                                 else
1039                                         parseit = def_parse_field;
1040                         } else
1041                                 parseit = sc_parse_field;
1042                 } else {
1043                         parseit = re_parse_field;
1044                         rp = re_update(sep);
1045                 }
1046         }
1047
1048         s = src->stptr;
1049         tmp = make_number((AWKNUM) (*parseit)(UNLIMITED, &s, (int) src->stlen,
1050                                              fs, rp, set_element, arr, sep_arr, false));
1051
1052         src = POP_SCALAR();     /* really pop off stack */
1053         DEREF(src);
1054         return tmp;
1055 }
1056
1057 /*
1058  * do_patsplit --- implement patsplit(), semantics are same as for field
1059  *                 splitting with FPAT.
1060  */
1061
1062 NODE *
1063 do_patsplit(int nargs)
1064 {
1065         NODE *src, *arr, *sep, *fpat, *tmp, *sep_arr = NULL;
1066         char *s;
1067         Regexp *rp = NULL;
1068
1069         if (nargs == 4) {
1070                 sep_arr = POP_PARAM();
1071                 if (sep_arr->type != Node_var_array)
1072                         fatal(_("patsplit: fourth argument is not an array"));
1073         }
1074         sep = POP();
1075         arr = POP_PARAM();
1076         if (arr->type != Node_var_array)
1077                 fatal(_("patsplit: second argument is not an array"));
1078
1079         src = TOP_STRING();
1080
1081         fpat = sep->re_exp;
1082         if (fpat->stlen == 0)
1083                 fatal(_("patsplit: third argument must be non-null"));
1084
1085         if (sep_arr != NULL) {
1086                 if (sep_arr == arr)
1087                         fatal(_("patsplit: cannot use the same array for second and fourth args"));
1088
1089                 /* These checks need to be done before clearing any of the arrays */
1090                 for (tmp = sep_arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
1091                         if (tmp == arr)
1092                                 fatal(_("patsplit: cannot use a subarray of second arg for fourth arg"));
1093                 for (tmp = arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
1094                         if (tmp == sep_arr)
1095                                 fatal(_("patsplit: cannot use a subarray of fourth arg for second arg"));
1096                 assoc_clear(sep_arr);
1097         }
1098         assoc_clear(arr);
1099
1100         if (src->stlen == 0) {
1101                 /*
1102                  * Skip the work if first arg is the null string.
1103                  */
1104                 tmp =  make_number((AWKNUM) 0);
1105         } else {
1106                 rp = re_update(sep);
1107                 s = src->stptr;
1108                 tmp = make_number((AWKNUM) fpat_parse_field(UNLIMITED, &s,
1109                                 (int) src->stlen, fpat, rp,
1110                                 set_element, arr, sep_arr, false));
1111         }
1112
1113         src = POP_SCALAR();     /* really pop off stack */
1114         DEREF(src);
1115         return tmp;
1116 }
1117
1118 /* set_FIELDWIDTHS --- handle an assignment to FIELDWIDTHS */
1119
1120 void
1121 set_FIELDWIDTHS()
1122 {
1123         char *scan;
1124         char *end;
1125         int i;
1126         static int fw_alloc = 4;
1127         static bool warned = false;
1128         bool fatal_error = false;
1129         NODE *tmp;
1130
1131         if (do_lint && ! warned) {
1132                 warned = true;
1133                 lintwarn(_("`FIELDWIDTHS' is a gawk extension"));
1134         }
1135         if (do_traditional)     /* quick and dirty, does the trick */
1136                 return;
1137
1138         /*
1139          * If changing the way fields are split, obey least-suprise
1140          * semantics, and force $0 to be split totally.
1141          */
1142         if (fields_arr != NULL)
1143                 (void) get_field(UNLIMITED - 1, 0);
1144
1145         parse_field = fw_parse_field;
1146         tmp = force_string(FIELDWIDTHS_node->var_value);
1147         scan = tmp->stptr;
1148
1149         if (FIELDWIDTHS == NULL)
1150                 emalloc(FIELDWIDTHS, int *, fw_alloc * sizeof(int), "set_FIELDWIDTHS");
1151         FIELDWIDTHS[0] = 0;
1152         for (i = 1; ; i++) {
1153                 unsigned long int tmp;
1154                 if (i + 2 >= fw_alloc) {
1155                         fw_alloc *= 2;
1156                         erealloc(FIELDWIDTHS, int *, fw_alloc * sizeof(int), "set_FIELDWIDTHS");
1157                 }
1158                 /* Initialize value to be end of list */
1159                 FIELDWIDTHS[i] = -1;
1160                 /* Ensure that there is no leading `-' sign.  Otherwise,
1161                    strtoul would accept it and return a bogus result.  */
1162                 while (is_blank(*scan)) {
1163                         ++scan;
1164                 }
1165                 if (*scan == '-') {
1166                         fatal_error = true;
1167                         break;
1168                 }
1169                 if (*scan == '\0')
1170                         break;
1171
1172                 /* Detect an invalid base-10 integer, a valid value that
1173                    is followed by something other than a blank or '\0',
1174                    or a value that is not in the range [1..INT_MAX].  */
1175                 errno = 0;
1176                 tmp = strtoul(scan, &end, 10);
1177                 if (errno != 0
1178                         || (*end != '\0' && ! is_blank(*end))
1179                                 || !(0 < tmp && tmp <= INT_MAX)
1180                 ) {
1181                         fatal_error = true;
1182                         break;
1183                 }
1184                 FIELDWIDTHS[i] = tmp;
1185                 scan = end;
1186                 /* Skip past any trailing blanks.  */
1187                 while (is_blank(*scan)) {
1188                         ++scan;
1189                 }
1190                 if (*scan == '\0')
1191                         break;
1192         }
1193         FIELDWIDTHS[i+1] = -1;
1194
1195         update_PROCINFO_str("FS", "FIELDWIDTHS");
1196         if (fatal_error)
1197                 fatal(_("invalid FIELDWIDTHS value, near `%s'"),
1198                               scan);
1199 }
1200
1201 /* set_FS --- handle things when FS is assigned to */
1202
1203 void
1204 set_FS()
1205 {
1206         char buf[10];
1207         NODE *fs;
1208         static NODE *save_fs = NULL;
1209         static NODE *save_rs = NULL;
1210         bool remake_re = true;
1211
1212         /*
1213          * If changing the way fields are split, obey least-surprise
1214          * semantics, and force $0 to be split totally.
1215          */
1216         if (fields_arr != NULL)
1217                 (void) get_field(UNLIMITED - 1, 0);
1218
1219         /* It's possible that only IGNORECASE changed, or FS = FS */
1220         /*
1221          * This comparison can't use cmp_nodes(), which pays attention
1222          * to IGNORECASE, and that's not what we want.
1223          */
1224         if (save_fs
1225                 && FS_node->var_value->stlen == save_fs->stlen
1226                 && memcmp(FS_node->var_value->stptr, save_fs->stptr, save_fs->stlen) == 0
1227                 && save_rs
1228                 && RS_node->var_value->stlen == save_rs->stlen
1229                 && memcmp(RS_node->var_value->stptr, save_rs->stptr, save_rs->stlen) == 0) {
1230                 if (FS_regexp != NULL)
1231                         FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
1232
1233                 /* FS = FS */
1234                 if (current_field_sep() == Using_FS) {
1235                         return;
1236                 } else {
1237                         remake_re = false;
1238                         goto choose_fs_function;
1239                 }
1240         }
1241
1242         unref(save_fs);
1243         save_fs = dupnode(FS_node->var_value);
1244         unref(save_rs);
1245         save_rs = dupnode(RS_node->var_value);
1246         resave_fs = true;
1247
1248         /* If FS_re_no_case assignment is fatal (make_regexp in remake_re)
1249          * FS_regexp will be NULL with a non-null FS_re_yes_case.
1250          * refree() handles null argument; no need for `if (FS_regexp != NULL)' below.
1251          * Please do not remerge.
1252          */
1253         refree(FS_re_yes_case);
1254         refree(FS_re_no_case);
1255         FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
1256
1257
1258 choose_fs_function:
1259         buf[0] = '\0';
1260         default_FS = false;
1261         fs = force_string(FS_node->var_value);
1262
1263         if (! do_traditional && fs->stlen == 0) {
1264                 static bool warned = false;
1265
1266                 parse_field = null_parse_field;
1267
1268                 if (do_lint && ! warned) {
1269                         warned = true;
1270                         lintwarn(_("null string for `FS' is a gawk extension"));
1271                 }
1272         } else if (fs->stlen > 1) {
1273                 if (do_lint_old)
1274                         warning(_("old awk does not support regexps as value of `FS'"));
1275                 parse_field = re_parse_field;
1276         } else if (RS_is_null) {
1277                 /* we know that fs->stlen <= 1 */
1278                 parse_field = sc_parse_field;
1279                 if (fs->stlen == 1) {
1280                         if (fs->stptr[0] == ' ') {
1281                                 default_FS = true;
1282                                 strcpy(buf, "[ \t\n]+");
1283                         } else if (fs->stptr[0] == '\\') {
1284                                 /* yet another special case */
1285                                 strcpy(buf, "[\\\\\n]");
1286                         } else if (fs->stptr[0] != '\n')
1287                                 sprintf(buf, "[%c\n]", fs->stptr[0]);
1288                 }
1289         } else {
1290                 if (do_posix)
1291                         parse_field = posix_def_parse_field;
1292                 else
1293                         parse_field = def_parse_field;
1294
1295                 if (fs->stlen == 1) {
1296                         if (fs->stptr[0] == ' ')
1297                                 default_FS = true;
1298                         else if (fs->stptr[0] == '\\')
1299                                 /* same special case */
1300                                 strcpy(buf, "[\\\\]");
1301                         else
1302                                 parse_field = sc_parse_field;
1303                 }
1304         }
1305         if (remake_re) {
1306                 refree(FS_re_yes_case);
1307                 refree(FS_re_no_case);
1308                 FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
1309
1310                 if (buf[0] != '\0') {
1311                         FS_re_yes_case = make_regexp(buf, strlen(buf), false, true, true);
1312                         FS_re_no_case = make_regexp(buf, strlen(buf), true, true, true);
1313                         FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
1314                         parse_field = re_parse_field;
1315                 } else if (parse_field == re_parse_field) {
1316                         FS_re_yes_case = make_regexp(fs->stptr, fs->stlen, false, true, true);
1317                         FS_re_no_case = make_regexp(fs->stptr, fs->stlen, true, true, true);
1318                         FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
1319                 } else
1320                         FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
1321         }
1322
1323         /*
1324          * For FS = "c", we don't use IGNORECASE. But we must use
1325          * re_parse_field to get the character and the newline as
1326          * field separators.
1327          */
1328         if (fs->stlen == 1 && parse_field == re_parse_field)
1329                 FS_regexp = FS_re_yes_case;
1330
1331         update_PROCINFO_str("FS", "FS");
1332 }
1333
1334 /* current_field_sep --- return what field separator is */
1335
1336 field_sep_type
1337 current_field_sep()
1338 {
1339         if (parse_field == fw_parse_field)
1340                 return Using_FIELDWIDTHS;
1341         else if (parse_field == fpat_parse_field)
1342                 return Using_FPAT;
1343         else
1344                 return Using_FS;
1345 }
1346
1347 /* update_PROCINFO_str --- update PROCINFO[sub] with string value */
1348
1349 void
1350 update_PROCINFO_str(const char *subscript, const char *str)
1351 {
1352         NODE **aptr;
1353         NODE *tmp;
1354
1355         if (PROCINFO_node == NULL)
1356                 return;
1357         tmp = make_string(subscript, strlen(subscript));
1358         aptr = assoc_lookup(PROCINFO_node, tmp);
1359         unref(tmp);
1360         unref(*aptr);
1361         *aptr = make_string(str, strlen(str));
1362 }
1363
1364 /* update_PROCINFO_num --- update PROCINFO[sub] with numeric value */
1365
1366 void
1367 update_PROCINFO_num(const char *subscript, AWKNUM val)
1368 {
1369         NODE **aptr;
1370         NODE *tmp;
1371
1372         if (PROCINFO_node == NULL)
1373                 return;
1374         tmp = make_string(subscript, strlen(subscript));
1375         aptr = assoc_lookup(PROCINFO_node, tmp);
1376         unref(tmp);
1377         unref(*aptr);
1378         *aptr = make_number(val);
1379 }
1380
1381 /* set_FPAT --- handle an assignment to FPAT */
1382
1383 void
1384 set_FPAT()
1385 {
1386         static bool warned = false;
1387         static NODE *save_fpat = NULL;
1388         bool remake_re = true;
1389         NODE *fpat;
1390
1391         if (do_lint && ! warned) {
1392                 warned = true;
1393                 lintwarn(_("`FPAT' is a gawk extension"));
1394         }
1395         if (do_traditional)     /* quick and dirty, does the trick */
1396                 return;
1397
1398         /*
1399          * If changing the way fields are split, obey least-suprise
1400          * semantics, and force $0 to be split totally.
1401          */
1402         if (fields_arr != NULL)
1403                 (void) get_field(UNLIMITED - 1, 0);
1404
1405         /* It's possible that only IGNORECASE changed, or FPAT = FPAT */
1406         /*
1407          * This comparison can't use cmp_nodes(), which pays attention
1408          * to IGNORECASE, and that's not what we want.
1409          */
1410         if (save_fpat
1411                 && FPAT_node->var_value->stlen == save_fpat->stlen
1412                 && memcmp(FPAT_node->var_value->stptr, save_fpat->stptr, save_fpat->stlen) == 0) {
1413                 if (FPAT_regexp != NULL)
1414                         FPAT_regexp = (IGNORECASE ? FPAT_re_no_case : FPAT_re_yes_case);
1415
1416                 /* FPAT = FPAT */
1417                 if (current_field_sep() == Using_FPAT) {
1418                         return;
1419                 } else {
1420                         remake_re = false;
1421                         goto set_fpat_function;
1422                 }
1423         }
1424
1425         unref(save_fpat);
1426         save_fpat = dupnode(FPAT_node->var_value);
1427         refree(FPAT_re_yes_case);
1428         refree(FPAT_re_no_case);
1429         FPAT_re_yes_case = FPAT_re_no_case = FPAT_regexp = NULL;
1430
1431 set_fpat_function:
1432         fpat = force_string(FPAT_node->var_value);
1433         parse_field = fpat_parse_field;
1434
1435         if (remake_re) {
1436                 refree(FPAT_re_yes_case);
1437                 refree(FPAT_re_no_case);
1438                 FPAT_re_yes_case = FPAT_re_no_case = FPAT_regexp = NULL;
1439
1440                 FPAT_re_yes_case = make_regexp(fpat->stptr, fpat->stlen, false, true, true);
1441                 FPAT_re_no_case = make_regexp(fpat->stptr, fpat->stlen, true, true, true);
1442                 FPAT_regexp = (IGNORECASE ? FPAT_re_no_case : FPAT_re_yes_case);
1443         }
1444
1445         update_PROCINFO_str("FS", "FPAT");
1446 }
1447
1448 /*
1449  * increment_scan --- macro to move scan pointer ahead by one character.
1450  *                      Implementation varies if doing MBS or not.
1451  */
1452
1453 #if MBS_SUPPORT
1454 #define increment_scan(scanp, len) incr_scan(scanp, len, & mbs)
1455 #else
1456 #define increment_scan(scanp, len) ((*scanp)++)
1457 #endif
1458
1459 #if MBS_SUPPORT
1460 /* incr_scan --- MBS version of increment_scan() */
1461
1462 static void
1463 incr_scan(char **scanp, size_t len, mbstate_t *mbs)
1464 {
1465         size_t mbclen = 0;
1466
1467         if (gawk_mb_cur_max > 1) {
1468                 mbclen = mbrlen(*scanp, len, mbs);
1469                 if (   (mbclen == 1)
1470                     || (mbclen == (size_t) -1)
1471                     || (mbclen == (size_t) -2)
1472                     || (mbclen == 0)) {
1473                         /* We treat it as a singlebyte character.  */
1474                         mbclen = 1;
1475                 }
1476                 *scanp += mbclen;
1477         } else
1478                 (*scanp)++;
1479 }
1480 #endif
1481
1482 /*
1483  * fpat_parse_field --- parse fields using a regexp.
1484  *
1485  * This is called both from get_field() and from do_patsplit()
1486  * via (*parse_field)().  This variation is for when FPAT is a regular
1487  * expression -- use the value to find field contents.
1488  *
1489  * This was really hard to get right.  It happens to bear many resemblances
1490  * to issues I had with getting gsub right with null matches. When dealing
1491  * with that I prototyped in awk and had the foresight to save the awk code
1492  * over in the C file.  Starting with that as a base, I finally got to this
1493  * awk code to do what I needed, and then translated it into C. Fortunately
1494  * the C code bears a closer correspondance to the awk code here than over
1495  * by gsub.
1496  *
1497  * BEGIN {
1498  *      false = 0
1499  *      true = 1
1500  *
1501  *      fpat[1] = "([^,]*)|(\"[^\"]+\")"
1502  *      fpat[2] = fpat[1]
1503  *      fpat[3] = fpat[1]
1504  *      fpat[4] = "aa+"
1505  *      fpat[5] = fpat[4]
1506  *
1507  *      data[1] = "Robbins,,Arnold,"
1508  *      data[2] = "Smith,,\"1234 A Pretty Place, NE\",Sometown,NY,12345-6789,USA"
1509  *      data[3] = "Robbins,Arnold,\"1234 A Pretty Place, NE\",Sometown,NY,12345-6789,USA"
1510  *      data[4] = "bbbaaacccdddaaaaaqqqq"
1511  *      data[5] = "bbbaaacccdddaaaaaqqqqa" # should get trailing qqqa
1512  *
1513  *      for (i = 1; i in data; i++) {
1514  *              printf("Splitting: <%s>\n", data[i])
1515  *              n = mypatsplit(data[i], fields, fpat[i], seps)
1516  *              print "n =", n
1517  *              for (j = 1; j <= n; j++)
1518  *                      printf("fields[%d] = <%s>\n", j, fields[j])
1519  *              for (j = 0; j in seps; j++)
1520  *                      printf("seps[%s] = <%s>\n", j, seps[j])
1521  *      }
1522  * }
1523  *
1524  * function mypatsplit(string, array, pattern, seps,
1525  *                      eosflag, non_empty, nf) # locals
1526  * {
1527  *      delete array
1528  *      delete seps
1529  *      if (length(string) == 0)
1530  *              return 0
1531  *
1532  *      eosflag = non_empty = false
1533  *      nf = 0
1534  *      while (match(string, pattern)) {
1535  *              if (RLENGTH > 0) {      # easy case
1536  *                      non_empty = true
1537  *                      if (! (nf in seps)) {
1538  *                              if (RSTART == 1)        # match at front of string
1539  *                                      seps[nf] = ""
1540  *                              else
1541  *                                      seps[nf] = substr(string, 1, RSTART - 1)
1542  *                      }
1543  *                      array[++nf] = substr(string, RSTART, RLENGTH)
1544  *                      string = substr(string, RSTART+RLENGTH)
1545  *                      if (length(string) == 0)
1546  *                              break
1547  *              } else if (non_empty) {
1548  *                      # last match was non-empty, and at the
1549  *                      # current character we get a zero length match,
1550  *                      # which we don't want, so skip over it
1551  *                      non_empty = false
1552  *                      seps[nf] = substr(string, 1, 1)
1553  *                      string = substr(string, 2)
1554  *              } else {
1555  *                      # 0 length match
1556  *                      if (! (nf in seps)) {
1557  *                              if (RSTART == 1)
1558  *                                      seps[nf] = ""
1559  *                              else
1560  *                                      seps[nf] = substr(string, 1, RSTART - 1)
1561  *                      }
1562  *                      array[++nf] = ""
1563  *                      if (! non_empty && ! eosflag) { # prev was empty
1564  *                              seps[nf] = substr(string, 1, 1)
1565  *                      }
1566  *                      if (RSTART == 1) {
1567  *                              string = substr(string, 2)
1568  *                      } else {
1569  *                              string = substr(string, RSTART + 1)
1570  *                      }
1571  *                      non_empty = false
1572  *              }
1573  *              if (length(string) == 0) {
1574  *                      if (eosflag)
1575  *                              break
1576  *                      else
1577  *                              eosflag = true
1578  *              }
1579  *      }
1580  *      if (length(string) > 0)
1581  *              seps[nf] = string
1582  *
1583  *      return length(array)
1584  * }
1585  */
1586 static long
1587 fpat_parse_field(long up_to,    /* parse only up to this field number */
1588         char **buf,     /* on input: string to parse; on output: point to start next */
1589         int len,
1590         NODE *fs ATTRIBUTE_UNUSED,
1591         Regexp *rp,
1592         Setfunc set,    /* routine to set the value of the parsed field */
1593         NODE *n,
1594         NODE *sep_arr,  /* array of field separators (may be NULL) */
1595         bool in_middle)
1596 {
1597         char *scan = *buf;
1598         long nf = parse_high_water;
1599         char *start;
1600         char *end = scan + len;
1601         int regex_flags = RE_NEED_START;
1602         bool need_to_set_sep;
1603         bool non_empty;
1604         bool eosflag;
1605 #if MBS_SUPPORT
1606         mbstate_t mbs;
1607
1608         if (gawk_mb_cur_max > 1)
1609                 memset(&mbs, 0, sizeof(mbstate_t));
1610 #endif
1611
1612         if (up_to == UNLIMITED)
1613                 nf = 0;
1614
1615         if (len == 0)
1616                 return nf;
1617
1618         if (rp == NULL) /* use FPAT */
1619                 rp = FPAT_regexp;
1620
1621         if (in_middle) {
1622                 regex_flags |= RE_NO_BOL;
1623                 non_empty = rp->non_empty;
1624         } else
1625                 non_empty = false;
1626
1627         eosflag = false;
1628         need_to_set_sep = true;
1629         start = scan;
1630         while (research(rp, scan, 0, (end - scan), regex_flags) != -1
1631                && nf < up_to) {
1632
1633                 if (REEND(rp, scan) > RESTART(rp, scan)) { /* if (RLENGTH > 0) */
1634                         non_empty = true;
1635                         if (sep_arr != NULL && need_to_set_sep) {
1636                                 if (RESTART(rp, scan) == 0) /* match at front */
1637                                         set_element(nf, start, 0L, sep_arr);
1638                                 else
1639                                         set_element(nf,
1640                                                 start,
1641                                                 (long) RESTART(rp, scan),
1642                                                 sep_arr);
1643                         }
1644                         /* field is text that matched */
1645                         (*set)(++nf,
1646                                 scan + RESTART(rp, scan),
1647                                 (long)(REEND(rp, scan) - RESTART(rp, scan)),
1648                                 n);
1649
1650                         scan += REEND(rp, scan);
1651                         if (scan >= end)
1652                                 break;
1653                         need_to_set_sep = true;
1654                 } else if (non_empty) { /* else if non_empty */
1655                         /*
1656                          * last match was non-empty, and at the
1657                          * current character we get a zero length match,
1658                          * which we don't want, so skip over it
1659                          */
1660                         non_empty = false;
1661                         if (sep_arr != NULL) {
1662                                 need_to_set_sep = false;
1663                                 set_element(nf, start, 1L, sep_arr);
1664                         }
1665                         increment_scan(& scan, end - scan);
1666                 } else {
1667                         /* 0 length match */
1668                         if (sep_arr != NULL && need_to_set_sep) {
1669                                 if (RESTART(rp, scan) == 0) /* RSTART == 1 */
1670                                         set_element(nf, start, 0L, sep_arr);
1671                                 else
1672                                         set_element(nf, start,
1673                                                         (long) RESTART(rp, scan),
1674                                                         sep_arr);
1675                         }
1676                         need_to_set_sep = true;
1677                         (*set)(++nf, scan, 0L, n);
1678                         if (! non_empty && ! eosflag) { /* prev was empty */
1679                                 if (sep_arr != NULL) {
1680                                         set_element(nf, start, 1L, sep_arr);
1681                                         need_to_set_sep = false;
1682                                 }
1683                         }
1684                         if (RESTART(rp, scan) == 0)
1685                                 increment_scan(& scan, end - scan);
1686                         else {
1687                                 scan += RESTART(rp, scan);
1688                         }
1689                         non_empty = false;
1690                 }
1691                 if (scan >= end) { /* length(string) == 0 */
1692                         if (eosflag)
1693                                 break;
1694                         else
1695                                 eosflag = true;
1696                 }
1697
1698                 start = scan;
1699         }
1700         if (scan < end) {
1701                 if (sep_arr != NULL)
1702                         set_element(nf, scan, (long) (end - scan), sep_arr);
1703         }
1704
1705         *buf = scan;
1706         rp->non_empty = non_empty;
1707         return nf;
1708 }