2 * field.c - routines for dealing with fields and record parsing
6 * Copyright (C) 1986, 1988, 1989, 1991-2014 the Free Software Foundation, Inc.
8 * This file is part of GAWK, the GNU implementation of the
9 * AWK Programming Language.
11 * GAWK is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 3 of the License, or
14 * (at your option) any later version.
16 * GAWK is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
29 * In case that the system doesn't have isblank().
30 * Don't bother with autoconf ifdef junk, just force it.
31 * See dfa.c and regex_internal.h and regcomp.c. Bleah.
36 return c == ' ' || c == '\t';
/*
 * Setfunc --- callback used by the field parsers to store one parsed piece.
 * Both set_field() and set_element() (declared below) have this shape:
 * an index number, a pointer to the start of the text, its length, and
 * a NODE that set_element() uses as the array and set_field() ignores.
 */
39 typedef void (* Setfunc)(long, char *, long, NODE *);
/*
 * parse_field --- pointer to the field-splitting routine currently in use.
 * Elsewhere in this file it is assigned one of the *_parse_field functions
 * declared below and is called through as (*parse_field)(...).
 *
 * All parsers share one signature.  Going by the parameter comments on the
 * definitions later in the file, the arguments are, in order: the field
 * number to parse up to, in/out pointer to the text to parse, its length,
 * the FS value, a Regexp (may be NULL), the Setfunc used to store each field,
 * a NODE handed through to that Setfunc, an array for separators (may be
 * NULL), and an "in the middle of a record" flag.
 */
41 static long (*parse_field)(long, char **, int, NODE *,
42 Regexp *, Setfunc, NODE *, NODE *, bool);
/* rebuild_record --- fix up $0 after a field has been assigned to */
43 static void rebuild_record(void);
/* re_parse_field --- parse fields using a regexp */
44 static long re_parse_field(long, char **, int, NODE *,
45 Regexp *, Setfunc, NODE *, NODE *, bool);
/* def_parse_field --- default parsing, for when FS is a single space */
46 static long def_parse_field(long, char **, int, NODE *,
47 Regexp *, Setfunc, NODE *, NODE *, bool);
/* posix_def_parse_field --- like def_parse_field, but newlines do not separate fields */
48 static long posix_def_parse_field(long, char **, int, NODE *,
49 Regexp *, Setfunc, NODE *, NODE *, bool);
/* null_parse_field --- each character is a separate field (FS is the null string) */
50 static long null_parse_field(long, char **, int, NODE *,
51 Regexp *, Setfunc, NODE *, NODE *, bool);
/* sc_parse_field --- FS is a single character */
52 static long sc_parse_field(long, char **, int, NODE *,
53 Regexp *, Setfunc, NODE *, NODE *, bool);
/* fw_parse_field --- fixed-width fields taken from the FIELDWIDTHS spec */
54 static long fw_parse_field(long, char **, int, NODE *,
55 Regexp *, Setfunc, NODE *, NODE *, bool);
/* fpat_parse_field --- FPAT regexp describes field contents rather than separators */
56 static long fpat_parse_field(long, char **, int, NODE *,
57 Regexp *, Setfunc, NODE *, NODE *, bool);
/* set_element --- Setfunc that stores into an array; used by do_split() */
58 static void set_element(long num, char * str, long len, NODE *arr);
/* grow_fields_arr --- enlarge fields_arr so that index num is usable */
59 static void grow_fields_arr(long num);
/* set_field --- Setfunc that sets the value of field number num; last argument is unused */
60 static void set_field(long num, char *str, long len, NODE *dummy);
62 static char *parse_extent; /* marks where to restart parse of record */
63 static long parse_high_water = 0; /* field number that we have parsed so far */
64 static long nf_high_water = 0; /* size of fields_arr */
65 static bool resave_fs;
66 static NODE *save_FS; /* save current value of FS when line is read,
67 * to be used in deferred parsing
69 static int *FIELDWIDTHS = NULL;
71 NODE **fields_arr; /* array of pointers to the field nodes */
72 bool field0_valid; /* $(>0) has not been changed yet */
73 int default_FS; /* true when FS == " " */
74 Regexp *FS_re_yes_case = NULL;
75 Regexp *FS_re_no_case = NULL;
76 Regexp *FS_regexp = NULL;
77 Regexp *FPAT_re_yes_case = NULL;
78 Regexp *FPAT_re_no_case = NULL;
79 Regexp *FPAT_regexp = NULL;
80 NODE *Null_field = NULL;
82 /* init_fields --- set up the fields array to start with */
87 emalloc(fields_arr, NODE **, sizeof(NODE *), "init_fields");
89 getnode(fields_arr[0]);
90 *fields_arr[0] = *Nnull_string;
91 fields_arr[0]->flags |= NULL_FIELD;
93 parse_extent = fields_arr[0]->stptr;
94 save_FS = dupnode(FS_node->var_value);
97 *Null_field = *Nnull_string;
98 Null_field->valref = 1;
99 Null_field->flags = (FIELD|STRCUR|STRING|NULL_FIELD);
104 /* grow_fields_arr --- acquire new fields as needed */
107 grow_fields_arr(long num)
112 erealloc(fields_arr, NODE **, (num + 1) * sizeof(NODE *), "grow_fields_arr");
113 for (t = nf_high_water + 1; t <= num; t++) {
121 /* set_field --- set the value of a particular field */
128 NODE *dummy ATTRIBUTE_UNUSED) /* just to make interface same as set_element */
132 if (num > nf_high_water)
133 grow_fields_arr(num);
137 n->flags = (STRCUR|STRING|MAYBE_NUM|FIELD);
140 /* rebuild_record --- Someone assigned a value to $(something).
141 Fix up $0 to be right */
147 * use explicit unsigned longs for lengths, in case
148 * a size_t isn't big enough.
151 unsigned long ofslen;
161 ofs = force_string(OFS_node->var_value);
163 for (i = NF; i > 0; i--) {
165 tmp = force_string(tmp);
168 tlen += (NF - 1) * ofslen;
171 emalloc(ops, char *, tlen + 2, "rebuild_record");
174 for (i = 1; i <= NF; i++) {
175 free_wstr(fields_arr[i]);
179 *cops++ = tmp->stptr[0];
180 else if (tmp->stlen != 0) {
181 memcpy(cops, tmp->stptr, tmp->stlen);
187 *cops++ = ofs->stptr[0];
188 else if (ofslen != 0) {
189 memcpy(cops, ofs->stptr, ofslen);
194 tmp = make_str_node(ops, tlen, ALREADY_MALLOCED);
197 * Since we are about to unref fields_arr[0], we want to find
198 * any fields that still point into it, and have them point
199 * into the new field zero. This has to be done intelligently,
200 * so that unrefing a field doesn't try to unref into the old $0.
202 for (cops = ops, i = 1; i <= NF; i++) {
203 NODE *r = fields_arr[i];
208 if ((r->flags & FIELD) == 0) {
211 if ((r->flags & (NUMCUR|NUMBER)) != 0) {
212 n->flags |= (r->flags & (MPFN|MPZN|NUMCUR|NUMBER));
214 if (is_mpg_float(r)) {
215 mpfr_init(n->mpg_numbr);
216 mpfr_set(n->mpg_numbr, r->mpg_numbr, ROUND_MODE);
217 } else if (is_mpg_integer(r)) {
219 mpz_set(n->mpg_i, r->mpg_i);
226 n->flags &= ~(MALLOC|STRING);
232 assert((n->flags & WSTRCUR) == 0);
234 cops += fields_arr[i]->stlen + ofslen;
237 unref(fields_arr[0]);
245 * setup $0, but defer parsing rest of line until reference is made to $(>0)
246 * or to NF. At that point, parse only as much as necessary.
248 * Manage a private buffer for the contents of $0. Doing so keeps us safe
249 * if `getline var' decides to rearrange the contents of the IOBUF that
250 * $0 might have been pointing into. The cost is the copying of the buffer;
251 * but better correct than fast.
254 set_record(const char *buf, int cnt)
257 static char *databuf;
258 static unsigned long databuf_size;
259 #define INITIAL_SIZE 512
260 #define MAX_SIZE ((unsigned long) ~0) /* maximally portable ... */
264 /* buffer management: */
265 if (databuf_size == 0) { /* first time */
266 emalloc(databuf, char *, INITIAL_SIZE, "set_record");
267 databuf_size = INITIAL_SIZE;
268 memset(databuf, '\0', INITIAL_SIZE);
272 * Make sure there's enough room. Since we sometimes need
273 * to place a sentinel at the end, we make sure
274 * databuf_size is > cnt after allocation.
276 if (cnt >= databuf_size) {
277 while (cnt >= databuf_size && databuf_size <= MAX_SIZE)
279 erealloc(databuf, char *, databuf_size, "set_record");
280 memset(databuf, '\0', databuf_size);
283 memcpy(databuf, buf, cnt);
285 /* manage field 0: */
286 unref(fields_arr[0]);
293 n->flags = (STRING|STRCUR|MAYBE_NUM|FIELD);
300 /* reset_record --- start over again with current $0 */
308 fields_arr[0] = force_string(fields_arr[0]);
311 for (i = 1; i <= parse_high_water; i++) {
312 unref(fields_arr[i]);
318 parse_high_water = 0;
320 * $0 = $0 should resplit using the current value of FS.
325 save_FS = dupnode(FS_node->var_value);
331 /* set_NF --- handle what happens to $0 and fields when NF is changed */
342 (void) force_number(NF_node->var_value);
343 nf = get_number_si(NF_node->var_value);
345 fatal(_("NF set to negative value"));
348 if (NF > nf_high_water)
350 if (parse_high_water < NF) {
351 for (i = parse_high_water + 1; i >= 0 && i <= NF; i++) {
352 unref(fields_arr[i]);
357 parse_high_water = NF;
358 } else if (parse_high_water > 0) {
359 for (i = NF + 1; i >= 0 && i <= parse_high_water; i++) {
360 unref(fields_arr[i]);
365 parse_high_water = NF;
367 field0_valid = false;
371 * re_parse_field --- parse fields using a regexp.
373 * This is called both from get_field() and from do_split()
374 * via (*parse_field)(). This variation is for when FS is a regular
375 * expression -- either user-defined or because RS=="" and FS==" "
378 re_parse_field(long up_to, /* parse only up to this field number */
379 char **buf, /* on input: string to parse; on output: point to start next */
381 NODE *fs ATTRIBUTE_UNUSED,
383 Setfunc set, /* routine to set the value of the parsed field */
385 NODE *sep_arr, /* array of field separators (maybe NULL) */
389 long nf = parse_high_water;
391 char *end = scan + len;
392 int regex_flags = RE_NEED_START;
397 if (gawk_mb_cur_max > 1)
398 memset(&mbs, 0, sizeof(mbstate_t));
402 regex_flags |= RE_NO_BOL;
404 if (up_to == UNLIMITED)
409 if (RS_is_null && default_FS) {
411 while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
413 if (sep_arr != NULL && sep < scan)
414 set_element(nf, sep, (long)(scan - sep), sep_arr);
417 if (rp == NULL) /* use FS */
422 && research(rp, scan, 0, (end - scan), regex_flags) != -1
424 regex_flags |= RE_NO_BOL;
425 if (REEND(rp, scan) == RESTART(rp, scan)) { /* null match */
427 if (gawk_mb_cur_max > 1) {
428 mbclen = mbrlen(scan, end-scan, &mbs);
429 if ((mbclen == 1) || (mbclen == (size_t) -1)
430 || (mbclen == (size_t) -2) || (mbclen == 0)) {
431 /* We treat it as a singlebyte character. */
439 (*set)(++nf, field, (long)(scan - field), n);
446 (long)(scan + RESTART(rp, scan) - field), n);
448 set_element(nf, scan + RESTART(rp, scan),
449 (long) (REEND(rp, scan) - RESTART(rp, scan)), sep_arr);
450 scan += REEND(rp, scan);
452 if (scan == end) /* FS at end of record */
453 (*set)(++nf, field, 0L, n);
455 if (nf != up_to && scan < end) {
456 (*set)(++nf, scan, (long)(end - scan), n);
464 * def_parse_field --- default field parsing.
466 * This is called both from get_field() and from do_split()
467 * via (*parse_field)(). This variation is for when FS is a single space
472 def_parse_field(long up_to, /* parse only up to this field number */
473 char **buf, /* on input: string to parse; on output: point to start next */
476 Regexp *rp ATTRIBUTE_UNUSED,
477 Setfunc set, /* routine to set the value of the parsed field */
479 NODE *sep_arr, /* array of field separators (maybe NULL) */
480 bool in_middle ATTRIBUTE_UNUSED)
483 long nf = parse_high_water;
485 char *end = scan + len;
489 if (up_to == UNLIMITED)
495 * Nasty special case. If FS set to "", return whole record
496 * as first field. This is not worth a separate function.
498 if (fs->stlen == 0) {
499 (*set)(++nf, *buf, len, n);
504 /* before doing anything save the char at *end */
506 /* because it will be destroyed now: */
508 *end = ' '; /* sentinel character */
510 for (; nf < up_to; scan++) {
512 * special case: fs is single space, strip leading whitespace
514 while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
517 if (sep_arr != NULL && scan > sep)
518 set_element(nf, sep, (long) (scan - sep), sep_arr);
525 while (*scan != ' ' && *scan != '\t' && *scan != '\n')
528 (*set)(++nf, field, (long)(scan - field), n);
536 /* everything done, restore original char at *end */
544 * posix_def_parse_field --- default field parsing.
546 * This is called both from get_field() and from do_split()
547 * via (*parse_field)(). This variation is for when FS is a single space
548 * character. The only difference between this and def_parse_field()
549 * is that this one does not allow newlines to separate fields.
553 posix_def_parse_field(long up_to, /* parse only up to this field number */
554 char **buf, /* on input: string to parse; on output: point to start next */
557 Regexp *rp ATTRIBUTE_UNUSED,
558 Setfunc set, /* routine to set the value of the parsed field */
560 NODE *dummy ATTRIBUTE_UNUSED, /* sep_arr not needed here: hence dummy */
561 bool in_middle ATTRIBUTE_UNUSED)
564 long nf = parse_high_water;
566 char *end = scan + len;
569 if (up_to == UNLIMITED)
575 * Nasty special case. If FS set to "", return whole record
576 * as first field. This is not worth a separate function.
578 if (fs->stlen == 0) {
579 (*set)(++nf, *buf, len, n);
584 /* before doing anything save the char at *end */
586 /* because it will be destroyed now: */
588 *end = ' '; /* sentinel character */
589 for (; nf < up_to; scan++) {
591 * special case: fs is single space, strip leading whitespace
593 while (scan < end && (*scan == ' ' || *scan == '\t'))
598 while (*scan != ' ' && *scan != '\t')
600 (*set)(++nf, field, (long)(scan - field), n);
605 /* everything done, restore original char at *end */
613 * null_parse_field --- each character is a separate field
615 * This is called both from get_field() and from do_split()
616 * via (*parse_field)(). This variation is for when FS is the null string.
619 null_parse_field(long up_to, /* parse only up to this field number */
620 char **buf, /* on input: string to parse; on output: point to start next */
622 NODE *fs ATTRIBUTE_UNUSED,
623 Regexp *rp ATTRIBUTE_UNUSED,
624 Setfunc set, /* routine to set the value of the parsed field */
626 NODE *sep_arr, /* array of field separators (maybe NULL) */
627 bool in_middle ATTRIBUTE_UNUSED)
630 long nf = parse_high_water;
631 char *end = scan + len;
633 if (up_to == UNLIMITED)
639 if (gawk_mb_cur_max > 1) {
641 memset(&mbs, 0, sizeof(mbstate_t));
642 for (; nf < up_to && scan < end;) {
643 size_t mbclen = mbrlen(scan, end-scan, &mbs);
644 if ((mbclen == 1) || (mbclen == (size_t) -1)
645 || (mbclen == (size_t) -2) || (mbclen == 0)) {
646 /* We treat it as a singlebyte character. */
649 if (sep_arr != NULL && nf > 0)
650 set_element(nf, scan, 0L, sep_arr);
651 (*set)(++nf, scan, mbclen, n);
656 for (; nf < up_to && scan < end; scan++) {
657 if (sep_arr != NULL && nf > 0)
658 set_element(nf, scan, 0L, sep_arr);
659 (*set)(++nf, scan, 1L, n);
667 * sc_parse_field --- single character field separator
669 * This is called both from get_field() and from do_split()
670 * via (*parse_field)(). This variation is for when FS is a single character
674 sc_parse_field(long up_to, /* parse only up to this field number */
675 char **buf, /* on input: string to parse; on output: point to start next */
678 Regexp *rp ATTRIBUTE_UNUSED,
679 Setfunc set, /* routine to set the value of the parsed field */
681 NODE *sep_arr, /* array of field separators (maybe NULL) */
682 bool in_middle ATTRIBUTE_UNUSED)
686 long nf = parse_high_water;
688 char *end = scan + len;
693 if (gawk_mb_cur_max > 1)
694 memset(&mbs, 0, sizeof(mbstate_t));
697 if (up_to == UNLIMITED)
702 if (RS_is_null && fs->stlen == 0)
705 fschar = fs->stptr[0];
707 /* before doing anything save the char at *end */
709 /* because it will be destroyed now: */
710 *end = fschar; /* sentinel character */
712 for (; nf < up_to;) {
715 if (gawk_mb_cur_max > 1) {
716 while (*scan != fschar) {
717 mbclen = mbrlen(scan, end-scan, &mbs);
718 if ((mbclen == 1) || (mbclen == (size_t) -1)
719 || (mbclen == (size_t) -2) || (mbclen == 0)) {
720 /* We treat it as a singlebyte character. */
727 while (*scan != fschar)
729 (*set)(++nf, field, (long)(scan - field), n);
733 set_element(nf, scan, 1L, sep_arr);
735 if (scan == end) { /* FS at end of record */
736 (*set)(++nf, field, 0L, n);
741 /* everything done, restore original char at *end */
749 * fw_parse_field --- field parsing using FIELDWIDTHS spec
751 * This is called from get_field() via (*parse_field)().
752 * This variation is for when fields are fixed widths.
755 fw_parse_field(long up_to, /* parse only up to this field number */
756 char **buf, /* on input: string to parse; on output: point to start next */
758 NODE *fs ATTRIBUTE_UNUSED,
759 Regexp *rp ATTRIBUTE_UNUSED,
760 Setfunc set, /* routine to set the value of the parsed field */
762 NODE *dummy ATTRIBUTE_UNUSED, /* sep_arr not needed here: hence dummy */
763 bool in_middle ATTRIBUTE_UNUSED)
766 long nf = parse_high_water;
767 char *end = scan + len;
776 memset(&mbs, 0, sizeof(mbstate_t));
779 if (up_to == UNLIMITED)
783 for (; nf < up_to && (len = FIELDWIDTHS[nf+1]) != -1; ) {
785 if (gawk_mb_cur_max > 1) {
789 lenrest = end - scan;
790 while (nmbc < len && mbslen < lenrest) {
791 mbclen = mbrlen(mbscan, end - mbscan, &mbs);
793 || mbclen == (size_t) -1
794 || mbclen == (size_t) -2
796 /* We treat it as a singlebyte character. */
799 if (mbclen <= end - mbscan) {
805 (*set)(++nf, scan, (long) mbslen, n);
811 if (len > end - scan)
813 (*set)(++nf, scan, (long) len, n);
824 /* invalidate_field0 --- $0 needs reconstruction */
829 field0_valid = false;
832 /* get_field --- return a particular $n */
834 /* assign is not NULL if this field is on the LHS of an assign */
837 get_field(long requested, Func_ptr *assign)
839 bool in_middle = false;
841 * if requesting whole line but some other field has been altered,
842 * then the whole line must be rebuilt
844 if (requested == 0) {
845 if (! field0_valid) {
846 /* first, parse remainder of input record */
848 NF = (*parse_field)(UNLIMITED - 1, &parse_extent,
849 fields_arr[0]->stlen -
850 (parse_extent - fields_arr[0]->stptr),
851 save_FS, FS_regexp, set_field,
855 parse_high_water = NF;
860 *assign = reset_record;
861 return &fields_arr[0];
864 /* assert(requested > 0); */
868 field0_valid = false; /* $0 needs reconstruction */
871 * Keep things uniform. Also, mere intention of assigning something
872 * to $n should not make $0 invalid. Makes sense to invalidate $0
873 * after the actual assignment is performed. Not a real issue in
874 * the interpreter otherwise, but causes problem in the
875 * debugger when watching or printing fields.
879 *assign = invalidate_field0; /* $0 needs reconstruction */
882 if (requested <= parse_high_water) /* already parsed this field */
883 return &fields_arr[requested];
885 if (NF == -1) { /* have not yet parsed to end of record */
887 * parse up to requested fields, calling set_field() for each,
888 * saving in parse_extent the point where the parse left off
890 if (parse_high_water == 0) /* starting at the beginning */
891 parse_extent = fields_arr[0]->stptr;
894 parse_high_water = (*parse_field)(requested, &parse_extent,
895 fields_arr[0]->stlen - (parse_extent - fields_arr[0]->stptr),
896 save_FS, NULL, set_field, (NODE *) NULL, (NODE *) NULL, in_middle);
899 * if we reached the end of the record, set NF to the number of
900 * fields so far. Note that requested might actually refer to
901 * a field that is beyond the end of the record, but we won't
902 * set NF to that value at this point, since this is only a
903 * reference to the field and NF only gets set if the field
904 * is assigned to -- this case is handled below
906 if (parse_extent == fields_arr[0]->stptr + fields_arr[0]->stlen)
907 NF = parse_high_water;
908 else if (parse_field == fpat_parse_field) {
909 /* FPAT parsing is weird, isolate the special cases */
910 char *rec_start = fields_arr[0]->stptr;
911 char *rec_end = fields_arr[0]->stptr + fields_arr[0]->stlen;
913 if ( parse_extent > rec_end
914 || (parse_extent > rec_start && parse_extent < rec_end && requested == UNLIMITED-1))
915 NF = parse_high_water;
916 else if (parse_extent == rec_start) /* could be no match for FPAT */
919 if (requested == UNLIMITED - 1) /* UNLIMITED-1 means set NF */
920 requested = parse_high_water;
922 if (parse_high_water < requested) { /* requested beyond end of record */
923 if (assign != NULL) { /* expand record */
924 if (requested > nf_high_water)
925 grow_fields_arr(requested);
928 parse_high_water = requested;
933 return &fields_arr[requested];
936 /* set_element --- set an array element, used by do_split() */
939 set_element(long num, char *s, long len, NODE *n)
945 it = make_string(s, len);
946 it->flags |= MAYBE_NUM;
947 sub = make_number((AWKNUM) (num));
948 lhs = assoc_lookup(n, sub);
951 if (n->astore != NULL)
952 (*n->astore)(n, sub);
956 /* do_split --- implement split(), semantics are same as for field splitting */
961 NODE *src, *arr, *sep, *fs, *tmp, *sep_arr = NULL;
963 long (*parseit)(long, char **, int, NODE *,
964 Regexp *, Setfunc, NODE *, NODE *, bool);
968 static bool warned1 = false, warned2 = false;
970 if (do_traditional || do_posix) {
971 fatal(_("split: fourth argument is a gawk extension"));
973 sep_arr = POP_PARAM();
974 if (sep_arr->type != Node_var_array)
975 fatal(_("split: fourth argument is not an array"));
976 if (do_lint && ! warned1) {
978 lintwarn(_("split: fourth argument is a gawk extension"));
980 if (do_lint_old && ! warned2) {
982 warning(_("split: fourth argument is a gawk extension"));
988 if (arr->type != Node_var_array)
989 fatal(_("split: second argument is not an array"));
991 if (sep_arr != NULL) {
993 fatal(_("split: cannot use the same array for second and fourth args"));
995 /* These checks need to be done before clearing any of the arrays */
996 for (tmp = sep_arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
998 fatal(_("split: cannot use a subarray of second arg for fourth arg"));
999 for (tmp = arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
1001 fatal(_("split: cannot use a subarray of fourth arg for second arg"));
1002 assoc_clear(sep_arr);
1007 if (src->stlen == 0) {
1009 * Skip the work if first arg is the null string.
1013 return make_number((AWKNUM) 0);
1016 if ( (sep->re_flags & FS_DFLT) != 0
1017 && current_field_sep() == Using_FS
1019 parseit = parse_field;
1020 fs = force_string(FS_node->var_value);
1025 if (fs->stlen == 0) {
1026 static bool warned = false;
1028 parseit = null_parse_field;
1030 if (do_lint && ! warned) {
1032 lintwarn(_("split: null string for third arg is a gawk extension"));
1034 } else if (fs->stlen == 1 && (sep->re_flags & CONSTANT) == 0) {
1035 if (fs->stptr[0] == ' ') {
1037 parseit = posix_def_parse_field;
1039 parseit = def_parse_field;
1041 parseit = sc_parse_field;
1043 parseit = re_parse_field;
1044 rp = re_update(sep);
1049 tmp = make_number((AWKNUM) (*parseit)(UNLIMITED, &s, (int) src->stlen,
1050 fs, rp, set_element, arr, sep_arr, false));
1052 src = POP_SCALAR(); /* really pop off stack */
1058 * do_patsplit --- implement patsplit(), semantics are same as for field
1059 * splitting with FPAT.
1063 do_patsplit(int nargs)
1065 NODE *src, *arr, *sep, *fpat, *tmp, *sep_arr = NULL;
1070 sep_arr = POP_PARAM();
1071 if (sep_arr->type != Node_var_array)
1072 fatal(_("patsplit: fourth argument is not an array"));
1076 if (arr->type != Node_var_array)
1077 fatal(_("patsplit: second argument is not an array"));
1082 if (fpat->stlen == 0)
1083 fatal(_("patsplit: third argument must be non-null"));
1085 if (sep_arr != NULL) {
1087 fatal(_("patsplit: cannot use the same array for second and fourth args"));
1089 /* These checks need to be done before clearing any of the arrays */
1090 for (tmp = sep_arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
1092 fatal(_("patsplit: cannot use a subarray of second arg for fourth arg"));
1093 for (tmp = arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
1095 fatal(_("patsplit: cannot use a subarray of fourth arg for second arg"));
1096 assoc_clear(sep_arr);
1100 if (src->stlen == 0) {
1102 * Skip the work if first arg is the null string.
1104 tmp = make_number((AWKNUM) 0);
1106 rp = re_update(sep);
1108 tmp = make_number((AWKNUM) fpat_parse_field(UNLIMITED, &s,
1109 (int) src->stlen, fpat, rp,
1110 set_element, arr, sep_arr, false));
1113 src = POP_SCALAR(); /* really pop off stack */
1118 /* set_FIELDWIDTHS --- handle an assignment to FIELDWIDTHS */
1126 static int fw_alloc = 4;
1127 static bool warned = false;
1128 bool fatal_error = false;
1131 if (do_lint && ! warned) {
1133 lintwarn(_("`FIELDWIDTHS' is a gawk extension"));
1135 if (do_traditional) /* quick and dirty, does the trick */
1139 * If changing the way fields are split, obey least-surprise
1140 * semantics, and force $0 to be split totally.
1142 if (fields_arr != NULL)
1143 (void) get_field(UNLIMITED - 1, 0);
1145 parse_field = fw_parse_field;
1146 tmp = force_string(FIELDWIDTHS_node->var_value);
1149 if (FIELDWIDTHS == NULL)
1150 emalloc(FIELDWIDTHS, int *, fw_alloc * sizeof(int), "set_FIELDWIDTHS");
1152 for (i = 1; ; i++) {
1153 unsigned long int tmp;
1154 if (i + 2 >= fw_alloc) {
1156 erealloc(FIELDWIDTHS, int *, fw_alloc * sizeof(int), "set_FIELDWIDTHS");
1158 /* Initialize value to be end of list */
1159 FIELDWIDTHS[i] = -1;
1160 /* Ensure that there is no leading `-' sign. Otherwise,
1161 strtoul would accept it and return a bogus result. */
1162 while (is_blank(*scan)) {
1172 /* Detect an invalid base-10 integer, a valid value that
1173 is followed by something other than a blank or '\0',
1174 or a value that is not in the range [1..INT_MAX]. */
1176 tmp = strtoul(scan, &end, 10);
1178 || (*end != '\0' && ! is_blank(*end))
1179 || !(0 < tmp && tmp <= INT_MAX)
1184 FIELDWIDTHS[i] = tmp;
1186 /* Skip past any trailing blanks. */
1187 while (is_blank(*scan)) {
1193 FIELDWIDTHS[i+1] = -1;
1195 update_PROCINFO_str("FS", "FIELDWIDTHS");
1197 fatal(_("invalid FIELDWIDTHS value, near `%s'"),
1201 /* set_FS --- handle things when FS is assigned to */
1208 static NODE *save_fs = NULL;
1209 static NODE *save_rs = NULL;
1210 bool remake_re = true;
1213 * If changing the way fields are split, obey least-surprise
1214 * semantics, and force $0 to be split totally.
1216 if (fields_arr != NULL)
1217 (void) get_field(UNLIMITED - 1, 0);
1219 /* It's possible that only IGNORECASE changed, or FS = FS */
1221 * This comparison can't use cmp_nodes(), which pays attention
1222 * to IGNORECASE, and that's not what we want.
1225 && FS_node->var_value->stlen == save_fs->stlen
1226 && memcmp(FS_node->var_value->stptr, save_fs->stptr, save_fs->stlen) == 0
1228 && RS_node->var_value->stlen == save_rs->stlen
1229 && memcmp(RS_node->var_value->stptr, save_rs->stptr, save_rs->stlen) == 0) {
1230 if (FS_regexp != NULL)
1231 FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
1234 if (current_field_sep() == Using_FS) {
1238 goto choose_fs_function;
1243 save_fs = dupnode(FS_node->var_value);
1245 save_rs = dupnode(RS_node->var_value);
1248 /* If FS_re_no_case assignment is fatal (make_regexp in remake_re)
1249 * FS_regexp will be NULL with a non-null FS_re_yes_case.
1250 * refree() handles null argument; no need for `if (FS_regexp != NULL)' below.
1251 * Please do not remerge.
1253 refree(FS_re_yes_case);
1254 refree(FS_re_no_case);
1255 FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
1261 fs = force_string(FS_node->var_value);
1263 if (! do_traditional && fs->stlen == 0) {
1264 static bool warned = false;
1266 parse_field = null_parse_field;
1268 if (do_lint && ! warned) {
1270 lintwarn(_("null string for `FS' is a gawk extension"));
1272 } else if (fs->stlen > 1) {
1274 warning(_("old awk does not support regexps as value of `FS'"));
1275 parse_field = re_parse_field;
1276 } else if (RS_is_null) {
1277 /* we know that fs->stlen <= 1 */
1278 parse_field = sc_parse_field;
1279 if (fs->stlen == 1) {
1280 if (fs->stptr[0] == ' ') {
1282 strcpy(buf, "[ \t\n]+");
1283 } else if (fs->stptr[0] == '\\') {
1284 /* yet another special case */
1285 strcpy(buf, "[\\\\\n]");
1286 } else if (fs->stptr[0] != '\n')
1287 sprintf(buf, "[%c\n]", fs->stptr[0]);
1291 parse_field = posix_def_parse_field;
1293 parse_field = def_parse_field;
1295 if (fs->stlen == 1) {
1296 if (fs->stptr[0] == ' ')
1298 else if (fs->stptr[0] == '\\')
1299 /* same special case */
1300 strcpy(buf, "[\\\\]");
1302 parse_field = sc_parse_field;
1306 refree(FS_re_yes_case);
1307 refree(FS_re_no_case);
1308 FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
1310 if (buf[0] != '\0') {
1311 FS_re_yes_case = make_regexp(buf, strlen(buf), false, true, true);
1312 FS_re_no_case = make_regexp(buf, strlen(buf), true, true, true);
1313 FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
1314 parse_field = re_parse_field;
1315 } else if (parse_field == re_parse_field) {
1316 FS_re_yes_case = make_regexp(fs->stptr, fs->stlen, false, true, true);
1317 FS_re_no_case = make_regexp(fs->stptr, fs->stlen, true, true, true);
1318 FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
1320 FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
1324 * For FS = "c", we don't use IGNORECASE. But we must use
1325 * re_parse_field to get the character and the newline as
1328 if (fs->stlen == 1 && parse_field == re_parse_field)
1329 FS_regexp = FS_re_yes_case;
1331 update_PROCINFO_str("FS", "FS");
1334 /* current_field_sep --- return what field separator is */
1339 if (parse_field == fw_parse_field)
1340 return Using_FIELDWIDTHS;
1341 else if (parse_field == fpat_parse_field)
1347 /* update_PROCINFO_str --- update PROCINFO[sub] with string value */
1350 update_PROCINFO_str(const char *subscript, const char *str)
1355 if (PROCINFO_node == NULL)
1357 tmp = make_string(subscript, strlen(subscript));
1358 aptr = assoc_lookup(PROCINFO_node, tmp);
1361 *aptr = make_string(str, strlen(str));
1364 /* update_PROCINFO_num --- update PROCINFO[sub] with numeric value */
1367 update_PROCINFO_num(const char *subscript, AWKNUM val)
1372 if (PROCINFO_node == NULL)
1374 tmp = make_string(subscript, strlen(subscript));
1375 aptr = assoc_lookup(PROCINFO_node, tmp);
1378 *aptr = make_number(val);
1381 /* set_FPAT --- handle an assignment to FPAT */
1386 static bool warned = false;
1387 static NODE *save_fpat = NULL;
1388 bool remake_re = true;
1391 if (do_lint && ! warned) {
1393 lintwarn(_("`FPAT' is a gawk extension"));
1395 if (do_traditional) /* quick and dirty, does the trick */
1399 * If changing the way fields are split, obey least-surprise
1400 * semantics, and force $0 to be split totally.
1402 if (fields_arr != NULL)
1403 (void) get_field(UNLIMITED - 1, 0);
1405 /* It's possible that only IGNORECASE changed, or FPAT = FPAT */
1407 * This comparison can't use cmp_nodes(), which pays attention
1408 * to IGNORECASE, and that's not what we want.
1411 && FPAT_node->var_value->stlen == save_fpat->stlen
1412 && memcmp(FPAT_node->var_value->stptr, save_fpat->stptr, save_fpat->stlen) == 0) {
1413 if (FPAT_regexp != NULL)
1414 FPAT_regexp = (IGNORECASE ? FPAT_re_no_case : FPAT_re_yes_case);
1417 if (current_field_sep() == Using_FPAT) {
1421 goto set_fpat_function;
1426 save_fpat = dupnode(FPAT_node->var_value);
1427 refree(FPAT_re_yes_case);
1428 refree(FPAT_re_no_case);
1429 FPAT_re_yes_case = FPAT_re_no_case = FPAT_regexp = NULL;
1432 fpat = force_string(FPAT_node->var_value);
1433 parse_field = fpat_parse_field;
1436 refree(FPAT_re_yes_case);
1437 refree(FPAT_re_no_case);
1438 FPAT_re_yes_case = FPAT_re_no_case = FPAT_regexp = NULL;
1440 FPAT_re_yes_case = make_regexp(fpat->stptr, fpat->stlen, false, true, true);
1441 FPAT_re_no_case = make_regexp(fpat->stptr, fpat->stlen, true, true, true);
1442 FPAT_regexp = (IGNORECASE ? FPAT_re_no_case : FPAT_re_yes_case);
1445 update_PROCINFO_str("FS", "FPAT");
1449 * increment_scan --- macro to move scan pointer ahead by one character.
1450 * Implementation varies if doing MBS or not.
1454 #define increment_scan(scanp, len) incr_scan(scanp, len, & mbs)
1456 #define increment_scan(scanp, len) ((*scanp)++)
1460 /* incr_scan --- MBS version of increment_scan() */
1463 incr_scan(char **scanp, size_t len, mbstate_t *mbs)
1467 if (gawk_mb_cur_max > 1) {
1468 mbclen = mbrlen(*scanp, len, mbs);
1470 || (mbclen == (size_t) -1)
1471 || (mbclen == (size_t) -2)
1473 /* We treat it as a singlebyte character. */
1483 * fpat_parse_field --- parse fields using a regexp.
1485 * This is called both from get_field() and from do_patsplit()
1486 * via (*parse_field)(). This variation is for when FPAT is a regular
1487 * expression -- use the value to find field contents.
1489 * This was really hard to get right. It happens to bear many resemblances
1490 * to issues I had with getting gsub right with null matches. When dealing
1491 * with that I prototyped in awk and had the foresight to save the awk code
1492 * over in the C file. Starting with that as a base, I finally got to this
1493 * awk code to do what I needed, and then translated it into C. Fortunately
1494 * the C code bears a closer correspondence to the awk code here than over
1501 * fpat[1] = "([^,]*)|(\"[^\"]+\")"
1507 * data[1] = "Robbins,,Arnold,"
1508 * data[2] = "Smith,,\"1234 A Pretty Place, NE\",Sometown,NY,12345-6789,USA"
1509 * data[3] = "Robbins,Arnold,\"1234 A Pretty Place, NE\",Sometown,NY,12345-6789,USA"
1510 * data[4] = "bbbaaacccdddaaaaaqqqq"
1511 * data[5] = "bbbaaacccdddaaaaaqqqqa" # should get trailing qqqa
1513 * for (i = 1; i in data; i++) {
1514 * printf("Splitting: <%s>\n", data[i])
1515 * n = mypatsplit(data[i], fields, fpat[i], seps)
1517 * for (j = 1; j <= n; j++)
1518 * printf("fields[%d] = <%s>\n", j, fields[j])
1519 * for (j = 0; j in seps; j++)
1520 * printf("seps[%s] = <%s>\n", j, seps[j])
1524 * function mypatsplit(string, array, pattern, seps,
1525 * eosflag, non_empty, nf) # locals
1529 * if (length(string) == 0)
1532 * eosflag = non_empty = false
1534 * while (match(string, pattern)) {
1535 * if (RLENGTH > 0) { # easy case
1537 * if (! (nf in seps)) {
1538 * if (RSTART == 1) # match at front of string
1541 * seps[nf] = substr(string, 1, RSTART - 1)
1543 * array[++nf] = substr(string, RSTART, RLENGTH)
1544 * string = substr(string, RSTART+RLENGTH)
1545 * if (length(string) == 0)
1547 * } else if (non_empty) {
1548 * # last match was non-empty, and at the
1549 * # current character we get a zero length match,
1550 * # which we don't want, so skip over it
1552 * seps[nf] = substr(string, 1, 1)
1553 * string = substr(string, 2)
1556 * if (! (nf in seps)) {
1560 * seps[nf] = substr(string, 1, RSTART - 1)
1563 * if (! non_empty && ! eosflag) { # prev was empty
1564 * seps[nf] = substr(string, 1, 1)
1566 * if (RSTART == 1) {
1567 * string = substr(string, 2)
1569 * string = substr(string, RSTART + 1)
1573 * if (length(string) == 0) {
1580 * if (length(string) > 0)
1583 * return length(array)
1587 fpat_parse_field(long up_to, /* parse only up to this field number */
1588 char **buf, /* on input: string to parse; on output: point to start next */
1590 NODE *fs ATTRIBUTE_UNUSED,
1592 Setfunc set, /* routine to set the value of the parsed field */
1594 NODE *sep_arr, /* array of field separators (may be NULL) */
1598 long nf = parse_high_water;
1600 char *end = scan + len;
1601 int regex_flags = RE_NEED_START;
1602 bool need_to_set_sep;
1608 if (gawk_mb_cur_max > 1)
1609 memset(&mbs, 0, sizeof(mbstate_t));
1612 if (up_to == UNLIMITED)
1618 if (rp == NULL) /* use FPAT */
1622 regex_flags |= RE_NO_BOL;
1623 non_empty = rp->non_empty;
1628 need_to_set_sep = true;
1630 while (research(rp, scan, 0, (end - scan), regex_flags) != -1
1633 if (REEND(rp, scan) > RESTART(rp, scan)) { /* if (RLENGTH > 0) */
1635 if (sep_arr != NULL && need_to_set_sep) {
1636 if (RESTART(rp, scan) == 0) /* match at front */
1637 set_element(nf, start, 0L, sep_arr);
1641 (long) RESTART(rp, scan),
1644 /* field is text that matched */
1646 scan + RESTART(rp, scan),
1647 (long)(REEND(rp, scan) - RESTART(rp, scan)),
1650 scan += REEND(rp, scan);
1653 need_to_set_sep = true;
1654 } else if (non_empty) { /* else if non_empty */
1656 * last match was non-empty, and at the
1657 * current character we get a zero length match,
1658 * which we don't want, so skip over it
1661 if (sep_arr != NULL) {
1662 need_to_set_sep = false;
1663 set_element(nf, start, 1L, sep_arr);
1665 increment_scan(& scan, end - scan);
1667 /* 0 length match */
1668 if (sep_arr != NULL && need_to_set_sep) {
1669 if (RESTART(rp, scan) == 0) /* RSTART == 1 */
1670 set_element(nf, start, 0L, sep_arr);
1672 set_element(nf, start,
1673 (long) RESTART(rp, scan),
1676 need_to_set_sep = true;
1677 (*set)(++nf, scan, 0L, n);
1678 if (! non_empty && ! eosflag) { /* prev was empty */
1679 if (sep_arr != NULL) {
1680 set_element(nf, start, 1L, sep_arr);
1681 need_to_set_sep = false;
1684 if (RESTART(rp, scan) == 0)
1685 increment_scan(& scan, end - scan);
1687 scan += RESTART(rp, scan);
1691 if (scan >= end) { /* length(string) == 0 */
1701 if (sep_arr != NULL)
1702 set_element(nf, scan, (long) (end - scan), sep_arr);
1706 rp->non_empty = non_empty;