2 * field.c - routines for dealing with fields and record parsing
6 * Copyright (C) 1986, 1988, 1989, 1991-2011 the Free Software Foundation, Inc.
8 * This file is part of GAWK, the GNU implementation of the
9 * AWK Programming Language.
11 * GAWK is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 3 of the License, or
14 * (at your option) any later version.
16 * GAWK is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
29 * In case that the system doesn't have isblank().
30 * Don't bother with autoconf ifdef junk, just force it.
31 * See dfa.c and regex_internal.h and regcomp.c. Bleah.
36 return c == ' ' || c == '\t';
39 typedef void (* Setfunc)(long, char *, long, NODE *);
41 static long (*parse_field)(long, char **, int, NODE *,
42 Regexp *, Setfunc, NODE *, NODE *, int);
43 static void rebuild_record(void);
44 static long re_parse_field(long, char **, int, NODE *,
45 Regexp *, Setfunc, NODE *, NODE *, int);
46 static long def_parse_field(long, char **, int, NODE *,
47 Regexp *, Setfunc, NODE *, NODE *, int);
48 static long posix_def_parse_field(long, char **, int, NODE *,
49 Regexp *, Setfunc, NODE *, NODE *, int);
50 static long null_parse_field(long, char **, int, NODE *,
51 Regexp *, Setfunc, NODE *, NODE *, int);
52 static long sc_parse_field(long, char **, int, NODE *,
53 Regexp *, Setfunc, NODE *, NODE *, int);
54 static long fw_parse_field(long, char **, int, NODE *,
55 Regexp *, Setfunc, NODE *, NODE *, int);
56 static long fpat_parse_field(long, char **, int, NODE *,
57 Regexp *, Setfunc, NODE *, NODE *, int);
58 static void set_element(long num, char * str, long len, NODE *arr);
59 static void grow_fields_arr(long num);
60 static void set_field(long num, char *str, long len, NODE *dummy);
62 static char *parse_extent; /* marks where to restart parse of record */
63 static long parse_high_water = 0; /* field number that we have parsed so far */
64 static long nf_high_water = 0; /* size of fields_arr */
66 static NODE *save_FS; /* save current value of FS when line is read,
67 * to be used in deferred parsing
69 static int *FIELDWIDTHS = NULL;
71 NODE **fields_arr; /* array of pointers to the field nodes */
72 int field0_valid; /* $(>0) has not been changed yet */
73 int default_FS; /* TRUE when FS == " " */
74 Regexp *FS_re_yes_case = NULL;
75 Regexp *FS_re_no_case = NULL;
76 Regexp *FS_regexp = NULL;
77 Regexp *FPAT_re_yes_case = NULL;
78 Regexp *FPAT_re_no_case = NULL;
79 Regexp *FPAT_regexp = NULL;
80 NODE *Null_field = NULL;
82 /* init_fields --- set up the fields array to start with */
87 emalloc(fields_arr, NODE **, sizeof(NODE *), "init_fields");
88 fields_arr[0] = Nnull_string;
89 parse_extent = fields_arr[0]->stptr;
90 save_FS = dupnode(FS_node->var_value);
92 *Null_field = *Nnull_string;
93 Null_field->flags |= FIELD;
94 Null_field->flags &= ~(NUMCUR|NUMBER|MAYBE_NUM|PERM|MALLOC);
98 /* grow_fields_arr --- acquire new fields as needed */
101 grow_fields_arr(long num)
106 erealloc(fields_arr, NODE **, (num + 1) * sizeof(NODE *), "grow_fields_arr");
107 for (t = nf_high_water + 1; t <= num; t++) {
115 /* set_field --- set the value of a particular field */
122 NODE *dummy ATTRIBUTE_UNUSED) /* just to make interface same as set_element */
126 if (num > nf_high_water)
127 grow_fields_arr(num);
131 n->flags = (STRCUR|STRING|MAYBE_NUM|FIELD);
134 /* rebuild_record --- Someone assigned a value to $(something).
135 Fix up $0 to be right */
141 * use explicit unsigned longs for lengths, in case
142 * a size_t isn't big enough.
145 unsigned long ofslen;
155 ofs = force_string(OFS_node->var_value);
157 for (i = NF; i > 0; i--) {
159 tmp = force_string(tmp);
162 tlen += (NF - 1) * ofslen;
165 emalloc(ops, char *, tlen + 2, "rebuild_record");
168 for (i = 1; i <= NF; i++) {
169 free_wstr(fields_arr[i]);
173 *cops++ = tmp->stptr[0];
174 else if (tmp->stlen != 0) {
175 memcpy(cops, tmp->stptr, tmp->stlen);
181 *cops++ = ofs->stptr[0];
182 else if (ofslen != 0) {
183 memcpy(cops, ofs->stptr, ofslen);
188 tmp = make_str_node(ops, tlen, ALREADY_MALLOCED);
191 * Since we are about to unref fields_arr[0], we want to find
192 * any fields that still point into it, and have them point
193 * into the new field zero. This has to be done intelligently,
194 * so that unrefing a field doesn't try to unref into the old $0.
196 for (cops = ops, i = 1; i <= NF; i++) {
197 if (fields_arr[i]->stlen > 0) {
201 if ((fields_arr[i]->flags & FIELD) == 0) {
203 n->stlen = fields_arr[i]->stlen;
204 if ((fields_arr[i]->flags & (NUMCUR|NUMBER)) != 0) {
205 n->flags |= (fields_arr[i]->flags & (NUMCUR|NUMBER));
206 n->numbr = fields_arr[i]->numbr;
209 *n = *(fields_arr[i]);
210 n->flags &= ~(MALLOC|PERM|STRING);
214 unref(fields_arr[i]);
216 assert((n->flags & WSTRCUR) == 0);
218 cops += fields_arr[i]->stlen + ofslen;
221 unref(fields_arr[0]);
229 * setup $0, but defer parsing rest of line until reference is made to $(>0)
230 * or to NF. At that point, parse only as much as necessary.
232 * Manage a private buffer for the contents of $0. Doing so keeps us safe
233 * if `getline var' decides to rearrange the contents of the IOBUF that
234 * $0 might have been pointing into. The cost is the copying of the buffer;
235 * but better correct than fast.
238 set_record(const char *buf, int cnt)
241 static char *databuf;
242 static unsigned long databuf_size;
243 #define INITIAL_SIZE 512
244 #define MAX_SIZE ((unsigned long) ~0) /* maximally portable ... */
248 /* buffer management: */
249 if (databuf_size == 0) { /* first time */
250 emalloc(databuf, char *, INITIAL_SIZE, "set_record");
251 databuf_size = INITIAL_SIZE;
252 memset(databuf, '\0', INITIAL_SIZE);
256 * Make sure there's enough room. Since we sometimes need
257 * to place a sentinel at the end, we make sure
258 * databuf_size is > cnt after allocation.
260 if (cnt >= databuf_size) {
261 while (cnt >= databuf_size && databuf_size <= MAX_SIZE)
263 erealloc(databuf, char *, databuf_size, "set_record");
264 memset(databuf, '\0', databuf_size);
267 memcpy(databuf, buf, cnt);
269 /* manage field 0: */
270 unref(fields_arr[0]);
277 n->flags = (STRING|STRCUR|MAYBE_NUM|FIELD);
284 /* reset_record --- start over again with current $0 */
292 (void) force_string(fields_arr[0]);
295 for (i = 1; i <= parse_high_water; i++) {
296 unref(fields_arr[i]);
302 parse_high_water = 0;
304 * $0 = $0 should resplit using the current value of FS.
309 save_FS = dupnode(FS_node->var_value);
315 /* set_NF --- handle what happens to $0 and fields when NF is changed */
326 nf = (long) force_number(NF_node->var_value);
328 fatal(_("NF set to negative value"));
331 if (NF > nf_high_water)
333 if (parse_high_water < NF) {
334 for (i = parse_high_water + 1; i >= 0 && i <= NF; i++) {
335 unref(fields_arr[i]);
340 } else if (parse_high_water > 0) {
341 for (i = NF + 1; i >= 0 && i <= parse_high_water; i++) {
342 unref(fields_arr[i]);
347 parse_high_water = NF;
349 field0_valid = FALSE;
353 * re_parse_field --- parse fields using a regexp.
355 * This is called both from get_field() and from do_split()
356 * via (*parse_field)(). This variation is for when FS is a regular
357 * expression -- either user-defined or because RS=="" and FS==" "
360 re_parse_field(long up_to, /* parse only up to this field number */
361 char **buf, /* on input: string to parse; on output: point to start next */
363 NODE *fs ATTRIBUTE_UNUSED,
365 Setfunc set, /* routine to set the value of the parsed field */
367 NODE *sep_arr, /* array of field separators (maybe NULL) */
371 long nf = parse_high_water;
373 char *end = scan + len;
374 int regex_flags = RE_NEED_START;
379 if (gawk_mb_cur_max > 1)
380 memset(&mbs, 0, sizeof(mbstate_t));
384 regex_flags |= RE_NO_BOL;
386 if (up_to == UNLIMITED)
391 if (RS_is_null && default_FS) {
393 while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
395 if (sep_arr != NULL && sep < scan)
396 set_element(nf, sep, (long)(scan - sep), sep_arr);
399 if (rp == NULL) /* use FS */
404 && research(rp, scan, 0, (end - scan), regex_flags) != -1
406 regex_flags |= RE_NO_BOL;
407 if (REEND(rp, scan) == RESTART(rp, scan)) { /* null match */
409 if (gawk_mb_cur_max > 1) {
410 mbclen = mbrlen(scan, end-scan, &mbs);
411 if ((mbclen == 1) || (mbclen == (size_t) -1)
412 || (mbclen == (size_t) -2) || (mbclen == 0)) {
413 /* We treat it as a singlebyte character. */
421 (*set)(++nf, field, (long)(scan - field), n);
428 (long)(scan + RESTART(rp, scan) - field), n);
430 set_element(nf, scan + RESTART(rp, scan),
431 (long) (REEND(rp, scan) - RESTART(rp, scan)), sep_arr);
432 scan += REEND(rp, scan);
434 if (scan == end) /* FS at end of record */
435 (*set)(++nf, field, 0L, n);
437 if (nf != up_to && scan < end) {
438 (*set)(++nf, scan, (long)(end - scan), n);
446 * def_parse_field --- default field parsing.
448 * This is called both from get_field() and from do_split()
449 * via (*parse_field)(). This variation is for when FS is a single space
454 def_parse_field(long up_to, /* parse only up to this field number */
455 char **buf, /* on input: string to parse; on output: point to start next */
458 Regexp *rp ATTRIBUTE_UNUSED,
459 Setfunc set, /* routine to set the value of the parsed field */
461 NODE *sep_arr, /* array of field separators (maybe NULL) */
462 int in_middle ATTRIBUTE_UNUSED)
465 long nf = parse_high_water;
467 char *end = scan + len;
471 if (up_to == UNLIMITED)
477 * Nasty special case. If FS set to "", return whole record
478 * as first field. This is not worth a separate function.
480 if (fs->stlen == 0) {
481 (*set)(++nf, *buf, len, n);
486 /* before doing anything save the char at *end */
488 /* because it will be destroyed now: */
490 *end = ' '; /* sentinel character */
492 for (; nf < up_to; scan++) {
494 * special case: fs is single space, strip leading whitespace
496 while (scan < end && (*scan == ' ' || *scan == '\t' || *scan == '\n'))
499 if (sep_arr != NULL && scan > sep)
500 set_element(nf, sep, (long) (scan - sep), sep_arr);
507 while (*scan != ' ' && *scan != '\t' && *scan != '\n')
510 (*set)(++nf, field, (long)(scan - field), n);
518 /* everything done, restore original char at *end */
526 * posix_def_parse_field --- default field parsing.
528 * This is called both from get_field() and from do_split()
529 * via (*parse_field)(). This variation is for when FS is a single space
530 * character. The only difference between this and def_parse_field()
531 * is that this one does not allow newlines to separate fields.
535 posix_def_parse_field(long up_to, /* parse only up to this field number */
536 char **buf, /* on input: string to parse; on output: point to start next */
539 Regexp *rp ATTRIBUTE_UNUSED,
540 Setfunc set, /* routine to set the value of the parsed field */
542 NODE *dummy ATTRIBUTE_UNUSED, /* sep_arr not needed here: hence dummy */
543 int in_middle ATTRIBUTE_UNUSED)
546 long nf = parse_high_water;
548 char *end = scan + len;
551 if (up_to == UNLIMITED)
557 * Nasty special case. If FS set to "", return whole record
558 * as first field. This is not worth a separate function.
560 if (fs->stlen == 0) {
561 (*set)(++nf, *buf, len, n);
566 /* before doing anything save the char at *end */
568 /* because it will be destroyed now: */
570 *end = ' '; /* sentinel character */
571 for (; nf < up_to; scan++) {
573 * special case: fs is single space, strip leading whitespace
575 while (scan < end && (*scan == ' ' || *scan == '\t'))
580 while (*scan != ' ' && *scan != '\t')
582 (*set)(++nf, field, (long)(scan - field), n);
587 /* everything done, restore original char at *end */
595 * null_parse_field --- each character is a separate field
597 * This is called both from get_field() and from do_split()
598 * via (*parse_field)(). This variation is for when FS is the null string.
601 null_parse_field(long up_to, /* parse only up to this field number */
602 char **buf, /* on input: string to parse; on output: point to start next */
604 NODE *fs ATTRIBUTE_UNUSED,
605 Regexp *rp ATTRIBUTE_UNUSED,
606 Setfunc set, /* routine to set the value of the parsed field */
608 NODE *sep_arr, /* array of field separators (maybe NULL) */
609 int in_middle ATTRIBUTE_UNUSED)
612 long nf = parse_high_water;
613 char *end = scan + len;
615 if (up_to == UNLIMITED)
621 if (gawk_mb_cur_max > 1) {
623 memset(&mbs, 0, sizeof(mbstate_t));
624 for (; nf < up_to && scan < end;) {
625 size_t mbclen = mbrlen(scan, end-scan, &mbs);
626 if ((mbclen == 1) || (mbclen == (size_t) -1)
627 || (mbclen == (size_t) -2) || (mbclen == 0)) {
628 /* We treat it as a singlebyte character. */
631 if (sep_arr != NULL && nf > 0)
632 set_element(nf, scan, 0L, sep_arr);
633 (*set)(++nf, scan, mbclen, n);
638 for (; nf < up_to && scan < end; scan++) {
639 if (sep_arr != NULL && nf > 0)
640 set_element(nf, scan, 0L, sep_arr);
641 (*set)(++nf, scan, 1L, n);
649 * sc_parse_field --- single character field separator
651 * This is called both from get_field() and from do_split()
652 * via (*parse_field)(). This variation is for when FS is a single character
656 sc_parse_field(long up_to, /* parse only up to this field number */
657 char **buf, /* on input: string to parse; on output: point to start next */
660 Regexp *rp ATTRIBUTE_UNUSED,
661 Setfunc set, /* routine to set the value of the parsed field */
663 NODE *sep_arr, /* array of field separators (maybe NULL) */
664 int in_middle ATTRIBUTE_UNUSED)
668 long nf = parse_high_water;
670 char *end = scan + len;
675 if (gawk_mb_cur_max > 1)
676 memset(&mbs, 0, sizeof(mbstate_t));
679 if (up_to == UNLIMITED)
684 if (RS_is_null && fs->stlen == 0)
687 fschar = fs->stptr[0];
689 /* before doing anything save the char at *end */
691 /* because it will be destroyed now: */
692 *end = fschar; /* sentinel character */
694 for (; nf < up_to;) {
697 if (gawk_mb_cur_max > 1) {
698 while (*scan != fschar) {
699 mbclen = mbrlen(scan, end-scan, &mbs);
700 if ((mbclen == 1) || (mbclen == (size_t) -1)
701 || (mbclen == (size_t) -2) || (mbclen == 0)) {
702 /* We treat it as a singlebyte character. */
709 while (*scan != fschar)
711 (*set)(++nf, field, (long)(scan - field), n);
715 set_element(nf, scan, 1L, sep_arr);
717 if (scan == end) { /* FS at end of record */
718 (*set)(++nf, field, 0L, n);
723 /* everything done, restore original char at *end */
731 * fw_parse_field --- field parsing using FIELDWIDTHS spec
733 * This is called from get_field() via (*parse_field)().
734 * This variation is for when fields are fixed widths.
737 fw_parse_field(long up_to, /* parse only up to this field number */
738 char **buf, /* on input: string to parse; on output: point to start next */
740 NODE *fs ATTRIBUTE_UNUSED,
741 Regexp *rp ATTRIBUTE_UNUSED,
742 Setfunc set, /* routine to set the value of the parsed field */
744 NODE *dummy ATTRIBUTE_UNUSED, /* sep_arr not needed here: hence dummy */
745 int in_middle ATTRIBUTE_UNUSED)
748 long nf = parse_high_water;
749 char *end = scan + len;
758 memset(&mbs, 0, sizeof(mbstate_t));
761 if (up_to == UNLIMITED)
765 for (; nf < up_to && (len = FIELDWIDTHS[nf+1]) != -1; ) {
767 if (gawk_mb_cur_max > 1) {
771 lenrest = end - scan;
772 while (nmbc < len && mbslen < lenrest) {
773 mbclen = mbrlen(mbscan, end - mbscan, &mbs);
775 || mbclen == (size_t) -1
776 || mbclen == (size_t) -2
778 /* We treat it as a singlebyte character. */
781 if (mbclen <= end - mbscan) {
787 (*set)(++nf, scan, (long) mbslen, n);
793 if (len > end - scan)
795 (*set)(++nf, scan, (long) len, n);
806 /* invalidate_field0 --- $0 needs reconstruction */
811 field0_valid = FALSE;
814 /* get_field --- return a particular $n */
816 /* assign is not NULL if this field is on the LHS of an assign */
819 get_field(long requested, Func_ptr *assign)
821 int in_middle = FALSE;
823 * if requesting whole line but some other field has been altered,
824 * then the whole line must be rebuilt
826 if (requested == 0) {
827 if (! field0_valid) {
828 /* first, parse remainder of input record */
830 NF = (*parse_field)(UNLIMITED - 1, &parse_extent,
831 fields_arr[0]->stlen -
832 (parse_extent - fields_arr[0]->stptr),
833 save_FS, FS_regexp, set_field,
837 parse_high_water = NF;
842 *assign = reset_record;
843 return &fields_arr[0];
846 /* assert(requested > 0); */
850 field0_valid = FALSE; /* $0 needs reconstruction */
853 * Keep things uniform. Also, mere intention of assigning something
854 * to $n should not make $0 invalid. Makes sense to invalidate $0
855 * after the actual assignment is performed. Not a real issue in
856 * the interpreter otherwise, but causes problem in the
857 * debugger when watching or printing fields.
861 *assign = invalidate_field0; /* $0 needs reconstruction */
864 if (requested <= parse_high_water) /* already parsed this field */
865 return &fields_arr[requested];
867 if (NF == -1) { /* have not yet parsed to end of record */
869 * parse up to requested fields, calling set_field() for each,
870 * saving in parse_extent the point where the parse left off
872 if (parse_high_water == 0) /* starting at the beginning */
873 parse_extent = fields_arr[0]->stptr;
876 parse_high_water = (*parse_field)(requested, &parse_extent,
877 fields_arr[0]->stlen - (parse_extent - fields_arr[0]->stptr),
878 save_FS, NULL, set_field, (NODE *) NULL, (NODE *) NULL, in_middle);
881 * if we reached the end of the record, set NF to the number of
882 * fields so far. Note that requested might actually refer to
883 * a field that is beyond the end of the record, but we won't
884 * set NF to that value at this point, since this is only a
885 * reference to the field and NF only gets set if the field
886 * is assigned to -- this case is handled below
888 if (parse_extent == fields_arr[0]->stptr + fields_arr[0]->stlen)
889 NF = parse_high_water;
890 else if (parse_field == fpat_parse_field) {
891 /* FPAT parsing is weird, isolate the special cases */
892 char *rec_start = fields_arr[0]->stptr;
893 char *rec_end = fields_arr[0]->stptr + fields_arr[0]->stlen;
895 if ( parse_extent > rec_end
896 || (parse_extent > rec_start && parse_extent < rec_end && requested == UNLIMITED-1))
897 NF = parse_high_water;
898 else if (parse_extent == rec_start) /* could be no match for FPAT */
901 if (requested == UNLIMITED - 1) /* UNLIMITED-1 means set NF */
902 requested = parse_high_water;
904 if (parse_high_water < requested) { /* requested beyond end of record */
905 if (assign != NULL) { /* expand record */
906 if (requested > nf_high_water)
907 grow_fields_arr(requested);
910 parse_high_water = requested;
915 return &fields_arr[requested];
918 /* set_element --- set an array element, used by do_split() */
921 set_element(long num, char *s, long len, NODE *n)
927 it = make_string(s, len);
928 it->flags |= MAYBE_NUM;
929 sub = make_number((AWKNUM) (num));
930 lhs = assoc_lookup(n, sub, FALSE);
936 /* do_split --- implement split(), semantics are same as for field splitting */
941 NODE *src, *arr, *sep, *fs, *tmp, *sep_arr = NULL;
943 long (*parseit)(long, char **, int, NODE *,
944 Regexp *, Setfunc, NODE *, NODE *, int);
948 static short warned1 = FALSE, warned2 = FALSE;
950 if (do_traditional || do_posix) {
951 fatal(_("split: fourth argument is a gawk extension"));
953 sep_arr = POP_PARAM();
954 if (sep_arr->type != Node_var_array)
955 fatal(_("split: fourth argument is not an array"));
956 if (do_lint && ! warned1) {
958 lintwarn(_("split: fourth argument is a gawk extension"));
960 if (do_lint_old && ! warned2) {
962 warning(_("split: fourth argument is a gawk extension"));
968 if (arr->type != Node_var_array)
969 fatal(_("split: second argument is not an array"));
971 if (sep_arr != NULL) {
973 fatal(_("split: cannot use the same array for second and fourth args"));
975 /* These checks need to be done before clearing any of the arrays */
976 for (tmp = sep_arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
978 fatal(_("split: cannot use a subarray of second arg for fourth arg"));
979 for (tmp = arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
981 fatal(_("split: cannot use a subarray of fourth arg for second arg"));
982 assoc_clear(sep_arr);
987 if (src->stlen == 0) {
989 * Skip the work if first arg is the null string.
993 return make_number((AWKNUM) 0);
996 if ((sep->re_flags & FS_DFLT) != 0 && current_field_sep() != Using_FIELDWIDTHS && ! RS_is_null) {
997 parseit = parse_field;
998 fs = force_string(FS_node->var_value);
1003 if (fs->stlen == 0) {
1004 static short warned = FALSE;
1006 parseit = null_parse_field;
1008 if (do_lint && ! warned) {
1010 lintwarn(_("split: null string for third arg is a gawk extension"));
1012 } else if (fs->stlen == 1 && (sep->re_flags & CONSTANT) == 0) {
1013 if (fs->stptr[0] == ' ') {
1015 parseit = posix_def_parse_field;
1017 parseit = def_parse_field;
1019 parseit = sc_parse_field;
1021 parseit = re_parse_field;
1022 rp = re_update(sep);
1027 tmp = make_number((AWKNUM) (*parseit)(UNLIMITED, &s, (int) src->stlen,
1028 fs, rp, set_element, arr, sep_arr, FALSE));
1036 * do_patsplit --- implement patsplit(), semantics are same as for field
1037 * splitting with FPAT.
1041 do_patsplit(int nargs)
1043 NODE *src, *arr, *sep, *fpat, *tmp, *sep_arr = NULL;
1048 sep_arr = POP_PARAM();
1049 if (sep_arr->type != Node_var_array)
1050 fatal(_("patsplit: fourth argument is not an array"));
1054 if (arr->type != Node_var_array)
1055 fatal(_("patsplit: second argument is not an array"));
1060 if (fpat->stlen == 0)
1061 fatal(_("patsplit: third argument must be non-null"));
1063 if (sep_arr != NULL) {
1065 fatal(_("patsplit: cannot use the same array for second and fourth args"));
1067 /* These checks need to be done before clearing any of the arrays */
1068 for (tmp = sep_arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
1070 fatal(_("patsplit: cannot use a subarray of second arg for fourth arg"));
1071 for (tmp = arr->parent_array; tmp != NULL; tmp = tmp->parent_array)
1073 fatal(_("patsplit: cannot use a subarray of fourth arg for second arg"));
1074 assoc_clear(sep_arr);
1078 if (src->stlen == 0) {
1080 * Skip the work if first arg is the null string.
1082 tmp = make_number((AWKNUM) 0);
1084 rp = re_update(sep);
1086 tmp = make_number((AWKNUM) fpat_parse_field(UNLIMITED, &s,
1087 (int) src->stlen, fpat, rp,
1088 set_element, arr, sep_arr, FALSE));
1091 decr_sp(); /* 1st argument not POP-ed */
1096 /* set_FIELDWIDTHS --- handle an assignment to FIELDWIDTHS */
1104 static int fw_alloc = 4;
1105 static short warned = FALSE;
1106 int fatal_error = FALSE;
1108 if (do_lint && ! warned) {
1110 lintwarn(_("`FIELDWIDTHS' is a gawk extension"));
1112 if (do_traditional) /* quick and dirty, does the trick */
1116 * If changing the way fields are split, obey least-surprise
1117 * semantics, and force $0 to be split totally.
1119 if (fields_arr != NULL)
1120 (void) get_field(UNLIMITED - 1, 0);
1122 parse_field = fw_parse_field;
1123 scan = force_string(FIELDWIDTHS_node->var_value)->stptr;
1125 if (FIELDWIDTHS == NULL)
1126 emalloc(FIELDWIDTHS, int *, fw_alloc * sizeof(int), "set_FIELDWIDTHS");
1128 for (i = 1; ; i++) {
1129 unsigned long int tmp;
1130 if (i + 2 >= fw_alloc) {
1132 erealloc(FIELDWIDTHS, int *, fw_alloc * sizeof(int), "set_FIELDWIDTHS");
1134 /* Initialize value to be end of list */
1135 FIELDWIDTHS[i] = -1;
1136 /* Ensure that there is no leading `-' sign. Otherwise,
1137 strtoul would accept it and return a bogus result. */
1138 while (is_blank(*scan)) {
1148 /* Detect an invalid base-10 integer, a valid value that
1149 is followed by something other than a blank or '\0',
1150 or a value that is not in the range [1..INT_MAX]. */
1152 tmp = strtoul(scan, &end, 10);
1154 || (*end != '\0' && ! is_blank(*end))
1155 || !(0 < tmp && tmp <= INT_MAX)
1160 FIELDWIDTHS[i] = tmp;
1162 /* Skip past any trailing blanks. */
1163 while (is_blank(*scan)) {
1169 FIELDWIDTHS[i+1] = -1;
1171 update_PROCINFO_str("FS", "FIELDWIDTHS");
1173 fatal(_("invalid FIELDWIDTHS value, near `%s'"),
1177 /* set_FS --- handle things when FS is assigned to */
1184 static NODE *save_fs = NULL;
1185 static NODE *save_rs = NULL;
1186 int remake_re = TRUE;
1189 * If changing the way fields are split, obey least-surprise
1190 * semantics, and force $0 to be split totally.
1192 if (fields_arr != NULL)
1193 (void) get_field(UNLIMITED - 1, 0);
1195 /* It's possible that only IGNORECASE changed, or FS = FS */
1197 * This comparison can't use cmp_nodes(), which pays attention
1198 * to IGNORECASE, and that's not what we want.
1201 && FS_node->var_value->stlen == save_fs->stlen
1202 && memcmp(FS_node->var_value->stptr, save_fs->stptr, save_fs->stlen) == 0
1204 && RS_node->var_value->stlen == save_rs->stlen
1205 && memcmp(RS_node->var_value->stptr, save_rs->stptr, save_rs->stlen) == 0) {
1206 if (FS_regexp != NULL)
1207 FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
1210 if (current_field_sep() == Using_FS) {
1214 goto choose_fs_function;
1219 save_fs = dupnode(FS_node->var_value);
1221 save_rs = dupnode(RS_node->var_value);
1224 /* If FS_re_no_case assignment is fatal (make_regexp in remake_re)
1225 * FS_regexp will be NULL with a non-null FS_re_yes_case.
1226 * refree() handles null argument; no need for `if (FS_regexp != NULL)' below.
1227 * Please do not remerge.
1229 refree(FS_re_yes_case);
1230 refree(FS_re_no_case);
1231 FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
1237 fs = force_string(FS_node->var_value);
1239 if (! do_traditional && fs->stlen == 0) {
1240 static short warned = FALSE;
1242 parse_field = null_parse_field;
1244 if (do_lint && ! warned) {
1246 lintwarn(_("null string for `FS' is a gawk extension"));
1248 } else if (fs->stlen > 1) {
1250 warning(_("old awk does not support regexps as value of `FS'"));
1251 parse_field = re_parse_field;
1252 } else if (RS_is_null) {
1253 /* we know that fs->stlen <= 1 */
1254 parse_field = sc_parse_field;
1255 if (fs->stlen == 1) {
1256 if (fs->stptr[0] == ' ') {
1258 strcpy(buf, "[ \t\n]+");
1259 } else if (fs->stptr[0] == '\\') {
1260 /* yet another special case */
1261 strcpy(buf, "[\\\\\n]");
1262 } else if (fs->stptr[0] != '\n')
1263 sprintf(buf, "[%c\n]", fs->stptr[0]);
1267 parse_field = posix_def_parse_field;
1269 parse_field = def_parse_field;
1271 if (fs->stlen == 1) {
1272 if (fs->stptr[0] == ' ')
1274 else if (fs->stptr[0] == '\\')
1275 /* same special case */
1276 strcpy(buf, "[\\\\]");
1278 parse_field = sc_parse_field;
1282 refree(FS_re_yes_case);
1283 refree(FS_re_no_case);
1284 FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
1286 if (buf[0] != '\0') {
1287 FS_re_yes_case = make_regexp(buf, strlen(buf), FALSE, TRUE, TRUE);
1288 FS_re_no_case = make_regexp(buf, strlen(buf), TRUE, TRUE, TRUE);
1289 FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
1290 parse_field = re_parse_field;
1291 } else if (parse_field == re_parse_field) {
1292 FS_re_yes_case = make_regexp(fs->stptr, fs->stlen, FALSE, TRUE, TRUE);
1293 FS_re_no_case = make_regexp(fs->stptr, fs->stlen, TRUE, TRUE, TRUE);
1294 FS_regexp = (IGNORECASE ? FS_re_no_case : FS_re_yes_case);
1296 FS_re_yes_case = FS_re_no_case = FS_regexp = NULL;
1300 * For FS = "c", we don't use IGNORECASE. But we must use
1301 * re_parse_field to get the character and the newline as
1304 if (fs->stlen == 1 && parse_field == re_parse_field)
1305 FS_regexp = FS_re_yes_case;
1307 update_PROCINFO_str("FS", "FS");
1310 /* current_field_sep --- return what field separator is */
1315 if (parse_field == fw_parse_field)
1316 return Using_FIELDWIDTHS;
1317 else if (parse_field == fpat_parse_field)
1323 /* update_PROCINFO_str --- update PROCINFO[sub] with string value */
1326 update_PROCINFO_str(const char *subscript, const char *str)
1331 if (PROCINFO_node == NULL)
1333 tmp = make_string(subscript, strlen(subscript));
1334 aptr = assoc_lookup(PROCINFO_node, tmp, FALSE);
1337 *aptr = make_string(str, strlen(str));
1340 /* update_PROCINFO_num --- update PROCINFO[sub] with numeric value */
1343 update_PROCINFO_num(const char *subscript, AWKNUM val)
1348 if (PROCINFO_node == NULL)
1350 tmp = make_string(subscript, strlen(subscript));
1351 aptr = assoc_lookup(PROCINFO_node, tmp, FALSE);
1354 *aptr = make_number(val);
1357 /* set_FPAT --- handle an assignment to FPAT */
1362 static short warned = FALSE;
1363 static NODE *save_fpat = NULL;
1364 int remake_re = TRUE;
1367 if (do_lint && ! warned) {
1369 lintwarn(_("`FPAT' is a gawk extension"));
1371 if (do_traditional) /* quick and dirty, does the trick */
1375 * If changing the way fields are split, obey least-surprise
1376 * semantics, and force $0 to be split totally.
1378 if (fields_arr != NULL)
1379 (void) get_field(UNLIMITED - 1, 0);
1381 /* It's possible that only IGNORECASE changed, or FPAT = FPAT */
1383 * This comparison can't use cmp_nodes(), which pays attention
1384 * to IGNORECASE, and that's not what we want.
1387 && FPAT_node->var_value->stlen == save_fpat->stlen
1388 && memcmp(FPAT_node->var_value->stptr, save_fpat->stptr, save_fpat->stlen) == 0) {
1389 if (FPAT_regexp != NULL)
1390 FPAT_regexp = (IGNORECASE ? FPAT_re_no_case : FPAT_re_yes_case);
1393 if (current_field_sep() == Using_FPAT) {
1397 goto set_fpat_function;
1402 save_fpat = dupnode(FPAT_node->var_value);
1403 refree(FPAT_re_yes_case);
1404 refree(FPAT_re_no_case);
1405 FPAT_re_yes_case = FPAT_re_no_case = FPAT_regexp = NULL;
1408 fpat = force_string(FPAT_node->var_value);
1409 parse_field = fpat_parse_field;
1412 refree(FPAT_re_yes_case);
1413 refree(FPAT_re_no_case);
1414 FPAT_re_yes_case = FPAT_re_no_case = FPAT_regexp = NULL;
1416 FPAT_re_yes_case = make_regexp(fpat->stptr, fpat->stlen, FALSE, TRUE, TRUE);
1417 FPAT_re_no_case = make_regexp(fpat->stptr, fpat->stlen, TRUE, TRUE, TRUE);
1418 FPAT_regexp = (IGNORECASE ? FPAT_re_no_case : FPAT_re_yes_case);
1421 update_PROCINFO_str("FS", "FPAT");
1425 * increment_scan --- macro to move scan pointer ahead by one character.
1426 * Implementation varies if doing MBS or not.
1430 #define increment_scan(scanp, len) incr_scan(scanp, len, & mbs)
1432 #define increment_scan(scanp, len) ((*scanp)++)
1436 /* incr_scan --- MBS version of increment_scan() */
1439 incr_scan(char **scanp, size_t len, mbstate_t *mbs)
1443 if (gawk_mb_cur_max > 1) {
1444 mbclen = mbrlen(*scanp, len, mbs);
1446 || (mbclen == (size_t) -1)
1447 || (mbclen == (size_t) -2)
1449 /* We treat it as a singlebyte character. */
1459 * fpat_parse_field --- parse fields using a regexp.
1461 * This is called both from get_field() and from do_patsplit()
1462 * via (*parse_field)(). This variation is for when FPAT is a regular
1463 * expression -- use the value to find field contents.
1465 * This was really hard to get right. It happens to bear many resemblances
1466 * to issues I had with getting gsub right with null matches. When dealing
1467 * with that I prototyped in awk and had the foresight to save the awk code
1468 * over in the C file. Starting with that as a base, I finally got to this
1469 * awk code to do what I needed, and then translated it into C. Fortunately
1470 * the C code bears a closer correspondence to the awk code here than over
1477 * fpat[1] = "([^,]*)|(\"[^\"]+\")"
1483 * data[1] = "Robbins,,Arnold,"
1484 * data[2] = "Smith,,\"1234 A Pretty Place, NE\",Sometown,NY,12345-6789,USA"
1485 * data[3] = "Robbins,Arnold,\"1234 A Pretty Place, NE\",Sometown,NY,12345-6789,USA"
1486 * data[4] = "bbbaaacccdddaaaaaqqqq"
1487 * data[5] = "bbbaaacccdddaaaaaqqqqa" # should get trailing qqqa
1489 * for (i = 1; i in data; i++) {
1490 * printf("Splitting: <%s>\n", data[i])
1491 * n = mypatsplit(data[i], fields, fpat[i], seps)
1493 * for (j = 1; j <= n; j++)
1494 * printf("fields[%d] = <%s>\n", j, fields[j])
1495 * for (j = 0; j in seps; j++)
1496 * printf("seps[%s] = <%s>\n", j, seps[j])
1500 * function mypatsplit(string, array, pattern, seps,
1501 * eosflag, non_empty, nf) # locals
1505 * if (length(string) == 0)
1508 * eosflag = non_empty = FALSE
1510 * while (match(string, pattern)) {
1511 * if (RLENGTH > 0) { # easy case
1513 * if (! (nf in seps)) {
1514 * if (RSTART == 1) # match at front of string
1517 * seps[nf] = substr(string, 1, RSTART - 1)
1519 * array[++nf] = substr(string, RSTART, RLENGTH)
1520 * string = substr(string, RSTART+RLENGTH)
1521 * if (length(string) == 0)
1523 * } else if (non_empty) {
1524 * # last match was non-empty, and at the
1525 * # current character we get a zero length match,
1526 * # which we don't want, so skip over it
1528 * seps[nf] = substr(string, 1, 1)
1529 * string = substr(string, 2)
1532 * if (! (nf in seps)) {
1536 * seps[nf] = substr(string, 1, RSTART - 1)
1539 * if (! non_empty && ! eosflag) { # prev was empty
1540 * seps[nf] = substr(string, 1, 1)
1542 * if (RSTART == 1) {
1543 * string = substr(string, 2)
1545 * string = substr(string, RSTART + 1)
1549 * if (length(string) == 0) {
1556 * if (length(string) > 0)
1559 * return length(array)
1563 fpat_parse_field(long up_to, /* parse only up to this field number */
1564 char **buf, /* on input: string to parse; on output: point to start next */
1566 NODE *fs ATTRIBUTE_UNUSED,
1568 Setfunc set, /* routine to set the value of the parsed field */
1570 NODE *sep_arr, /* array of field separators (may be NULL) */
1574 long nf = parse_high_water;
1576 char *end = scan + len;
1577 int regex_flags = RE_NEED_START;
1578 int need_to_set_sep;
1584 if (gawk_mb_cur_max > 1)
1585 memset(&mbs, 0, sizeof(mbstate_t));
1588 if (up_to == UNLIMITED)
1594 if (rp == NULL) /* use FPAT */
1598 regex_flags |= RE_NO_BOL;
1599 non_empty = rp->non_empty;
1604 need_to_set_sep = TRUE;
1606 while (research(rp, scan, 0, (end - scan), regex_flags) != -1
1609 if (REEND(rp, scan) > RESTART(rp, scan)) { /* if (RLENGTH > 0) */
1611 if (sep_arr != NULL && need_to_set_sep) {
1612 if (RESTART(rp, scan) == 0) /* match at front */
1613 set_element(nf, start, 0L, sep_arr);
1617 (long) RESTART(rp, scan),
1620 /* field is text that matched */
1622 scan + RESTART(rp, scan),
1623 (long)(REEND(rp, scan) - RESTART(rp, scan)),
1626 scan += REEND(rp, scan);
1629 need_to_set_sep = TRUE;
1630 } else if (non_empty) { /* else if non_empty */
1632 * last match was non-empty, and at the
1633 * current character we get a zero length match,
1634 * which we don't want, so skip over it
1637 if (sep_arr != NULL) {
1638 need_to_set_sep = FALSE;
1639 set_element(nf, start, 1L, sep_arr);
1641 increment_scan(& scan, end - scan);
1643 /* 0 length match */
1644 if (sep_arr != NULL && need_to_set_sep) {
1645 if (RESTART(rp, scan) == 0) /* RSTART == 1 */
1646 set_element(nf, start, 0L, sep_arr);
1648 set_element(nf, start,
1649 (long) RESTART(rp, scan),
1652 need_to_set_sep = TRUE;
1653 (*set)(++nf, scan, 0L, n);
1654 if (! non_empty && ! eosflag) { /* prev was empty */
1655 if (sep_arr != NULL) {
1656 set_element(nf, start, 1L, sep_arr);
1657 need_to_set_sep = FALSE;
1660 if (RESTART(rp, scan) == 0)
1661 increment_scan(& scan, end - scan);
1663 scan += RESTART(rp, scan);
1667 if (scan >= end) { /* length(string) == 0 */
1677 if (sep_arr != NULL)
1678 set_element(nf, scan, (long) (end - scan), sep_arr);
1682 rp->non_empty = non_empty;