2 * builtin.c - Builtin functions and various utility procedures.
6 * Copyright (C) 1986, 1988, 1989, 1991-2012 the Free Software Foundation, Inc.
8 * This file is part of GAWK, the GNU implementation of the
9 * AWK Programming Language.
11 * GAWK is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 3 of the License, or
14 * (at your option) any later version.
16 * GAWK is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
28 #if defined(HAVE_FCNTL_H)
33 #include "floatmagic.h"
35 #if defined(HAVE_POPEN_H)
43 /* The extra casts work around common compiler bugs. */
44 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
45 /* The outer cast is needed to work around a bug in Cray C 5.0.3.0.
46 It is necessary at least when t == time_t. */
47 #define TYPE_MINIMUM(t) ((t) (TYPE_SIGNED (t) \
48 ? ~ (t) 0 << (sizeof (t) * CHAR_BIT - 1) : (t) 0))
49 #define TYPE_MAXIMUM(t) ((t) (~ (t) 0 - TYPE_MINIMUM (t)))
52 # define INTMAX_MIN TYPE_MINIMUM (intmax_t)
55 # define UINTMAX_MAX TYPE_MAXIMUM (uintmax_t)
58 #ifndef SIZE_MAX /* C99 constant, can't rely on it everywhere */
59 #define SIZE_MAX ((size_t) -1)
62 #define DEFAULT_G_PRECISION 6
64 static size_t mbc_byte_count(const char *ptr, size_t numchars);
65 static size_t mbc_char_count(const char *ptr, size_t numbytes);
67 /* Can declare these, since we always use the random shipped with gawk */
68 extern char *initstate(unsigned long seed, char *state, long n);
69 extern char *setstate(char *state);
70 extern long random(void);
71 extern void srandom(unsigned long seed);
73 extern NODE **args_array;
75 extern NODE **fields_arr;
76 extern int output_is_tty;
77 extern FILE *output_fp;
80 #define POP_TWO_SCALARS(s1, s2) \
83 if ((s1)->type == Node_var_array) \
84 DEREF(s2), fatal(_("attempt to use array `%s' in a scalar context"), array_vname(s1)), 0
88 * Since we supply the version of random(), we know what
91 #define GAWK_RANDOM_MAX 0x7fffffffL
93 static void efwrite(const void *ptr, size_t size, size_t count, FILE *fp,
94 const char *from, struct redirect *rp, int flush);
96 /* efwrite --- like fwrite, but with error checking */
99 efwrite(const void *ptr,
108 if (fwrite(ptr, size, count, fp) != count)
111 && ((fp == stdout && output_is_tty)
112 || (rp != NULL && (rp->flag & RED_NOBUF)))) {
120 fatal(_("%s to \"%s\" failed (%s)"), from,
121 rp ? rp->value : _("standard output"),
122 errno ? strerror(errno) : _("reason unknown"));
125 /* do_exp --- exponential function */
134 if (do_lint && (tmp->flags & (NUMCUR|NUMBER)) == 0)
135 lintwarn(_("exp: received non-numeric argument"));
136 d = force_number(tmp);
141 warning(_("exp: argument %g is out of range"), d);
142 return make_number((AWKNUM) res);
145 /* stdfile --- return fp for a standard file */
148 * This function allows `fflush("/dev/stdout")' to work.
149 * The other files will be available via getredirect().
150 * /dev/stdin is not included, since fflush is only for output.
154 stdfile(const char *name, size_t len)
157 if (strncmp(name, "/dev/stderr", 11) == 0)
159 else if (strncmp(name, "/dev/stdout", 11) == 0)
166 /* do_fflush --- flush output, either named file or pipe or everything */
177 /* fflush() --- flush stdout */
179 if (output_fp != stdout)
180 (void) fflush(output_fp);
181 status = fflush(stdout);
182 return make_number((AWKNUM) status);
188 /* fflush("") --- flush all */
189 if (tmp->stlen == 0) {
192 return make_number((AWKNUM) status);
195 rp = getredirect(tmp->stptr, tmp->stlen);
198 if ((rp->flag & (RED_WRITE|RED_APPEND)) == 0) {
199 if (rp->flag & RED_PIPE)
200 warning(_("fflush: cannot flush: pipe `%s' opened for reading, not writing"),
203 warning(_("fflush: cannot flush: file `%s' opened for reading, not writing"),
206 return make_number((AWKNUM) status);
211 } else if ((fp = stdfile(tmp->stptr, tmp->stlen)) != NULL) {
215 warning(_("fflush: `%s' is not an open file, pipe or co-process"), file);
218 return make_number((AWKNUM) status);
222 /* strncasecmpmbs --- like strncasecmp (multibyte string version) */
225 strncasecmpmbs(const unsigned char *s1, const unsigned char *s2, size_t n)
227 size_t i1, i2, mbclen1, mbclen2, gap;
229 mbstate_t mbs1, mbs2;
231 memset(& mbs1, 0, sizeof(mbs1));
232 memset(& mbs2, 0, sizeof(mbs2));
234 for (i1 = i2 = 0 ; i1 < n && i2 < n ;i1 += mbclen1, i2 += mbclen2) {
235 if (is_valid_character(s1[i1])) {
237 wc1 = btowc_cache(s1[i1]);
239 mbclen1 = mbrtowc(& wc1, (const char *)s1 + i1,
241 if (mbclen1 == (size_t) -1 || mbclen1 == (size_t) -2 || mbclen1 == 0) {
242 /* We treat it as a singlebyte character. */
244 wc1 = btowc_cache(s1[i1]);
247 if (is_valid_character(s2[i2])) {
249 wc2 = btowc_cache(s2[i2]);
251 mbclen2 = mbrtowc(& wc2, (const char *)s2 + i2,
253 if (mbclen2 == (size_t) -1 || mbclen2 == (size_t) -2 || mbclen2 == 0) {
254 /* We treat it as a singlebyte character. */
256 wc2 = btowc_cache(s2[i2]);
259 if ((gap = towlower(wc1) - towlower(wc2)) != 0)
260 /* s1 and s2 are not equivalent. */
263 /* s1 and s2 are equivalent. */
267 /* Inspect the buffer `src' and write the index of each byte to `dest'.
268 Caller must allocate `dest'.
269 e.g. str = <mb1(1)>, <mb1(2)>, a, b, <mb2(1)>, <mb2(2)>, <mb2(3)>, c
270 where mb(i) means the `i'-th byte of a multibyte character.
271 dest = 1, 2, 1, 1, 1, 2, 3, 1
274 index_multibyte_buffer(char* src, char* dest, int len)
277 mbstate_t mbs, prevs;
279 memset(& prevs, 0, sizeof(mbstate_t));
280 for (idx = prev_idx = 0 ; idx < len ; idx++) {
283 mbclen = mbrlen(src + prev_idx, idx - prev_idx + 1, & mbs);
284 if (mbclen == (size_t) -1 || mbclen == 1 || mbclen == 0) {
285 /* singlebyte character. */
288 } else if (mbclen == (size_t) -2) {
289 /* a part of a multibyte character. */
290 mbclen = idx - prev_idx + 1;
291 } else if (mbclen > 1) {
292 /* the end of a multibyte character. */
302 /* a dummy function */
304 index_multibyte_buffer(char* src ATTRIBUTE_UNUSED, char* dest ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED)
310 /* do_index --- find index of a string */
320 int do_single_byte = FALSE;
321 mbstate_t mbs1, mbs2;
323 if (gawk_mb_cur_max > 1) {
324 memset(& mbs1, 0, sizeof(mbstate_t));
325 memset(& mbs2, 0, sizeof(mbstate_t));
329 POP_TWO_SCALARS(s1, s2);
332 if ((s1->flags & (STRING|STRCUR)) == 0)
333 lintwarn(_("index: received non-string first argument"));
334 if ((s2->flags & (STRING|STRCUR)) == 0)
335 lintwarn(_("index: received non-string second argument"));
346 * Icky special case, index(foo, "") should return 1,
347 * since both bwk awk and mawk do, and since match("foo", "")
348 * returns 1. This makes index("", "") work, too, fwiw.
356 if (gawk_mb_cur_max > 1) {
357 s1 = force_wstring(s1);
358 s2 = force_wstring(s2);
360 * If we don't have valid wide character strings, use
363 do_single_byte = ((s1->wstlen == 0 && s1->stlen > 0)
364 || (s2->wstlen == 0 && s2->stlen > 0));
368 /* IGNORECASE will already be false if posix */
374 if (! do_single_byte && gawk_mb_cur_max > 1) {
377 pos = wcasestrstr(s1->wstptr, s1->wstlen, s2->wstptr, s2->wstlen);
381 ret = pos - s1->wstptr + 1; /* 1-based */
386 * Could use tolower(*p1) == tolower(*p2) here.
387 * See discussion in eval.c as to why not.
389 if (casetable[(unsigned char)*p1] == casetable[(unsigned char)*p2]
390 && (l2 == 1 || strncasecmp(p1, p2, l2) == 0)) {
391 ret = 1 + s1->stlen - l1;
405 && (l2 == 1 || (l2 > 0 && memcmp(p1, p2, l2) == 0))) {
406 ret = 1 + s1->stlen - l1;
410 if (! do_single_byte && gawk_mb_cur_max > 1) {
413 pos = wstrstr(s1->wstptr, s1->wstlen, s2->wstptr, s2->wstlen);
417 ret = pos - s1->wstptr + 1; /* 1-based */
432 return make_number((AWKNUM) ret);
435 /* double_to_int --- convert double to int, used in several places */
438 double_to_int(double d)
447 /* do_int --- convert double to int for awk */
456 if (do_lint && (tmp->flags & (NUMCUR|NUMBER)) == 0)
457 lintwarn(_("int: received non-numeric argument"));
458 d = force_number(tmp);
459 d = double_to_int(d);
461 return make_number((AWKNUM) d);
464 /* do_isarray --- check if argument is array */
467 do_isarray(int nargs)
473 if (tmp->type != Node_var_array) {
477 return make_number((AWKNUM) ret);
480 /* do_length --- length of a string, array or $0 */
489 if (tmp->type == Node_var_array) {
490 static short warned = FALSE;
493 fatal(_("length: received array argument"));
494 if (do_lint && ! warned) {
496 lintwarn(_("`length(array)' is a gawk extension"));
498 return make_number((AWKNUM) tmp->table_size);
501 assert(tmp->type == Node_val);
503 if (do_lint && (tmp->flags & (STRING|STRCUR)) == 0)
504 lintwarn(_("length: received non-string argument"));
505 (void) force_string(tmp);
508 if (gawk_mb_cur_max > 1) {
509 tmp = force_wstring(tmp);
512 * If the bytes don't make a valid wide character
513 * string, fall back to the bytes themselves.
515 if (len == 0 && tmp->stlen > 0)
522 return make_number((AWKNUM) len);
525 /* do_log --- the log function */
534 if (do_lint && (tmp->flags & (NUMCUR|NUMBER)) == 0)
535 lintwarn(_("log: received non-numeric argument"));
536 arg = (double) force_number(tmp);
538 warning(_("log: received negative argument %g"), arg);
541 return make_number((AWKNUM) d);
546 * format_tree() formats the arguments of sprintf
547 * according to fmt_string, which provides a format like those of the
548 * printf family in the C library. Returns a string node whose value
549 * is the formatted string. Called by the sprintf function.
551 * It is one of the uglier parts of gawk. Thanks to Michal Jaegermann
552 * for taming this beast and making it compatible with ANSI C.
557 const char *fmt_string,
562 /* copy 'l' bytes from 's' to 'obufout' checking for space in the process */
563 /* difference of pointers should be of ptrdiff_t type, but let us be kind */
564 #define bchunk(s, l) if (l) { \
565 while ((l) > ofre) { \
566 size_t olen = obufout - obuf; \
567 erealloc(obuf, char *, osiz * 2, "format_tree"); \
570 obufout = obuf + olen; \
572 memcpy(obufout, s, (size_t) (l)); \
577 /* copy one byte from 's' to 'obufout' checking for space in the process */
578 #define bchunk_one(s) { \
580 size_t olen = obufout - obuf; \
581 erealloc(obuf, char *, osiz * 2, "format_tree"); \
584 obufout = obuf + olen; \
590 /* Is there space for something L big in the buffer? */
591 #define chksize(l) if ((l) >= ofre) { \
592 size_t olen = obufout - obuf; \
593 size_t delta = osiz+l-ofre; \
594 erealloc(obuf, char *, osiz + delta, "format_tree"); \
595 obufout = obuf + olen; \
604 char *obuf, *obufout;
610 long fw, prec, argnum;
612 int lj, alt, big_flag, bigbig_flag, small_flag, have_prec, need_format;
618 * Although this is an array, the elements serve two different
619 * purposes. The first element is the general buffer meant
620 * to hold the entire result string. The second one is a
621 * temporary buffer for large floating point values. They
622 * could just as easily be separate variables, and the
623 * code might arguably be clearer.
630 #define cpbuf cpbufs[0].buf
631 char *cend = &cpbufs[0].stackbuf[sizeof(cpbufs[0].stackbuf)];
635 char signchar = FALSE;
637 int zero_flag = FALSE;
638 int quote_flag = FALSE;
641 size_t copy_count, char_count;
642 static const char sp[] = " ";
643 static const char zero_string[] = "0";
644 static const char lchbuf[] = "0123456789abcdef";
645 static const char Uchbuf[] = "0123456789ABCDEF";
647 #define INITIAL_OUT_SIZE 512
648 emalloc(obuf, char *, INITIAL_OUT_SIZE, "format_tree");
650 osiz = INITIAL_OUT_SIZE;
657 for (k = 0; k < sizeof(cpbufs)/sizeof(cpbufs[0]); k++) {
658 cpbufs[k].bufsize = sizeof(cpbufs[k].stackbuf);
659 cpbufs[k].buf = cpbufs[k].stackbuf;
664 * The point of this goop is to grow the buffer
665 * holding the converted number, so that large
666 * values don't overflow a fixed length buffer.
668 #define PREPEND(CH) do { \
669 if (cp == cpbufs[0].buf) { \
670 char *prev = cpbufs[0].buf; \
671 emalloc(cpbufs[0].buf, char *, 2*cpbufs[0].bufsize, \
673 memcpy((cp = cpbufs[0].buf+cpbufs[0].bufsize), prev, \
674 cpbufs[0].bufsize); \
675 cpbufs[0].bufsize *= 2; \
676 if (prev != cpbufs[0].stackbuf) \
678 cend = cpbufs[0].buf+cpbufs[0].bufsize; \
684 * Check first for use of `count$'.
685 * If plain argument retrieval was used earlier, choke.
686 * Otherwise, return the requested argument.
687 * If not `count$' now, but it was used earlier, choke.
688 * If this format is more than total number of args, choke.
689 * Otherwise, return the current argument.
691 #define parse_next_arg() { \
694 msg(_("fatal: must use `count$' on all formats or none")); \
697 arg = the_args[argnum]; \
698 } else if (used_dollar) { \
699 msg(_("fatal: must use `count$' on all formats or none")); \
700 arg = 0; /* shutup the compiler */ \
702 } else if (cur_arg >= num_args) { \
703 arg = 0; /* shutup the compiler */ \
707 arg = the_args[cur_arg]; \
715 s0 = s1 = fmt_string;
733 lj = alt = big_flag = bigbig_flag = small_flag = FALSE;
740 if (n0-- == 0) /* ran out early! */
743 switch (cs1 = *s1++) {
744 case (-1): /* dummy case to allow for checking */
747 break; /* reject as a valid format */
753 * The C99 standard pages 274 and 279 seem to imply that
754 * since there's no arg converted, the field width doesn't
755 * apply. The code already was that way, but this
756 * comment documents it, at least in the code.
759 const char *msg = NULL;
761 if (fw && ! have_prec)
762 msg = _("field width is ignored for `%%' specifier");
763 else if (fw == 0 && have_prec)
764 msg = _("precision is ignored for `%%' specifier");
765 else if (fw && have_prec)
766 msg = _("field width and precision are ignored for `%%' specifier");
777 * Only turn on zero_flag if we haven't seen
778 * the field width or precision yet. Otherwise,
779 * screws up floating point formatting.
800 * with a negative precision *cur is already set
801 * to -1, so it will remain negative, but we have
802 * to "eat" precision digits in any case
804 while (n0 > 0 && *s1 >= '0' && *s1 <= '9') {
806 *cur = *cur * 10 + *s1++ - '0';
808 if (prec < 0) /* negative precision is discarded */
812 if (n0 == 0) /* badly formatted control string */
816 if (do_traditional) {
817 msg(_("fatal: `$' is not permitted in awk formats"));
826 msg(_("fatal: arg count with `$' must be > 0"));
829 if (argnum >= num_args) {
830 msg(_("fatal: arg count %ld greater than total number of supplied arguments"), argnum);
834 msg(_("fatal: `$' not permitted after period in format"));
842 if (! do_traditional && isdigit((unsigned char) *s1)) {
845 for (; n0 > 0 && *s1 && isdigit((unsigned char) *s1); s1++, n0--) {
850 msg(_("fatal: no `$' supplied for positional field width or precision"));
856 if (val >= num_args) {
864 *cur = force_number(arg);
865 if (*cur < 0 && cur == &fw) {
877 case ' ': /* print ' ' or '-' */
878 /* 'space' flag is ignored */
879 /* if '+' already present */
880 if (signchar != FALSE)
883 case '+': /* print '+' or '-' */
893 fill = sp; /* if left justified then other */
894 lj++; /* filling is ignored */
906 #if defined(HAVE_LOCALE_H)
907 /* allow quote_flag if there is a thousands separator. */
908 if (loc.thousands_sep[0] != '\0')
918 static short warned = FALSE;
920 if (do_lint && ! warned) {
921 lintwarn(_("`l' is meaningless in awk formats; ignored"));
925 msg(_("fatal: `l' is not permitted in POSIX awk formats"));
935 static short warned = FALSE;
937 if (do_lint && ! warned) {
938 lintwarn(_("`L' is meaningless in awk formats; ignored"));
942 msg(_("fatal: `L' is not permitted in POSIX awk formats"));
952 static short warned = FALSE;
954 if (do_lint && ! warned) {
955 lintwarn(_("`h' is meaningless in awk formats; ignored"));
959 msg(_("fatal: `h' is not permitted in POSIX awk formats"));
968 /* user input that looks numeric is numeric */
969 if ((arg->flags & (MAYBE_NUM|NUMBER)) == MAYBE_NUM)
970 (void) force_number(arg);
971 if (arg->flags & NUMBER) {
972 uval = (uintmax_t) arg->numbr;
974 if (gawk_mb_cur_max > 1) {
980 memset(& mbs, 0, sizeof(mbs));
983 count = wcrtomb(buf, wc, & mbs);
985 || count == (size_t)-1
986 || count == (size_t)-2)
989 memcpy(cpbuf, buf, count);
999 if (do_lint && uval > 255) {
1000 lintwarn("[s]printf: value %g is too big for %%c format",
1009 * As per POSIX, only output first character of a
1010 * string value. Thus, we ignore any provided
1011 * precision, forcing it to 1. (Didn't this
1012 * used to work? 6/2003.)
1017 * First character can be multiple bytes if
1018 * it's a multibyte character. Grr.
1020 if (gawk_mb_cur_max > 1) {
1024 memset(& state, 0, sizeof(state));
1025 count = mbrlen(cp, arg->stlen, & state);
1027 || count == (size_t)-1
1028 || count == (size_t)-2)
1039 need_format = FALSE;
1041 arg = force_string(arg);
1042 if (fw == 0 && ! have_prec)
1045 char_count = mbc_char_count(arg->stptr, arg->stlen);
1046 if (! have_prec || prec > char_count)
1053 need_format = FALSE;
1055 tmpval = force_number(arg);
1057 * Check for Nan or Inf.
1059 if (isnan(tmpval) || isinf(tmpval))
1062 tmpval = double_to_int(tmpval);
1065 * ``The result of converting a zero value with a
1066 * precision of zero is no characters.''
1068 if (have_prec && prec == 0 && tmpval == 0)
1076 /* avoid printing -0 */
1081 * Use snprintf return value to tell if there
1082 * is enough room in the buffer or not.
1084 while ((i = snprintf(cpbufs[1].buf,
1085 cpbufs[1].bufsize, "%.0f",
1087 cpbufs[1].bufsize) {
1088 if (cpbufs[1].buf == cpbufs[1].stackbuf)
1089 cpbufs[1].buf = NULL;
1091 cpbufs[1].bufsize += ((i > cpbufs[1].bufsize) ?
1092 i : cpbufs[1].bufsize);
1095 cpbufs[1].bufsize *= 2;
1096 assert(cpbufs[1].bufsize > 0);
1097 erealloc(cpbufs[1].buf, char *,
1098 cpbufs[1].bufsize, "format_tree");
1102 chp = &cpbufs[1].buf[i-1];
1107 #if defined(HAVE_LOCALE_H)
1108 if (quote_flag && loc.grouping[ii] && ++jj == loc.grouping[ii]) {
1109 if (i) /* only add if more digits coming */
1110 PREPEND(loc.thousands_sep[0]); /* XXX - assumption it's one char */
1111 if (loc.grouping[ii+1] == 0)
1112 jj = 0; /* keep using current val in loc.grouping[ii] */
1113 else if (loc.grouping[ii+1] == CHAR_MAX)
1123 /* add more output digits to match the precision */
1125 while (cend - cp < prec)
1134 * When to fill with zeroes is of course not simple.
1135 * First: No zero fill if left-justifying.
1136 * Next: There seem to be two cases:
1137 * A '0' without a precision, e.g. %06d
1138 * A precision with no field width, e.g. %.10d
1139 * Any other case, we don't want to fill with zeroes.
1142 && ((zero_flag && ! have_prec)
1143 || (fw == 0 && have_prec)))
1148 if (fw > prec && ! lj && fill != sp
1149 && (*cp == '-' || signchar)) {
1157 chbuf = Uchbuf; /* FALL THROUGH */
1159 base += 6; /* FALL THROUGH */
1161 base += 2; /* FALL THROUGH */
1164 need_format = FALSE;
1166 tmpval = force_number(arg);
1169 * ``The result of converting a zero value with a
1170 * precision of zero is no characters.''
1172 * If I remember the ANSI C standard, though,
1173 * it says that for octal conversions
1174 * the precision is artificially increased
1175 * to add an extra 0 if # is supplied.
1177 * printf("%#.0o\n", 0);
1178 * prints a single 0.
1180 if (! alt && have_prec && prec == 0 && tmpval == 0)
1184 uval = (uintmax_t) (intmax_t) tmpval;
1185 if ((AWKNUM)(intmax_t)uval !=
1186 double_to_int(tmpval))
1189 uval = (uintmax_t) tmpval;
1190 if ((AWKNUM)uval != double_to_int(tmpval))
1194 * When to fill with zeroes is of course not simple.
1195 * First: No zero fill if left-justifying.
1196 * Next: There seem to be two cases:
1197 * A '0' without a precision, e.g. %06d
1198 * A precision with no field width, e.g. %.10d
1199 * Any other case, we don't want to fill with zeroes.
1202 && ((zero_flag && ! have_prec)
1203 || (fw == 0 && have_prec)))
1207 PREPEND(chbuf[uval % base]);
1209 #if defined(HAVE_LOCALE_H)
1210 if (base == 10 && quote_flag && loc.grouping[ii] && ++jj == loc.grouping[ii]) {
1211 if (uval) /* only add if more digits coming */
1212 PREPEND(loc.thousands_sep[0]); /* XXX --- assumption it's one char */
1213 if (loc.grouping[ii+1] == 0)
1214 jj = 0; /* keep using current val in loc.grouping[ii] */
1215 else if (loc.grouping[ii+1] == CHAR_MAX)
1225 /* add more output digits to match the precision */
1227 while (cend - cp < prec)
1231 if (alt && tmpval != 0) {
1240 } else if (base == 8)
1255 if (fw == 0 && ! have_prec)
1257 else if (gawk_mb_cur_max > 1 && (cs1 == 's' || cs1 == 'c')) {
1258 assert(cp == arg->stptr || cp == cpbuf);
1259 copy_count = mbc_byte_count(arg->stptr, prec);
1261 bchunk(cp, copy_count);
1270 /* out of range - emergency use of %g format */
1272 lintwarn(_("[s]printf: value %g is out of range for `%%%c' format"),
1273 (double) tmpval, cs1);
1278 #if ! defined(PRINTF_HAS_F_FORMAT) || PRINTF_HAS_F_FORMAT != 1
1287 need_format = FALSE;
1289 tmpval = force_number(arg);
1292 prec = DEFAULT_G_PRECISION;
1293 chksize(fw + prec + 9); /* 9 == slop */
1310 #if defined(LC_NUMERIC)
1311 if (quote_flag && ! use_lc_numeric)
1312 setlocale(LC_NUMERIC, "");
1316 while ((n = snprintf(obufout, ofre, cpbuf,
1317 (int) fw, (int) prec,
1318 (double) tmpval)) >= ofre)
1321 #if defined(LC_NUMERIC)
1322 if (quote_flag && ! use_lc_numeric)
1323 setlocale(LC_NUMERIC, "C");
1325 len = strlen(obufout);
1331 if (do_lint && isalpha(cs1))
1332 lintwarn(_("ignoring unknown format specifier character `%c': no argument converted"), cs1);
1336 msg("%s\n\t`%s'\n\t%*s%s",
1337 _("fatal: not enough arguments to satisfy format string"),
1338 fmt_string, (int) (s1 - fmt_string - 1), "",
1339 _("^ ran out for this one"));
1346 _("[s]printf: format specifier does not have control letter"));
1347 if (cur_arg < num_args)
1349 _("too many arguments supplied for format string"));
1351 bchunk(s0, s1 - s0);
1352 r = make_str_node(obuf, obufout - obuf, ALREADY_MALLOCED);
1357 size_t count = sizeof(cpbufs)/sizeof(cpbufs[0]);
1358 for (k = 0; k < count; k++) {
1359 if (cpbufs[k].buf != cpbufs[k].stackbuf)
1360 efree(cpbufs[k].buf);
1366 gawk_exit(EXIT_FATAL);
1371 /* printf_common --- common code for sprintf and printf */
1374 printf_common(int nargs)
1379 assert(nargs <= max_args);
1380 for (i = 1; i <= nargs; i++) {
1381 tmp = args_array[nargs - i] = POP();
1382 if (tmp->type == Node_var_array) {
1384 DEREF(args_array[nargs - i]);
1385 fatal(_("attempt to use array `%s' in a scalar context"), array_vname(tmp));
1389 force_string(args_array[0]);
1390 r = format_tree(args_array[0]->stptr, args_array[0]->stlen, args_array, nargs);
1391 for (i = 0; i < nargs; i++)
1392 DEREF(args_array[i]);
1396 /* do_sprintf --- perform sprintf */
1399 do_sprintf(int nargs)
1402 r = printf_common(nargs);
1404 gawk_exit(EXIT_FATAL);
1409 /* do_printf --- perform printf, including redirection */
1412 do_printf(int nargs, int redirtype)
1416 struct redirect *rp = NULL;
1417 int errflg; /* not used, sigh */
1418 NODE *redir_exp = NULL;
1421 if (do_traditional) {
1423 lintwarn(_("printf: no arguments"));
1424 if (redirtype != 0) {
1426 if (redir_exp->type != Node_val)
1427 fatal(_("attempt to use array `%s' in a scalar context"), array_vname(redir_exp));
1428 rp = redirect(redir_exp, redirtype, & errflg);
1432 return; /* bwk accepts it silently */
1434 fatal(_("printf: no arguments"));
1437 if (redirtype != 0) {
1438 redir_exp = PEEK(nargs);
1439 if (redir_exp->type != Node_val)
1440 fatal(_("attempt to use array `%s' in a scalar context"), array_vname(redir_exp));
1441 rp = redirect(redir_exp, redirtype, & errflg);
1447 tmp = printf_common(nargs);
1448 if (redir_exp != NULL) {
1457 efwrite(tmp->stptr, sizeof(char), tmp->stlen, fp, "printf", rp, TRUE);
1458 if (rp != NULL && (rp->flag & RED_TWOWAY) != 0)
1462 gawk_exit(EXIT_FATAL);
1465 /* do_sqrt --- do the sqrt function */
1474 if (do_lint && (tmp->flags & (NUMCUR|NUMBER)) == 0)
1475 lintwarn(_("sqrt: received non-numeric argument"));
1476 arg = (double) force_number(tmp);
1479 warning(_("sqrt: called with negative argument %g"), arg);
1480 return make_number((AWKNUM) sqrt(arg));
1483 /* do_substr --- do the substr function */
1486 do_substr(int nargs)
1492 double d_index = 0, d_length = 0;
1496 POP_NUMBER(d_length);
1497 POP_NUMBER(d_index);
1501 if (! (d_length >= 1)) {
1502 if (do_lint == LINT_ALL)
1503 lintwarn(_("substr: length %g is not >= 1"), d_length);
1504 else if (do_lint == LINT_INVALID && ! (d_length >= 0))
1505 lintwarn(_("substr: length %g is not >= 0"), d_length);
1507 return Nnull_string;
1510 if (double_to_int(d_length) != d_length)
1512 _("substr: non-integer length %g will be truncated"),
1515 if (d_length > SIZE_MAX)
1517 _("substr: length %g too big for string indexing, truncating to %g"),
1518 d_length, (double) SIZE_MAX);
1520 if (d_length < SIZE_MAX)
1526 /* the weird `! (foo)' tests help catch NaN values. */
1527 if (! (d_index >= 1)) {
1529 lintwarn(_("substr: start index %g is invalid, using 1"),
1533 if (do_lint && double_to_int(d_index) != d_index)
1534 lintwarn(_("substr: non-integer start index %g will be truncated"),
1537 /* awk indices are from 1, C's are from 0 */
1538 if (d_index <= SIZE_MAX)
1543 if (nargs == 2) { /* third arg. missing */
1544 /* use remainder of string */
1545 length = t1->stlen - indx; /* default to bytes */
1547 if (gawk_mb_cur_max > 1) {
1548 t1 = force_wstring(t1);
1549 if (t1->wstlen > 0) /* use length of wide char string if we have one */
1550 length = t1->wstlen - indx;
1553 d_length = length; /* set here in case used in diagnostics, below */
1556 if (t1->stlen == 0) {
1557 /* substr("", 1, 0) produces a warning only if LINT_ALL */
1558 if (do_lint && (do_lint == LINT_ALL || ((indx | length) != 0)))
1559 lintwarn(_("substr: source string is zero length"));
1561 return Nnull_string;
1564 /* get total len of input string, for following checks */
1566 if (gawk_mb_cur_max > 1) {
1567 t1 = force_wstring(t1);
1568 src_len = t1->wstlen;
1571 src_len = t1->stlen;
1573 if (indx >= src_len) {
1575 lintwarn(_("substr: start index %g is past end of string"),
1578 return Nnull_string;
1580 if (length > src_len - indx) {
1583 _("substr: length %g at start index %g exceeds length of first argument (%lu)"),
1584 d_length, d_index, (unsigned long int) src_len);
1585 length = src_len - indx;
1589 /* force_wstring() already called */
1590 if (gawk_mb_cur_max == 1 || t1->wstlen == t1->stlen)
1591 /* single byte case */
1592 r = make_string(t1->stptr + indx, length);
1594 /* multibyte case, more work */
1601 * Convert the wide chars in t1->wstptr back into m.b. chars.
1602 * This is pretty grotty, but it's the most straightforward
1605 memset(& mbs, 0, sizeof(mbs));
1606 emalloc(substr, char *, (length * gawk_mb_cur_max) + 2, "do_substr");
1607 wp = t1->wstptr + indx;
1608 for (cp = substr; length > 0; length--) {
1609 result = wcrtomb(cp, *wp, & mbs);
1610 if (result == (size_t) -1) /* what to do? break seems best */
1616 r = make_str_node(substr, cp - substr, ALREADY_MALLOCED);
1619 r = make_string(t1->stptr + indx, length);
1626 /* do_strftime --- format a time stamp */
1629 do_strftime(int nargs)
1631 NODE *t1, *t2, *t3, *ret;
1636 size_t buflen, bufsize;
1644 /* set defaults first */
1645 format = def_strftime_format; /* traditional date format */
1646 formatlen = strlen(format);
1647 (void) time(& fclock); /* current time of day */
1650 if (PROCINFO_node != NULL) {
1651 sub = make_string("strftime", 8);
1652 val = in_array(PROCINFO_node, sub);
1656 if (do_lint && (val->flags & STRING) == 0)
1657 lintwarn(_("strftime: format value in PROCINFO[\"strftime\"] has numeric type"));
1658 val = force_string(val);
1659 format = val->stptr;
1660 formatlen = val->stlen;
1664 t1 = t2 = t3 = NULL;
1665 if (nargs > 0) { /* have args */
1670 if ((t3->flags & (NUMCUR|NUMBER)) != 0)
1671 do_gmt = (t3->numbr != 0);
1673 do_gmt = (t3->stlen > 0);
1679 if (do_lint && (t2->flags & (NUMCUR|NUMBER)) == 0)
1680 lintwarn(_("strftime: received non-numeric second argument"));
1681 clock_val = (long) force_number(t2);
1683 fatal(_("strftime: second argument less than 0 or too big for time_t"));
1684 fclock = (time_t) clock_val;
1689 if (do_lint && (tmp->flags & (STRING|STRCUR)) == 0)
1690 lintwarn(_("strftime: received non-string first argument"));
1691 t1 = force_string(tmp);
1693 formatlen = t1->stlen;
1694 if (formatlen == 0) {
1696 lintwarn(_("strftime: received empty format string"));
1698 return make_string("", 0);
1703 tm = gmtime(& fclock);
1705 tm = localtime(& fclock);
1708 bufsize = sizeof(buf);
1711 buflen = strftime(bufp, bufsize, format, tm);
1713 * buflen can be zero EITHER because there's not enough
1714 * room in the string, or because the control command
1715 * goes to the empty string. Make a reasonable guess that
1716 * if the buffer is 1024 times bigger than the length of the
1717 * format string, it's not failing for lack of room.
1718 * Thanks to Paul Eggert for pointing out this issue.
1720 if (buflen > 0 || bufsize >= 1024 * formatlen)
1724 emalloc(bufp, char *, bufsize, "do_strftime");
1726 erealloc(bufp, char *, bufsize, "do_strftime");
1728 ret = make_string(bufp, buflen);
1736 /* do_systime --- get the time of day */
1739 do_systime(int nargs ATTRIBUTE_UNUSED)
1743 (void) time(& lclock);
1744 return make_number((AWKNUM) lclock);
1747 /* do_mktime --- turn a time string into a timestamp */
1750 do_mktime(int nargs)
1755 int month, day, hour, minute, second, count;
1756 int dst = -1; /* default is unknown */
1761 if (do_lint && (t1->flags & (STRING|STRCUR)) == 0)
1762 lintwarn(_("mktime: received non-string argument"));
1763 t1 = force_string(t1);
1765 save = t1->stptr[t1->stlen];
1766 t1->stptr[t1->stlen] = '\0';
1768 count = sscanf(t1->stptr, "%ld %d %d %d %d %d %d",
1769 & year, & month, & day,
1770 & hour, & minute, & second,
1773 if (do_lint /* Ready? Set! Go: */
1774 && ( (second < 0 || second > 60)
1775 || (minute < 0 || minute > 60)
1776 || (hour < 0 || hour > 23)
1777 || (day < 1 || day > 31)
1778 || (month < 1 || month > 12) ))
1779 lintwarn(_("mktime: at least one of the values is out of the default range"));
1781 t1->stptr[t1->stlen] = save;
1786 || year < INT_MIN + 1900
1787 || year - 1900 > INT_MAX)
1788 return make_number((AWKNUM) -1);
1790 memset(& then, '\0', sizeof(then));
1791 then.tm_sec = second;
1792 then.tm_min = minute;
1793 then.tm_hour = hour;
1795 then.tm_mon = month - 1;
1796 then.tm_year = year - 1900;
1797 then.tm_isdst = dst;
1799 then_stamp = mktime(& then);
1800 return make_number((AWKNUM) then_stamp);
1803 /* do_system --- run an external command */
1806 do_system(int nargs)
1814 fatal(_("'system' function not allowed in sandbox mode"));
1816 (void) flush_io(); /* so output is synchronous with gawk's */
1818 if (do_lint && (tmp->flags & (STRING|STRCUR)) == 0)
1819 lintwarn(_("system: received non-string argument"));
1820 cmd = force_string(tmp)->stptr;
1823 /* ensure arg to system is zero-terminated */
1824 save = cmd[tmp->stlen];
1825 cmd[tmp->stlen] = '\0';
1827 os_restore_mode(fileno(stdin));
1830 ret = WEXITSTATUS(ret);
1831 if ((BINMODE & 1) != 0)
1832 os_setbinmode(fileno(stdin), O_BINARY);
1834 cmd[tmp->stlen] = save;
1837 return make_number((AWKNUM) ret);
1840 extern NODE **fmt_list; /* declared in eval.c */
1842 /*
 * do_print --- print items, separated by OFS, terminated with ORS
 *
 * nargs is the number of items to print.  When redirtype is nonzero the
 * redirection expression is fetched with PEEK(nargs) and opened through
 * redirect() before any item is popped.
 */
1845 do_print(int nargs, int redirtype)
1847 struct redirect *rp = NULL;
1848 int errflg; /* not used, sigh */
1851 NODE *redir_exp = NULL;
1854 assert(nargs <= max_args);
1856 if (redirtype != 0) {
1857 redir_exp = PEEK(nargs);
1858 if (redir_exp->type != Node_val)
1859 fatal(_("attempt to use array `%s' in a scalar context"), array_vname(redir_exp));
1860 rp = redirect(redir_exp, redirtype, & errflg);
1866 for (i = 1; i <= nargs; i++) { /* pop the items into args_array[1..nargs] */
1867 tmp = args_array[i] = POP();
1868 if (tmp->type == Node_var_array) {
1870 DEREF(args_array[i]);
1871 fatal(_("attempt to use array `%s' in a scalar context"), array_vname(tmp));
1873 if (do_lint && tmp->type == Node_var_new)
1874 lintwarn(_("reference to uninitialized variable `%s'"),
1876 if ((tmp->flags & (NUMBER|STRING)) == NUMBER) { /* numeric only: needs a string form for output */
1877 if (OFMTidx == CONVFMTidx) /* same format index: the ordinary string conversion will do */
1878 (void) force_string(tmp);
1880 args_array[i] = format_val(OFMT, OFMTidx, tmp); /* otherwise format with OFMT */
1884 if (redir_exp != NULL) {
1890 for (i = nargs; i > 0; i--) /* drop the item references without writing anything */
1891 DEREF(args_array[i]);
1895 for (i = nargs; i > 0; i--) { /* emit in the reverse of pop order */
1896 efwrite(args_array[i]->stptr, sizeof(char), args_array[i]->stlen, fp, "print", rp, FALSE);
1897 DEREF(args_array[i]);
1898 if (i != 1 && OFSlen > 0) /* no separator after the final item (i == 1) */
1899 efwrite(OFS, sizeof(char), (size_t) OFSlen,
1900 fp, "print", rp, FALSE);
1904 efwrite(ORS, sizeof(char), (size_t) ORSlen, fp, "print", rp, TRUE); /* NOTE(review): TRUE only on this last write -- presumably a flush request; confirm in efwrite() */
1906 if (rp != NULL && (rp->flag & RED_TWOWAY) != 0)
1910 /*
 * do_print_rec --- special case printing of $0, for speed
 *
 * Writes field 0 (f0) followed by ORS; no per-item formatting or OFS
 * handling is involved.
 */
1913 do_print_rec(int nargs, int redirtype)
1917 struct redirect *rp = NULL;
1918 int errflg; /* not used, sigh */
1919 NODE *redir_exp = NULL;
1922 if (redirtype != 0) {
1924 rp = redirect(redir_exp, redirtype, & errflg);
1936 (void) get_field(0L, NULL); /* rebuild record */
1940 if (do_lint && f0 == Nnull_string)
1941 lintwarn(_("reference to uninitialized field `$%d'"), 0);
1943 efwrite(f0->stptr, sizeof(char), f0->stlen, fp, "print", rp, FALSE); /* the record text */
1946 efwrite(ORS, sizeof(char), (size_t) ORSlen, fp, "print", rp, TRUE); /* then the record terminator */
1948 if (rp != NULL && (rp->flag & RED_TWOWAY) != 0)
1954 /* is_wupper --- function version of iswupper for passing function pointers */
1957 is_wupper(wchar_t c)
1962 /* is_wlower --- function version of iswlower for passing function pointers */
1965 is_wlower(wchar_t c)
1970 /* to_wlower --- function version of towlower for passing function pointers */
1973 to_wlower(wchar_t c)
1978 /* to_wupper --- function version of towupper for passing function pointers */
1981 to_wupper(wchar_t c)
1986 /*
 * wide_change_case --- generic case converter for wide characters
 *
 * Walks the wlen wide characters starting at wstr.  is_x is the
 * predicate and to_y the mapping supplied by the caller.
 */
1989 wide_change_case(wchar_t *wstr,
1991 int (*is_x)(wchar_t c),
1992 int (*to_y)(wchar_t c))
1997 for (i = 0, wcp = wstr; i < wlen; i++, wcp++) /* one step per wide character */
2002 /* wide_toupper --- map a wide string to upper case */
2005 wide_toupper(wchar_t *wstr, size_t wlen)
2007 wide_change_case(wstr, wlen, is_wlower, to_wupper); /* predicate: is lower case; mapping: to upper case */
2010 /* wide_tolower --- map a wide string to lower case */
2013 wide_tolower(wchar_t *wstr, size_t wlen)
2015 wide_change_case(wstr, wlen, is_wupper, to_wlower); /* predicate: is upper case; mapping: to lower case */
2019 /*
 * do_tolower --- lower case a string
 *
 * The conversion is applied to a fresh copy (t2) of the argument's
 * string value, not to the argument itself.
 */
2022 do_tolower(int nargs)
2027 if (do_lint && (t1->flags & (STRING|STRCUR)) == 0)
2028 lintwarn(_("tolower: received non-string argument"));
2029 t1 = force_string(t1);
2030 t2 = make_string(t1->stptr, t1->stlen); /* copy that will be modified */
2032 if (gawk_mb_cur_max == 1) { /* single-byte locale: walk the bytes of t2 */
2033 unsigned char *cp, *cp2;
2035 for (cp = (unsigned char *)t2->stptr,
2036 cp2 = (unsigned char *)(t2->stptr + t2->stlen); /* cp2: one past the last byte */
2044 wide_tolower(t2->wstptr, t2->wstlen); /* multibyte case: convert the wide-character form of t2 */
2053 /*
 * do_toupper --- upper case a string
 *
 * The conversion is applied to a fresh copy (t2) of the argument's
 * string value, not to the argument itself.
 */
2056 do_toupper(int nargs)
2061 if (do_lint && (t1->flags & (STRING|STRCUR)) == 0)
2062 lintwarn(_("toupper: received non-string argument"));
2063 t1 = force_string(t1);
2064 t2 = make_string(t1->stptr, t1->stlen); /* copy that will be modified */
2066 if (gawk_mb_cur_max == 1) { /* single-byte locale: walk the bytes of t2 */
2067 unsigned char *cp, *cp2;
2069 for (cp = (unsigned char *)t2->stptr,
2070 cp2 = (unsigned char *)(t2->stptr + t2->stlen); /* cp2: one past the last byte */
2078 wide_toupper(t2->wstptr, t2->wstlen); /* multibyte case: convert the wide-character form of t2 */
2087 /*
 * do_atan2 --- do the atan2 function
 *
 * Returns atan2(d1, d2), where d1 and d2 are the numeric values of the
 * scalars t1 and t2.
 */
2095 POP_TWO_SCALARS(t1, t2);
2097 if ((t1->flags & (NUMCUR|NUMBER)) == 0)
2098 lintwarn(_("atan2: received non-numeric first argument"));
2099 if ((t2->flags & (NUMCUR|NUMBER)) == 0)
2100 lintwarn(_("atan2: received non-numeric second argument"));
2102 d1 = force_number(t1); /* converted only after the flag checks above */
2103 d2 = force_number(t2);
2106 return make_number((AWKNUM) atan2(d1, d2)); /* C atan2(y, x): d1 is y, d2 is x */
2109 /* do_sin --- do the sin function: returns sin() of the argument's numeric value */
2118 if (do_lint && (tmp->flags & (NUMCUR|NUMBER)) == 0)
2119 lintwarn(_("sin: received non-numeric argument"));
2120 d = sin((double) force_number(tmp)); /* argument is passed to C sin() unchanged */
2122 return make_number((AWKNUM) d);
2125 /* do_cos --- do the cos function: returns cos() of the argument's numeric value */
2134 if (do_lint && (tmp->flags & (NUMCUR|NUMBER)) == 0)
2135 lintwarn(_("cos: received non-numeric argument"));
2136 d = cos((double) force_number(tmp)); /* argument is passed to C cos() unchanged */
2138 return make_number((AWKNUM) d);
2141 /* do_rand --- do the rand function */
2143 static int firstrand = TRUE; /* file-scope flag, initially TRUE */
2144 /* Some systems require this array to be integer aligned. Sigh. */
2145 #define SIZEOF_STATE 256
2146 static uint32_t istate[SIZEOF_STATE/sizeof(uint32_t)]; /* SIZEOF_STATE bytes, declared as uint32_t for alignment */
2147 static char *const state = (char *const) istate; /* same storage, viewed as the char buffer initstate()/setstate() take */
2151 do_rand(int nargs ATTRIBUTE_UNUSED)
2154 (void) initstate((unsigned) 1, state, SIZEOF_STATE); /* seed of 1 */
2155 /* don't need to srandom(1), initstate() does it for us. */
2160 * Per historical practice and POSIX, return value N is
2164 return make_number((AWKNUM) (random() % GAWK_RANDOM_MAX) / GAWK_RANDOM_MAX); /* cast binds to the remainder, so the division is floating point; remainder < divisor keeps the result below 1 */
2167 /*
 * do_srand --- seed the random number generator
 *
 * Returns the seed that was in effect before this call (save_seed is
 * read into ret before it is overwritten).
 */
2173 static long save_seed = 1; /* last seed installed; starts at 1 */
2174 long ret = save_seed; /* SVR4 awk srand returns previous seed */
2177 (void) initstate((unsigned) 1, state, SIZEOF_STATE);
2178 /* don't need to srandom(1), we're changing the seed below */
2180 (void) setstate(state);
2184 srandom((unsigned int) (save_seed = (long) time((time_t *) 0))); /* seed from the current time, remembered in save_seed */
2187 if (do_lint && (tmp->flags & (NUMCUR|NUMBER)) == 0)
2188 lintwarn(_("srand: received non-numeric argument"));
2189 srandom((unsigned int) (save_seed = (long) force_number(tmp))); /* seed from the argument, truncated to long and then unsigned int */
2192 return make_number((AWKNUM) ret);
2195 /* do_match --- match a regexp, set RSTART and RLENGTH,
2196 * optional third arg is array filled with text of
2197 * subpatterns enclosed in parens and start and len info.
2203 NODE *tre, *t1, *dest, *it;
2204 int rstart, len, ii;
2211 size_t amt, oldamt = 0, ilen, slen;
2216 if (nargs == 3) { /* 3rd optional arg for the subpatterns */
2218 if (dest->type != Node_var_array)
2219 fatal(_("match: third argument is not an array"));
2223 rp = re_update(tre);
2226 rstart = research(rp, t1->stptr, 0, t1->stlen, RE_NEED_START);
2227 if (rstart >= 0) { /* match succeeded */
2228 size_t *wc_indices = NULL;
2230 rlength = REEND(rp, t1->stptr) - RESTART(rp, t1->stptr); /* byte length */
2232 if (rlength > 0 && gawk_mb_cur_max > 1) { /* multibyte locale: re-express start and length through wc_indices */
2233 t1 = str2wstr(t1, & wc_indices);
2234 rlength = wc_indices[rstart + rlength - 1] - wc_indices[rstart] + 1; /* index of last matched byte minus index of first, inclusive */
2235 rstart = wc_indices[rstart];
2238 rstart++; /* now it's 1-based indexing */
2240 /* Build the array only if the caller wants the optional subpatterns */
2242 subsepstr = SUBSEP_node->var_value->stptr;
2243 subseplen = SUBSEP_node->var_value->stlen;
2245 for (ii = 0; ii < NUMSUBPATS(rp, t1->stptr); ii++) {
2247 * Loop over all the subpats; some of them may have
2248 * matched even if all of them did not.
2250 if ((s = SUBPATSTART(rp, t1->stptr, ii)) != -1) { /* -1: this subpattern did not take part in the match */
2251 size_t subpat_start;
2256 start = t1->stptr + s;
2258 subpat_len = len = SUBPATEND(rp, t1->stptr, ii) - s; /* byte length of the subpattern text */
2260 if (len > 0 && gawk_mb_cur_max > 1) { /* same wc_indices conversion as for the whole match */
2261 subpat_start = wc_indices[s];
2262 subpat_len = wc_indices[s + len - 1] - subpat_start + 1;
2266 it = make_string(start, len);
2267 it->flags |= MAYBE_NUM; /* user input */
2269 sub = make_number((AWKNUM) (ii)); /* subscript ii: the matched text itself */
2270 lhs = assoc_lookup(dest, sub, FALSE);
2275 sprintf(buff, "%d", ii);
2276 ilen = strlen(buff);
2277 amt = ilen + subseplen + strlen("length") + 2; /* sized for the longer suffix; the "start" key uses the same buffer */
2280 emalloc(buf, char *, amt, "do_match");
2281 } else if (amt > oldamt) { /* grow only when this key needs more room than before */
2282 erealloc(buf, char *, amt, "do_match");
2285 memcpy(buf, buff, ilen); /* key: ii SUBSEP "start" */
2286 memcpy(buf + ilen, subsepstr, subseplen);
2287 memcpy(buf + ilen + subseplen, "start", 6); /* 6 == strlen("start") + 1: the NUL is copied too */
2289 slen = ilen + subseplen + 5;
2291 it = make_number((AWKNUM) subpat_start + 1); /* + 1: stored position is 1-based */
2292 sub = make_string(buf, slen);
2293 lhs = assoc_lookup(dest, sub, FALSE);
2298 memcpy(buf, buff, ilen); /* key: ii SUBSEP "length" */
2299 memcpy(buf + ilen, subsepstr, subseplen);
2300 memcpy(buf + ilen + subseplen, "length", 7); /* 7 == strlen("length") + 1 */
2302 slen = ilen + subseplen + 6;
2304 it = make_number((AWKNUM) subpat_len);
2305 sub = make_string(buf, slen);
2306 lhs = assoc_lookup(dest, sub, FALSE);
2315 if (wc_indices != NULL)
2317 } else { /* match failed */
2323 unref(RSTART_node->var_value); /* replace the old RSTART value */
2324 RSTART_node->var_value = make_number((AWKNUM) rstart);
2325 unref(RLENGTH_node->var_value); /* replace the old RLENGTH value */
2326 RLENGTH_node->var_value = make_number((AWKNUM) rlength);
2327 return make_number((AWKNUM) rstart); /* the function's value is the same number stored in RSTART */
2330 /* do_sub --- do the work for sub, gsub, and gensub */
2333 * Gsub can be tricksy; particularly when handling the case of null strings.
2334 * The following awk code was useful in debugging problems. It is too bad
2335 * that it does not readily translate directly into the C code, below.
2337 * #! /usr/local/bin/mawk -f
2340 * TRUE = 1; FALSE = 0
2341 * print "--->", mygsub("abc", "b+", "FOO")
2342 * print "--->", mygsub("abc", "x*", "X")
2343 * print "--->", mygsub("abc", "b*", "X")
2344 * print "--->", mygsub("abc", "c", "X")
2345 * print "--->", mygsub("abc", "c+", "X")
2346 * print "--->", mygsub("abc", "x*$", "X")
2349 * function mygsub(str, regex, replace, origstr, newstr, eosflag, nonzeroflag)
2352 * eosflag = nonzeroflag = FALSE
2353 * while (match(str, regex)) {
2354 * if (RLENGTH > 0) { # easy case
2355 * nonzeroflag = TRUE
2356 * if (RSTART == 1) { # match at front of string
2357 * newstr = newstr replace
2359 * newstr = newstr substr(str, 1, RSTART-1) replace
2361 * str = substr(str, RSTART+RLENGTH)
2362 * } else if (nonzeroflag) {
2363 * # last match was non-zero in length, and at the
2364 * # current character, we get a zero length match,
2365 * # which we don't really want, so skip over it
2366 * newstr = newstr substr(str, 1, 1)
2367 * str = substr(str, 2)
2368 * nonzeroflag = FALSE
2371 * if (RSTART == 1) {
2372 * newstr = newstr replace substr(str, 1, 1)
2373 * str = substr(str, 2)
2375 * return newstr str replace
2378 * if (length(str) == 0)
2384 * if (length(str) > 0)
2385 * newstr = newstr str # rest of string
2392 * 1/2004: The gawk sub/gsub behavior dates from 1996, when we proposed it
2393 * for POSIX. The proposal fell through the cracks, and the 2001 POSIX
2394 * standard chose a more simple behavior.
2396 * The relevant text is to be found on lines 6394-6407 (pages 166, 167) of the
2399 * sub(ere, repl[, in ])
2400 * Substitute the string repl in place of the first instance of the extended regular
2401 * expression ERE in string in and return the number of substitutions. An ampersand
2402 * ('&') appearing in the string repl shall be replaced by the string from in that
2403 * matches the ERE. An ampersand preceded with a backslash ('\') shall be
2404 * interpreted as the literal ampersand character. An occurrence of two consecutive
2405 * backslashes shall be interpreted as just a single literal backslash character. Any
2406 * other occurrence of a backslash (for example, preceding any other character) shall
2407 * be treated as a literal backslash character. Note that if repl is a string literal (the
2408 * lexical token STRING; see Grammar (on page 170)), the handling of the
2409 * ampersand character occurs after any lexical processing, including any lexical
2410 * backslash escape sequence processing. If in is specified and it is not an lvalue (see
2411 * Expressions in awk (on page 156)), the behavior is undefined. If in is omitted, awk
2412 * shall use the current record ($0) in its place.
2414 * 11/2010: The text in the 2008 standard is the same as just quoted. However, POSIX behavior
2415 * is now the default. This can change the behavior of awk programs. The old behavior
2420 * NB: `howmany' conflicts with a SunOS 4.x macro in <sys/param.h>.
2424 do_sub(int nargs, unsigned int flags)
/*
 * Shared worker for sub, gsub and gensub; `flags' selects the variant
 * (the GENSUB, GSUB and LITERAL bits are tested below).  For gensub the
 * result string is returned; otherwise the number of matches is returned
 * and, when something matched and the target is not a literal, *lhs is
 * replaced by the new string.
 */
2442 NODE *s; /* subst. pattern */
2443 NODE *t; /* string to make sub. in; $0 if none given */
2446 long how_many = 1; /* one substitution for sub, also gensub default */
2449 int lastmatchnonzero;
2450 char *mb_indices = NULL;
2452 if ((flags & GENSUB) != 0) {
2457 rp = re_update(tmp);
2459 t = POP_STRING(); /* original string */
2461 t1 = POP_SCALAR(); /* value of global flag */
2462 if ((t1->flags & (STRCUR|STRING)) != 0) {
2463 if (t1->stlen > 0 && (t1->stptr[0] == 'g' || t1->stptr[0] == 'G'))
2466 d = force_number(t1);
2468 if ((t1->flags & NUMCUR) != 0)
2474 d = force_number(t1);
2478 else if (d < LONG_MAX)
2481 how_many = LONG_MAX;
2483 warning(_("gensub: third argument of 0 treated as 1"));
2489 /* take care of regexp early, in case re_update is fatal */
2492 rp = re_update(tmp);
2494 if ((flags & GSUB) != 0)
2497 /* original string */
2499 if ((flags & LITERAL) != 0)
2502 lhs = POP_ADDRESS();
2503 t = force_string(*lhs);
2507 global = (how_many == -1); /* -1 is the "replace every match" marker */
2509 s = POP_STRING(); /* replacement text */
2510 decr_sp(); /* regexp, already updated above */
2512 /* do the search early to avoid work on non-match */
2513 if (research(rp, t->stptr, 0, t->stlen, RE_NEED_START) == -1 ||
2514 RESTART(rp, t->stptr) > t->stlen)
2521 buflen = textlen + 2;
2524 replend = repl + s->stlen;
2525 repllen = replend - repl;
2526 emalloc(buf, char *, buflen + 2, "do_sub");
2528 buf[buflen + 1] = '\0';
2532 * Some systems' malloc() can't handle being called with an
2533 * argument of zero. Thus we have to have some special case
2534 * code to check for `repllen == 0'. This can occur for
2536 * sub(/foo/, "", mystring)
2539 if (gawk_mb_cur_max > 1 && repllen > 0) { /* per-byte index of the replacement, multibyte locales only */
2540 emalloc(mb_indices, char *, repllen * sizeof(char), "do_sub");
2541 index_multibyte_buffer(repl, mb_indices, repllen);
2544 for (scan = repl; scan < replend; scan++) { /* first pass over the replacement text */
2545 if ((gawk_mb_cur_max == 1 || (repllen > 0 && mb_indices[scan - repl] == 1))
2546 && (*scan == '&')) {
2549 } else if (*scan == '\\') {
2550 if (flags & GENSUB) { /* gensub, behave sanely */
2551 if (isdigit((unsigned char) scan[1])) {
2554 } else { /* \q for any q --> q */
2558 } else if (do_posix) {
2559 /* \& --> &, \\ --> \ */
2560 if (scan[1] == '&' || scan[1] == '\\') {
2564 leave alone, it goes into the output */
2566 /* gawk default behavior since 1996 */
2567 if (strncmp(scan, "\\\\\\&", 4) == 0) {
2571 } else if (strncmp(scan, "\\\\&", 3) == 0) {
2572 /* \\& --> \<string> */
2576 } else if (scan[1] == '&') {
2581 leave alone, it goes into the output */
2586 lastmatchnonzero = FALSE;
2588 for (current = 1;; current++) { /* one iteration per match; `current' is the 1-based match number */
2590 matchstart = t->stptr + RESTART(rp, t->stptr);
2591 matchend = t->stptr + REEND(rp, t->stptr);
2594 * create the result, copying in parts of the original
2597 len = matchstart - text + repllen
2598 + ampersands * (matchend - matchstart); /* text before the match + replacement + one copy of the match per ampersand */
2600 while (buflen < (sofar + len + 1)) {
2602 erealloc(buf, char *, buflen, "do_sub"); /* tag now names this function, like the other two allocations here */
2605 for (scan = text; scan < matchstart; scan++)
2607 if (global || current == how_many) {
2609 * If the current match matched the null string,
2610 * and the last match didn't and did a replacement,
2611 * and the match of the null string is at the front of
2612 * the text (meaning right after end of the previous
2613 * replacement), then skip this one.
2615 if (matchstart == matchend
2617 && matchstart == text) {
2618 lastmatchnonzero = FALSE;
2623 * If replacing all occurrences, or this is the
2624 * match we want, copy in the replacement text,
2625 * making substitutions as we go.
2627 for (scan = repl; scan < replend; scan++)
2630 * Don't test repllen here. A simple "&" could
2631 * end up with repllen == 0.
2633 && (gawk_mb_cur_max == 1
2634 || mb_indices[scan - repl] == 1)
2636 for (cp = matchstart; cp < matchend; cp++)
2638 } else if (*scan == '\\'
2639 && (gawk_mb_cur_max == 1
2640 || (repllen > 0 && mb_indices[scan - repl] == 1))
2642 if (flags & GENSUB) { /* gensub, behave sanely */
2643 if (isdigit((unsigned char) scan[1])) {
2644 int dig = scan[1] - '0';
2645 if (dig < NUMSUBPATS(rp, t->stptr) && SUBPATSTART(rp, t->stptr, dig) != -1) { /* both tests refer to the same target string t */
2649 + SUBPATSTART(rp, t->stptr, dig);
2651 + SUBPATEND(rp, t->stptr, dig);
2653 for (cp = start; cp < end; cp++)
2657 } else /* \q for any q --> q */
2659 } else if (do_posix) {
2660 /* \& --> &, \\ --> \ */
2661 if (scan[1] == '&' || scan[1] == '\\')
2665 /* gawk default behavior since 1996 */
2666 if (strncmp(scan, "\\\\\\&", 4) == 0) {
2671 } else if (strncmp(scan, "\\\\&", 3) == 0) {
2672 /* \\& --> \<string> */
2674 for (cp = matchstart; cp < matchend; cp++)
2677 } else if (scan[1] == '&') {
2686 if (matchstart != matchend)
2687 lastmatchnonzero = TRUE;
2690 * don't want this match, skip over it by copying
2693 for (cp = matchstart; cp < matchend; cp++)
2697 /* catch the case of gsub(//, "blah", whatever), i.e. empty regexp */
2698 if (matchstart == matchend && matchend < text + textlen) {
2702 textlen = text + textlen - matchend; /* bytes still unscanned after this match */
2705 if ((current >= how_many && ! global)
2706 || ((long) textlen <= 0 && matchstart == matchend)
2707 || research(rp, t->stptr, text - t->stptr, textlen, RE_NEED_START) == -1)
2712 if (buflen - sofar - textlen - 1) { /* NOTE(review): true whenever buflen != sofar + textlen + 1, i.e. nearly always */
2713 buflen = sofar + textlen + 2;
2714 erealloc(buf, char *, buflen, "do_sub");
2717 for (scan = matchend; scan < text + textlen; scan++)
2722 if (mb_indices != NULL)
2728 if ((matches == 0 || (flags & LITERAL) != 0) && buf != NULL)
2731 if (flags & GENSUB) {
2733 /* return the result string */
2735 return make_str_node(buf, textlen, ALREADY_MALLOCED);
2738 /* return the original string */
2742 /* For a string literal, must not change the original string. */
2743 if (flags & LITERAL)
2745 else if (matches > 0) {
2747 *lhs = make_str_node(buf, textlen, ALREADY_MALLOCED);
2750 return make_number((AWKNUM) matches);
2754 /* make_integer - Convert an integer to a number node. */
2757 make_integer(uintmax_t n)
2761 return make_number((AWKNUM) n); /* stored as AWKNUM, so the value goes through an integer-to-floating conversion */
2764 /*
 * do_lshift --- perform a << operation
 *
 * Both operands are converted to uintmax_t and the result is
 * uval << ushift, returned through make_integer().  The lintwarn()
 * calls are diagnostics only; none of them alters val or shift.
 */
2767 do_lshift(int nargs)
2770 uintmax_t uval, ushift, res;
2773 POP_TWO_SCALARS(s1, s2);
2775 if ((s1->flags & (NUMCUR|NUMBER)) == 0)
2776 lintwarn(_("lshift: received non-numeric first argument"));
2777 if ((s2->flags & (NUMCUR|NUMBER)) == 0)
2778 lintwarn(_("lshift: received non-numeric second argument"));
2780 val = force_number(s1);
2781 shift = force_number(s2);
2783 if (val < 0 || shift < 0)
2784 lintwarn(_("lshift(%lf, %lf): negative values will give strange results"), val, shift);
2785 if (double_to_int(val) != val || double_to_int(shift) != shift)
2786 lintwarn(_("lshift(%lf, %lf): fractional values will be truncated"), val, shift);
2787 if (shift >= sizeof(uintmax_t) * CHAR_BIT) /* shift count at least as wide as the type */
2788 lintwarn(_("lshift(%lf, %lf): too large shift value will give strange results"), val, shift);
2794 uval = (uintmax_t) val; /* NOTE(review): converting a negative floating value to an unsigned type is undefined in C */
2795 ushift = (uintmax_t) shift;
2797 res = uval << ushift; /* NOTE(review): no visible clamp; a count >= the width of uintmax_t is undefined behaviour in C */
2798 return make_integer(res);
2801 /*
 * do_rshift --- perform a >> operation
 *
 * Both operands are converted to uintmax_t and the result is
 * uval >> ushift, returned through make_integer().  The lintwarn()
 * calls are diagnostics only; none of them alters val or shift.
 */
2804 do_rshift(int nargs)
2807 uintmax_t uval, ushift, res;
2810 POP_TWO_SCALARS(s1, s2);
2812 if ((s1->flags & (NUMCUR|NUMBER)) == 0)
2813 lintwarn(_("rshift: received non-numeric first argument"));
2814 if ((s2->flags & (NUMCUR|NUMBER)) == 0)
2815 lintwarn(_("rshift: received non-numeric second argument"));
2817 val = force_number(s1);
2818 shift = force_number(s2);
2820 if (val < 0 || shift < 0)
2821 lintwarn(_("rshift(%lf, %lf): negative values will give strange results"), val, shift);
2822 if (double_to_int(val) != val || double_to_int(shift) != shift)
2823 lintwarn(_("rshift(%lf, %lf): fractional values will be truncated"), val, shift);
2824 if (shift >= sizeof(uintmax_t) * CHAR_BIT) /* shift count at least as wide as the type */
2825 lintwarn(_("rshift(%lf, %lf): too large shift value will give strange results"), val, shift);
2831 uval = (uintmax_t) val; /* NOTE(review): converting a negative floating value to an unsigned type is undefined in C */
2832 ushift = (uintmax_t) shift;
2834 res = uval >> ushift; /* NOTE(review): no visible clamp; a count >= the width of uintmax_t is undefined behaviour in C */
2835 return make_integer(res);
2838 /*
 * do_and --- perform an & operation
 *
 * Both operands are converted to uintmax_t and the bitwise AND is
 * returned through make_integer().
 */
2844 uintmax_t uleft, uright, res;
2847 POP_TWO_SCALARS(s1, s2);
2849 if ((s1->flags & (NUMCUR|NUMBER)) == 0)
2850 lintwarn(_("and: received non-numeric first argument"));
2851 if ((s2->flags & (NUMCUR|NUMBER)) == 0)
2852 lintwarn(_("and: received non-numeric second argument"));
2854 left = force_number(s1); /* converted only after the flag checks above */
2855 right = force_number(s2);
2857 if (left < 0 || right < 0)
2858 lintwarn(_("and(%lf, %lf): negative values will give strange results"), left, right);
2859 if (double_to_int(left) != left || double_to_int(right) != right)
2860 lintwarn(_("and(%lf, %lf): fractional values will be truncated"), left, right);
2866 uleft = (uintmax_t) left; /* fractional part is discarded by the conversion */
2867 uright = (uintmax_t) right;
2869 res = uleft & uright;
2870 return make_integer(res);
2873 /*
 * do_or --- perform an | operation
 *
 * Both operands are converted to uintmax_t and the bitwise OR is
 * returned through make_integer().
 */
2879 uintmax_t uleft, uright, res;
2882 POP_TWO_SCALARS(s1, s2);
2884 if ((s1->flags & (NUMCUR|NUMBER)) == 0)
2885 lintwarn(_("or: received non-numeric first argument"));
2886 if ((s2->flags & (NUMCUR|NUMBER)) == 0)
2887 lintwarn(_("or: received non-numeric second argument"));
2889 left = force_number(s1); /* converted only after the flag checks above */
2890 right = force_number(s2);
2892 if (left < 0 || right < 0)
2893 lintwarn(_("or(%lf, %lf): negative values will give strange results"), left, right);
2894 if (double_to_int(left) != left || double_to_int(right) != right)
2895 lintwarn(_("or(%lf, %lf): fractional values will be truncated"), left, right);
2901 uleft = (uintmax_t) left; /* fractional part is discarded by the conversion */
2902 uright = (uintmax_t) right;
2904 res = uleft | uright;
2905 return make_integer(res);
2908 /*
 * do_xor --- perform an ^ operation
 *
 * Both operands are converted to uintmax_t and the bitwise XOR is
 * returned through make_integer().  The operands are converted once,
 * after the non-numeric checks, the same order do_and and do_or use;
 * an earlier, duplicate pair of force_number() calls has been dropped.
 */
2914 uintmax_t uleft, uright, res;
2917 POP_TWO_SCALARS(s1, s2);
2922 if ((s1->flags & (NUMCUR|NUMBER)) == 0)
2923 lintwarn(_("xor: received non-numeric first argument"));
2924 if ((s2->flags & (NUMCUR|NUMBER)) == 0)
2925 lintwarn(_("xor: received non-numeric second argument"));
2927 left = force_number(s1); /* converted only after the flag checks above */
2928 right = force_number(s2);
2930 if (left < 0 || right < 0)
2931 lintwarn(_("xor(%lf, %lf): negative values will give strange results"), left, right);
2932 if (double_to_int(left) != left || double_to_int(right) != right)
2933 lintwarn(_("xor(%lf, %lf): fractional values will be truncated"), left, right);
2939 uleft = (uintmax_t) left; /* fractional part is discarded by the conversion */
2940 uright = (uintmax_t) right;
2942 res = uleft ^ uright;
2943 return make_integer(res);
2946 /*
 * do_compl --- perform a ~ operation
 *
 * The argument's numeric value is converted to uintmax_t and the result
 * is returned through make_integer().  The non-numeric argument check is
 * made once, before the conversion; a second copy of the same check that
 * followed force_number() has been dropped.
 */
2956 if (do_lint && (tmp->flags & (NUMCUR|NUMBER)) == 0)
2957 lintwarn(_("compl: received non-numeric argument"));
2958 d = force_number(tmp);
2965 lintwarn(_("compl(%lf): negative value will give strange results"), d);
2966 if (double_to_int(d) != d)
2967 lintwarn(_("compl(%lf): fractional value will be truncated"), d);
2970 uval = (uintmax_t) d; /* fractional part is discarded by the conversion */
2972 return make_integer(uval);
2975 /*
 * do_strtonum --- the strtonum function
 *
 * A value already flagged numeric is returned as its number.  A string
 * that isnondecimal() accepts goes through nondec2awknum(); anything
 * else takes the ordinary force_number() conversion.
 */
2978 do_strtonum(int nargs)
2984 if ((tmp->flags & (NUMBER|NUMCUR)) != 0) /* already has a numeric value */
2985 d = (AWKNUM) force_number(tmp);
2986 else if (isnondecimal(tmp->stptr, use_lc_numeric)) /* handled by the non-decimal converter */
2987 d = nondec2awknum(tmp->stptr, tmp->stlen);
2989 d = (AWKNUM) force_number(tmp); /* ordinary conversion */
2992 return make_number((AWKNUM) d);
2995 /* nondec2awknum --- convert octal or hex value to double */
2998 * Because of awk's concatenation rules and the way awk.y:yylex()
2999 * collects a number, this routine has to be willing to stop on the
3000 * first invalid character.
3004 nondec2awknum(char *str, size_t len)
3006 AWKNUM retval = 0.0;
3011 if (*str == '0' && (str[1] == 'x' || str[1] == 'X')) { /* hexadecimal: "0x" or "0X" prefix */
3013 * User called strtonum("0x") or some such,
3014 * so just quit early.
3017 return (AWKNUM) 0.0;
3019 for (str += 2, len -= 2; len > 0; len--, str++) { /* skip the two-character prefix, then one digit per step */
3039 val = *str - 'a' + 10; /* lower-case hex letter */
3047 val = *str - 'A' + 10; /* upper-case hex letter */
3052 retval = (retval * 16) + val; /* accumulate base 16 */
3054 } else if (*str == '0') { /* leading zero without x: octal */
3055 for (; len > 0; len--) {
3056 if (! isdigit((unsigned char) *str)) /* first non-digit ends the number */
3058 else if (*str == '8' || *str == '9') { /* not an octal digit after all */
3062 retval = (retval * 8) + (*str - '0'); /* accumulate base 8 */
3068 retval = strtod(str, NULL); /* decimal text: leave it to strtod() */
3075 /* do_dcgettext, do_dcngettext --- handle i18n translations */
3077 #if ENABLE_NLS && defined(LC_MESSAGES) && HAVE_DCGETTEXT
3080 localecategory_from_argument(NODE *t)
/*
 * Map the category name in t->stptr (for example "LC_MESSAGES") to the
 * matching LC_* constant.  An unknown name is fatal.
 */
3082 static const struct category_table { /* entries must stay in strcmp() order by name: the lookup is a binary search */
3087 { LC_ALL, "LC_ALL" },
3090 { LC_COLLATE, "LC_COLLATE" },
3091 #endif /* LC_COLLATE */
3093 { LC_CTYPE, "LC_CTYPE" },
3094 #endif /* LC_CTYPE */
3096 { LC_MESSAGES, "LC_MESSAGES" },
3097 #endif /* LC_MESSAGES */
3099 { LC_MONETARY, "LC_MONETARY" },
3100 #endif /* LC_MONETARY */
3102 { LC_NUMERIC, "LC_NUMERIC" },
3103 #endif /* LC_NUMERIC */
3105 { LC_RESPONSE, "LC_RESPONSE" },
3106 #endif /* LC_RESPONSE */
3108 { LC_TIME, "LC_TIME" },
3109 #endif /* LC_TIME */
3113 int low, high, i, mid;
3117 category = t->stptr;
3119 /* binary search the table */
3121 high = (sizeof(cat_tab) / sizeof(cat_tab[0])) - 1; /* index of the last table entry */
3122 while (low <= high) {
3123 mid = (low + high) / 2;
3124 i = strcmp(category, cat_tab[mid].name); /* exact, case-sensitive comparison */
3126 if (i < 0) /* category < mid */
3128 else if (i > 0) /* category > mid */
3131 lc_cat = cat_tab[mid].val; /* found */
3135 if (lc_cat == -1) /* not there */
3136 fatal(_("dcgettext: `%s' is not a valid locale category"), category);
3148 * str = dcgettext(string [, domain [, category]])
3149 * str = dcngettext(string1, string2, number [, domain [, category]])
3151 * Default domain is TEXTDOMAIN, default category is LC_MESSAGES.
3155 do_dcgettext(int nargs)
/*
 * awk-level dcgettext(string [, domain [, category]]).  When the NLS
 * preprocessor condition below holds, the result comes from the C
 * dcgettext(); the other visible assignment hands `string' back unchanged.
 */
3157 NODE *tmp, *t1, *t2 = NULL;
3160 #if ENABLE_NLS && defined(LC_MESSAGES) && HAVE_DCGETTEXT
3164 if (nargs == 3) { /* third argument */
3166 lc_cat = localecategory_from_argument(tmp); /* category given by name */
3169 lc_cat = LC_MESSAGES; /* default category */
3171 if (nargs >= 2) { /* second argument */
3175 domain = TEXTDOMAIN; /* default domain */
3187 t1 = POP_STRING(); /* first argument */
3190 #if ENABLE_NLS && defined(LC_MESSAGES) && HAVE_DCGETTEXT
3191 the_result = dcgettext(domain, string, lc_cat);
3195 the_result = string; /* untranslated text */
3198 return make_string(the_result, strlen(the_result)); /* result is copied into a new string node */
3203 do_dcngettext(int nargs)
/*
 * awk-level dcngettext(string1, string2, number [, domain [, category]]).
 * When the NLS preprocessor condition below holds, the result comes from
 * the C dcngettext(); the other visible assignment picks string1 when
 * number is 1 and string2 for every other value.
 */
3205 NODE *tmp, *t1, *t2, *t3;
3206 char *string1, *string2;
3207 unsigned long number;
3211 #if ENABLE_NLS && defined(LC_MESSAGES) && HAVE_DCGETTEXT
3215 if (nargs == 5) { /* fifth argument */
3217 lc_cat = localecategory_from_argument(tmp); /* category given by name */
3220 lc_cat = LC_MESSAGES; /* default category */
3223 if (nargs >= 4) { /* fourth argument */
3227 domain = TEXTDOMAIN; /* default domain */
3239 POP_NUMBER(d); /* third argument */
3240 number = (unsigned long) double_to_int(d); /* integer part of d, then converted to unsigned long */
3241 t2 = POP_STRING(); /* second argument */
3242 string2 = t2->stptr;
3243 t1 = POP_STRING(); /* first argument */
3244 string1 = t1->stptr;
3246 #if ENABLE_NLS && defined(LC_MESSAGES) && HAVE_DCGETTEXT
3248 the_result = dcngettext(domain, string1, string2, number, lc_cat);
3252 the_result = (number == 1 ? string1 : string2); /* plain singular/plural choice */
3256 return make_string(the_result, strlen(the_result)); /* result is copied into a new string node */
3259 /* do_bindtextdomain --- set the directory for a text domain */
3264 * binding = bindtextdomain(dir [, domain])
3266 * If dir is "", pass NULL to C version.
3267 * Default domain is TEXTDOMAIN.
3271 do_bindtextdomain(int nargs)
/*
 * awk-level bindtextdomain(dir [, domain]): calls the C bindtextdomain()
 * and returns its result as a new string node.
 */
3274 const char *directory, *domain;
3275 const char *the_result;
3280 domain = TEXTDOMAIN; /* default, replaced below when a second argument is given */
3282 if (nargs == 2) { /* second argument */
3284 domain = (const char *) t2->stptr;
3287 /* first argument */
3290 directory = (const char *) t1->stptr;
3292 the_result = bindtextdomain(domain, directory); /* note the order: the C function takes the domain first */
3298 return make_string(the_result, strlen(the_result));
3302 /* mbc_byte_count --- return number of bytes for corresponding numchars multibyte characters */
3305 mbc_byte_count(const char *ptr, size_t numchars)
3308 mbstate_t cur_state;
3312 memset(& cur_state, 0, sizeof(cur_state)); /* start from the initial conversion state */
3314 assert(gawk_mb_cur_max > 1); /* only meaningful in a multibyte locale */
3315 mb_len = mbrlen(ptr, numchars * gawk_mb_cur_max, &cur_state); /* probe the first character; the limit is the most bytes numchars characters could need */
3317 return numchars; /* no valid m.b. char */
3319 for (; numchars > 0; numchars--) { /* one character per iteration */
3320 mb_len = mbrlen(ptr, numchars * gawk_mb_cur_max, &cur_state); /* byte length of the character at ptr */
3333 /* mbc_char_count --- return number of m.b. chars in string, up to numbytes bytes */
3336 mbc_char_count(const char *ptr, size_t numbytes)
3339 mbstate_t cur_state;
3343 if (gawk_mb_cur_max == 1)
3346 memset(& cur_state, 0, sizeof(cur_state));
3348 mb_len = mbrlen(ptr, numbytes * gawk_mb_cur_max, &cur_state);
3350 return numbytes; /* no valid m.b. char */
3352 for (; numbytes > 0; numbytes--) {
3353 mb_len = mbrlen(ptr, numbytes * gawk_mb_cur_max, &cur_state);