* Licensed under GPLv2 or later, see file LICENSE in this source tree.
*/
+//config:config AWK
+//config: bool "awk"
+//config: default y
+//config: help
+//config: Awk is used as a pattern scanning and processing language. This is
+//config: the BusyBox implementation of that programming language.
+//config:
+//config:config FEATURE_AWK_LIBM
+//config: bool "Enable math functions (requires libm)"
+//config: default y
+//config: depends on AWK
+//config: help
+//config: Enable math functions of the Awk programming language.
+//config: NOTE: This will require libm to be present for linking.
+//config:
+//config:config FEATURE_AWK_GNU_EXTENSIONS
+//config: bool "Enable a few GNU extensions"
+//config: default y
+//config: depends on AWK
+//config: help
+//config: Enable a few features from gawk:
+//config: * command line option -e AWK_PROGRAM
+//config: * simultaneous use of -f and -e on the command line.
+//config: This enables the use of awk library files.
+//config: Ex: awk -f mylib.awk -e '{print myfunction($1);}' ...
+
+//applet:IF_AWK(APPLET_NOEXEC(awk, awk, BB_DIR_USR_BIN, BB_SUID_DROP, awk))
+
+//kbuild:lib-$(CONFIG_AWK) += awk.o
+
//usage:#define awk_trivial_usage
//usage: "[OPTIONS] [AWK_PROGRAM] [FILE]..."
//usage:#define awk_full_usage "\n\n"
//usage: " -v VAR=VAL Set variable"
//usage: "\n -F SEP Use SEP as field separator"
//usage: "\n -f FILE Read program from FILE"
+//usage: IF_FEATURE_AWK_GNU_EXTENSIONS(
+//usage: "\n -e AWK_PROGRAM"
+//usage: )
#include "libbb.h"
#include "xregex.h"
* to perform debug printfs to stderr: */
#define debug_printf_walker(...) do {} while (0)
#define debug_printf_eval(...) do {} while (0)
+#define debug_printf_parse(...) do {} while (0)
#ifndef debug_printf_walker
# define debug_printf_walker(...) (fprintf(stderr, __VA_ARGS__))
#ifndef debug_printf_eval
# define debug_printf_eval(...) (fprintf(stderr, __VA_ARGS__))
#endif
+#ifndef debug_printf_parse
+# define debug_printf_parse(...) (fprintf(stderr, __VA_ARGS__))
+#endif
+#define OPTSTR_AWK \
+ "F:v:f:" \
+ IF_FEATURE_AWK_GNU_EXTENSIONS("e:") \
+ "W:"
+#define OPTCOMPLSTR_AWK \
+ "v::f::" \
+ IF_FEATURE_AWK_GNU_EXTENSIONS("e::")
+enum {
+ OPTBIT_F, /* define field separator */
+ OPTBIT_v, /* define variable */
+ OPTBIT_f, /* pull in awk program from file */
+ IF_FEATURE_AWK_GNU_EXTENSIONS(OPTBIT_e,) /* -e AWK_PROGRAM */
+ OPTBIT_W, /* -W ignored */
+ OPT_F = 1 << OPTBIT_F,
+ OPT_v = 1 << OPTBIT_v,
+ OPT_f = 1 << OPTBIT_f,
+ OPT_e = IF_FEATURE_AWK_GNU_EXTENSIONS((1 << OPTBIT_e)) + 0,
+ OPT_W = 1 << OPTBIT_W
+};
#define MAXVARFMT 240
#define MINNVBLOCK 64
/* simple token classes */
/* Order and hex values are very important!!! See next_token() */
-#define TC_SEQSTART 1 /* ( */
+#define TC_SEQSTART 1 /* ( */
#define TC_SEQTERM (1 << 1) /* ) */
#define TC_REGEXP (1 << 2) /* /.../ */
#define TC_OUTRDR (1 << 3) /* | > >> */
/* combined token classes */
#define TC_BINOP (TC_BINOPX | TC_COMMA | TC_PIPE | TC_IN)
-#define TC_UNARYOP (TC_UOPPRE | TC_UOPPOST)
+//#define TC_UNARYOP (TC_UOPPRE | TC_UOPPOST)
#define TC_OPERAND (TC_VARIABLE | TC_ARRAY | TC_FUNCTION \
| TC_BUILTIN | TC_GETLINE | TC_SEQSTART | TC_STRING | TC_NUMBER)
* For builtins it has different meaning: n n s3 s2 s1 v3 v2 v1,
* n - min. number of args, vN - resolve Nth arg to var, sN - resolve to string
*/
+#undef P
+#undef PRIMASK
+#undef PRIMASK2
#define P(x) (x << 24)
#define PRIMASK 0x7F000000
#define PRIMASK2 0x7E000000
smallint nextrec;
smallint nextfile;
smallint is_f0_split;
+ smallint t_rollback;
};
struct globals2 {
uint32_t t_info; /* often used */
uint32_t t_tclass;
char *t_string;
int t_lineno;
- int t_rollback;
var *intvar[NUM_INTERNAL_VARS]; /* often used */
#define nextrec (G1.nextrec )
#define nextfile (G1.nextfile )
#define is_f0_split (G1.is_f0_split )
+#define t_rollback (G1.t_rollback )
#define t_info (G.t_info )
#define t_tclass (G.t_tclass )
#define t_string (G.t_string )
#define t_lineno (G.t_lineno )
-#define t_rollback (G.t_rollback )
#define intvar (G.intvar )
#define fsplitter (G.fsplitter )
#define rsplitter (G.rsplitter )
pps = *s;
if (c == '\\')
c = bb_process_escape_sequence((const char**)s);
+ /* Example awk statement:
+ * s = "abc\"def"
+ * we must treat \" as "
+ */
if (c == '\\' && *s == pps) { /* unrecognized \z? */
c = *(*s); /* yes, fetch z */
if (c)
return c;
}
+/* TODO: merge with strcpy_and_process_escape_sequences()?
+ */
+static void unescape_string_in_place(char *s1)
+{
+ char *s = s1;
+ while ((*s1 = nextchar(&s)) != '\0')
+ s1++;
+}
+
static ALWAYS_INLINE int isalnum_(int c)
{
return (isalnum(c) || c == '_');
if (*p == '\0') {
tc = TC_EOF;
+ debug_printf_parse("%s: token found: TC_EOF\n", __func__);
} else if (*p == '\"') {
/* it's a string */
p++;
*s = '\0';
tc = TC_STRING;
+ debug_printf_parse("%s: token found:'%s' TC_STRING\n", __func__, t_string);
} else if ((expected & TC_REGEXP) && *p == '/') {
/* it's regexp */
p++;
*s = '\0';
tc = TC_REGEXP;
+ debug_printf_parse("%s: token found:'%s' TC_REGEXP\n", __func__, t_string);
} else if (*p == '.' || isdigit(*p)) {
/* it's a number */
if (*p == '.')
syntax_error(EMSG_UNEXP_TOKEN);
tc = TC_NUMBER;
+ debug_printf_parse("%s: token found:%f TC_NUMBER\n", __func__, t_double);
} else {
/* search for something known */
) {
/* then this is what we are looking for */
t_info = *ti;
+ debug_printf_parse("%s: token found:'%.*s' t_info:%x\n", __func__, l, p, t_info);
p += l;
goto token_found;
}
p = skip_spaces(p);
if (*p == '(') {
tc = TC_FUNCTION;
+ debug_printf_parse("%s: token found:'%s' TC_FUNCTION\n", __func__, t_string);
} else {
if (*p == '[') {
p++;
tc = TC_ARRAY;
- }
+ debug_printf_parse("%s: token found:'%s' TC_ARRAY\n", __func__, t_string);
+ } else
+ debug_printf_parse("%s: token found:'%s' TC_VARIABLE\n", __func__, t_string);
}
- token_found: ;
}
+ token_found:
g_pos = p;
/* skipping newlines in some cases */
uint32_t tc, xtc;
var *v;
+ debug_printf_parse("%s(%x)\n", __func__, iexp);
+
sn.info = PRIMASK;
sn.r.n = glptr = NULL;
xtc = TC_OPERAND | TC_UOPPRE | TC_REGEXP | iexp;
if (glptr && (t_info == (OC_COMPARE | VV | P(39) | 2))) {
/* input redirection (<) attached to glptr node */
+ debug_printf_parse("%s: input redir\n", __func__);
cn = glptr->l.n = new_node(OC_CONCAT | SS | P(37));
cn->a.n = glptr;
xtc = TC_OPERAND | TC_UOPPRE;
glptr = NULL;
} else if (tc & (TC_BINOP | TC_UOPPOST)) {
+ debug_printf_parse("%s: TC_BINOP | TC_UOPPOST\n", __func__);
/* for binary and postfix-unary operators, jump back over
* previous operators with higher priority */
vn = cn;
vn->a.n = cn;
} else {
+ debug_printf_parse("%s: other\n", __func__);
/* for operands and prefix-unary operators, attach them
* to last node */
vn = cn;
cn->a.n = vn;
xtc = TC_OPERAND | TC_UOPPRE | TC_REGEXP;
if (tc & (TC_OPERAND | TC_REGEXP)) {
+ debug_printf_parse("%s: TC_OPERAND | TC_REGEXP\n", __func__);
xtc = TC_UOPPRE | TC_UOPPOST | TC_BINOP | TC_OPERAND | iexp;
/* one should be very careful with switch on tclass -
* only simple tclasses should be used! */
switch (tc) {
case TC_VARIABLE:
case TC_ARRAY:
+ debug_printf_parse("%s: TC_VARIABLE | TC_ARRAY\n", __func__);
cn->info = OC_VAR;
v = hash_search(ahash, t_string);
if (v != NULL) {
case TC_NUMBER:
case TC_STRING:
+ debug_printf_parse("%s: TC_NUMBER | TC_STRING\n", __func__);
cn->info = OC_VAR;
v = cn->l.v = xzalloc(sizeof(var));
if (tc & TC_NUMBER)
break;
case TC_REGEXP:
+ debug_printf_parse("%s: TC_REGEXP\n", __func__);
mk_re_node(t_string, cn, xzalloc(sizeof(regex_t)*2));
break;
case TC_FUNCTION:
+ debug_printf_parse("%s: TC_FUNCTION\n", __func__);
cn->info = OC_FUNC;
cn->r.f = newfunc(t_string);
cn->l.n = condition();
break;
case TC_SEQSTART:
+ debug_printf_parse("%s: TC_SEQSTART\n", __func__);
cn = vn->r.n = parse_expr(TC_SEQTERM);
+ if (!cn)
+ syntax_error("Empty sequence");
cn->a.n = vn;
break;
case TC_GETLINE:
+ debug_printf_parse("%s: TC_GETLINE\n", __func__);
glptr = cn;
xtc = TC_OPERAND | TC_UOPPRE | TC_BINOP | iexp;
break;
case TC_BUILTIN:
+ debug_printf_parse("%s: TC_BUILTIN\n", __func__);
cn->l.n = condition();
break;
}
}
}
}
+
+ debug_printf_parse("%s() returns %p\n", __func__, sn.r.n);
return sn.r.n;
}
} while (c & TC_NEWLINE);
if (c & TC_GRPSTART) {
+ debug_printf_parse("%s: TC_GRPSTART\n", __func__);
while (next_token(TC_GRPSEQ | TC_GRPTERM) != TC_GRPTERM) {
+ debug_printf_parse("%s: !TC_GRPTERM\n", __func__);
if (t_tclass & TC_NEWLINE)
continue;
rollback_token();
chain_group();
}
+ debug_printf_parse("%s: TC_GRPTERM\n", __func__);
} else if (c & (TC_OPSEQ | TC_OPTERM)) {
+ debug_printf_parse("%s: TC_OPSEQ | TC_OPTERM\n", __func__);
rollback_token();
chain_expr(OC_EXEC | Vx);
- } else { /* TC_STATEMNT */
+ } else {
+ /* TC_STATEMNT */
+ debug_printf_parse("%s: TC_STATEMNT(?)\n", __func__);
switch (t_info & OPCLSMASK) {
case ST_IF:
+ debug_printf_parse("%s: ST_IF\n", __func__);
n = chain_node(OC_BR | Vx);
n->l.n = condition();
chain_group();
break;
case ST_WHILE:
+ debug_printf_parse("%s: ST_WHILE\n", __func__);
n2 = condition();
n = chain_loop(NULL);
n->l.n = n2;
break;
case ST_DO:
+ debug_printf_parse("%s: ST_DO\n", __func__);
n2 = chain_node(OC_EXEC);
n = chain_loop(NULL);
n2->a.n = n->a.n;
break;
case ST_FOR:
+ debug_printf_parse("%s: ST_FOR\n", __func__);
next_token(TC_SEQSTART);
n2 = parse_expr(TC_SEMICOL | TC_SEQTERM);
if (t_tclass & TC_SEQTERM) { /* for-in */
case OC_PRINT:
case OC_PRINTF:
+ debug_printf_parse("%s: OC_PRINT[F]\n", __func__);
n = chain_node(t_info);
n->l.n = parse_expr(TC_OPTERM | TC_OUTRDR | TC_GRPTERM);
if (t_tclass & TC_OUTRDR) {
break;
case OC_BREAK:
+ debug_printf_parse("%s: OC_BREAK\n", __func__);
n = chain_node(OC_EXEC);
n->a.n = break_ptr;
break;
case OC_CONTINUE:
+ debug_printf_parse("%s: OC_CONTINUE\n", __func__);
n = chain_node(OC_EXEC);
n->a.n = continue_ptr;
break;
/* delete, next, nextfile, return, exit */
default:
+ debug_printf_parse("%s: default\n", __func__);
chain_expr(t_info);
}
}
while ((tclass = next_token(TC_EOF | TC_OPSEQ | TC_GRPSTART |
TC_OPTERM | TC_BEGIN | TC_END | TC_FUNCDECL)) != TC_EOF) {
- if (tclass & TC_OPTERM)
+ if (tclass & TC_OPTERM) {
+ debug_printf_parse("%s: TC_OPTERM\n", __func__);
continue;
+ }
seq = &mainseq;
if (tclass & TC_BEGIN) {
+ debug_printf_parse("%s: TC_BEGIN\n", __func__);
seq = &beginseq;
chain_group();
} else if (tclass & TC_END) {
+ debug_printf_parse("%s: TC_END\n", __func__);
seq = &endseq;
chain_group();
} else if (tclass & TC_FUNCDECL) {
+ debug_printf_parse("%s: TC_FUNCDECL\n", __func__);
next_token(TC_FUNCTION);
g_pos++;
f = newfunc(t_string);
clear_array(ahash);
} else if (tclass & TC_OPSEQ) {
+ debug_printf_parse("%s: TC_OPSEQ\n", __func__);
rollback_token();
cn = chain_node(OC_TEST);
cn->l.n = parse_expr(TC_OPTERM | TC_EOF | TC_GRPSTART);
if (t_tclass & TC_GRPSTART) {
+ debug_printf_parse("%s: TC_GRPSTART\n", __func__);
rollback_token();
chain_group();
} else {
+ debug_printf_parse("%s: !TC_GRPSTART\n", __func__);
chain_node(OC_PRINT);
}
cn->r.n = mainseq.last;
} else /* if (tclass & TC_GRPSTART) */ {
+ debug_printf_parse("%s: TC_GRPSTART(?)\n", __func__);
rollback_token();
chain_group();
}
}
+ debug_printf_parse("%s: TC_EOF\n", __func__);
}
is_f0_split = FALSE;
} else if (v == intvar[FS]) {
+ /*
+ * The POSIX-2008 standard says that changing FS should have no effect on the
+ * current input line, but only on the next one. The language is:
+ *
+ * > Before the first reference to a field in the record is evaluated, the record
+ * > shall be split into fields, according to the rules in Regular Expressions,
+ * > using the value of FS that was current at the time the record was read.
+ *
+ * So, split up current line before assignment to FS:
+ */
+ split_f0();
+
mk_splitter(getvar_s(v), &fsplitter);
} else if (v == intvar[RS]) {
char c;
const char *s = format;
- if (int_as_int && n == (int)n) {
- r = snprintf(b, size, "%d", (int)n);
+ if (int_as_int && n == (long long)n) {
+ r = snprintf(b, size, "%lld", (long long)n);
} else {
do { c = *s; } while (c && *++s);
if (strchr("diouxX", c)) {
var *vbeg, *v;
const char *sv_progname;
- if (!op->r.f->body.first)
+ /* The body might be empty, still has to eval the args */
+ if (!op->r.n->info && !op->r.f->body.first)
syntax_error(EMSG_UNDEF_FUNC);
vbeg = v = nvalloc(op->r.f->nargs + 1);
rsm = iF;
}
- if (!rsm->F) {
+ if (!rsm || !rsm->F) {
setvar_i(intvar[ERRNO], errno);
setvar_i(res, -1);
break;
switch (opn) {
case F_in:
- R_d = (int)L_d;
+ R_d = (long long)L_d;
break;
case F_rn:
break;
case F_le:
- if (!op1)
+ debug_printf_eval("length: L.s:'%s'\n", L.s);
+ if (!op1) {
L.s = getvar_s(intvar[F0]);
+ debug_printf_eval("length: L.s='%s'\n", L.s);
+ }
+ else if (L.v->type & VF_ARRAY) {
+ R_d = L.v->x.array->nel;
+ debug_printf_eval("length: array_len:%d\n", L.v->x.array->nel);
+ break;
+ }
R_d = strlen(L.s);
break;
case '%':
if (R_d == 0)
syntax_error(EMSG_DIV_BY_ZERO);
- L_d -= (int)(L_d / R_d) * R_d;
+ L_d -= (long long)(L_d / R_d) * R_d;
break;
}
debug_printf_eval("BINARY/REPLACE result:%f\n", L_d);
* otherwise return 0 */
static int is_assignment(const char *expr)
{
- char *exprc, *val, *s, *s1;
+ char *exprc, *val;
if (!isalnum_(*expr) || (val = strchr(expr, '=')) == NULL) {
return FALSE;
val = exprc + (val - expr);
*val++ = '\0';
- s = s1 = val;
- while ((*s1 = nextchar(&s)) != '\0')
- s1++;
-
+ unescape_string_in_place(val);
setvar_u(newvar(exprc), val);
free(exprc);
return TRUE;
#define rsm (G.next_input_file__rsm)
#define files_happen (G.next_input_file__files_happen)
- FILE *F = NULL;
+ FILE *F;
const char *fname, *ind;
if (rsm.F)
rsm.F = NULL;
rsm.pos = rsm.adv = 0;
- do {
+ for (;;) {
if (getvar_i(intvar[ARGIND])+1 >= getvar_i(intvar[ARGC])) {
if (files_happen)
return NULL;
fname = "-";
F = stdin;
- } else {
- ind = getvar_s(incvar(intvar[ARGIND]));
- fname = getvar_s(findvar(iamarray(intvar[ARGV]), ind));
- if (fname && *fname && !is_assignment(fname))
- F = xfopen_stdin(fname);
+ break;
+ }
+ ind = getvar_s(incvar(intvar[ARGIND]));
+ fname = getvar_s(findvar(iamarray(intvar[ARGV]), ind));
+ if (fname && *fname && !is_assignment(fname)) {
+ F = xfopen_stdin(fname);
+ break;
}
- } while (!F);
+ }
files_happen = TRUE;
setvar_s(intvar[FILENAME], fname);
int awk_main(int argc, char **argv)
{
unsigned opt;
- char *opt_F, *opt_W;
+ char *opt_F;
llist_t *list_v = NULL;
llist_t *list_f = NULL;
+#if ENABLE_FEATURE_AWK_GNU_EXTENSIONS
+ llist_t *list_e = NULL;
+#endif
int i, j;
var *v;
var tv;
*s1 = '=';
}
}
- opt_complementary = "v::f::"; /* -v and -f can occur multiple times */
- opt = getopt32(argv, "F:v:f:W:", &opt_F, &list_v, &list_f, &opt_W);
+ opt_complementary = OPTCOMPLSTR_AWK;
+ opt = getopt32(argv, OPTSTR_AWK, &opt_F, &list_v, &list_f, IF_FEATURE_AWK_GNU_EXTENSIONS(&list_e,) NULL);
argv += optind;
argc -= optind;
- if (opt & 0x1)
- setvar_s(intvar[FS], opt_F); // -F
- while (list_v) { /* -v */
+ if (opt & OPT_W)
+ bb_error_msg("warning: option -W is ignored");
+ if (opt & OPT_F) {
+ unescape_string_in_place(opt_F);
+ setvar_s(intvar[FS], opt_F);
+ }
+ while (list_v) {
if (!is_assignment(llist_pop(&list_v)))
bb_show_usage();
}
- if (list_f) { /* -f */
- do {
- char *s = NULL;
- FILE *from_file;
-
- g_progname = llist_pop(&list_f);
- from_file = xfopen_stdin(g_progname);
- /* one byte is reserved for some trick in next_token */
- for (i = j = 1; j > 0; i += j) {
- s = xrealloc(s, i + 4096);
- j = fread(s + i, 1, 4094, from_file);
- }
- s[i] = '\0';
- fclose(from_file);
- parse_program(s + 1);
- free(s);
- } while (list_f);
- argc++;
- } else { // no -f: take program from 1st parameter
- if (!argc)
+ while (list_f) {
+ char *s = NULL;
+ FILE *from_file;
+
+ g_progname = llist_pop(&list_f);
+ from_file = xfopen_stdin(g_progname);
+ /* one byte is reserved for some trick in next_token */
+ for (i = j = 1; j > 0; i += j) {
+ s = xrealloc(s, i + 4096);
+ j = fread(s + i, 1, 4094, from_file);
+ }
+ s[i] = '\0';
+ fclose(from_file);
+ parse_program(s + 1);
+ free(s);
+ }
+ g_progname = "cmd. line";
+#if ENABLE_FEATURE_AWK_GNU_EXTENSIONS
+ while (list_e) {
+ parse_program(llist_pop(&list_e));
+ }
+#endif
+ if (!(opt & (OPT_f | OPT_e))) {
+ if (!*argv)
bb_show_usage();
- g_progname = "cmd. line";
parse_program(*argv++);
+ argc--;
}
- if (opt & 0x8) // -W
- bb_error_msg("warning: unrecognized option '-W %s' ignored", opt_W);
/* fill in ARGV array */
- setvar_i(intvar[ARGC], argc);
+ setvar_i(intvar[ARGC], argc + 1);
setari_u(intvar[ARGV], 0, "awk");
i = 0;
while (*argv)