grep: option to use GNU regex matching instead of POSIX one.
authorDenis Vlasenko <vda.linux@googlemail.com>
Sat, 9 Aug 2008 16:15:14 +0000 (16:15 -0000)
committerDenis Vlasenko <vda.linux@googlemail.com>
Sat, 9 Aug 2008 16:15:14 +0000 (16:15 -0000)
 This fixes problems with NULs in files being scanned, but
 costs +800 bytes. The same can be done to sed (TODO).

Config.in
findutils/grep.c
libbb/get_line_from_file.c
libbb/xregcomp.c
testsuite/grep.tests

index 5ad35ce..c2005c7 100644 (file)
--- a/Config.in
+++ b/Config.in
@@ -21,6 +21,15 @@ config DESKTOP
          Select this only if you plan to use busybox on full-blown
          desktop machine with common Linux distro, not on an embedded box.
 
+config EXTRA_COMPAT
+       bool "Provide compatible behavior for rare corner cases (bigger code)"
+       default n
+       help
+         This option makes grep, sed etc handle rare corner cases
+         (embedded NUL bytes and such). This makes code bigger and uses
+         some GNU extensions in libc. You probably only need this option
+         if you plan to run busybox on desktop.
+
 config FEATURE_ASSUME_UNICODE
        bool "Assume that 1:1 char/glyph correspondence is not true"
        default n
index 030e624..f2ed01e 100644 (file)
@@ -96,6 +96,7 @@ struct globals {
        int lines_before;
        int lines_after;
        char **before_buf;
+       USE_EXTRA_COMPAT(size_t *before_buf_size;)
        int last_line_printed;
 #endif
        /* globals used internally */
@@ -117,6 +118,7 @@ struct globals {
 #define lines_before      (G.lines_before        )
 #define lines_after       (G.lines_after         )
 #define before_buf        (G.before_buf          )
+#define before_buf_size   (G.before_buf_size     )
 #define last_line_printed (G.last_line_printed   )
 #define pattern_head      (G.pattern_head        )
 #define cur_file          (G.cur_file            )
@@ -124,14 +126,24 @@ struct globals {
 
 typedef struct grep_list_data_t {
        char *pattern;
-       regex_t preg;
+/* for GNU regex, matched_range must be persistent across grep_file() calls */
+#if !ENABLE_EXTRA_COMPAT
+       regex_t compiled_regex;
+       regmatch_t matched_range;
+#else
+       struct re_pattern_buffer compiled_regex;
+       struct re_registers matched_range;
+#endif
 #define ALLOCATED 1
 #define COMPILED 2
        int flg_mem_alocated_compiled;
 } grep_list_data_t;
 
-
-static void print_line(const char *line, int linenum, char decoration)
+#if !ENABLE_EXTRA_COMPAT
+#define print_line(line, line_len, linenum, decoration) \
+       print_line(line, linenum, decoration)
+#endif
+static void print_line(const char *line, size_t line_len, int linenum, char decoration)
 {
 #if ENABLE_FEATURE_GREP_CONTEXT
        /* Happens when we go to next file, immediately hit match
@@ -139,8 +151,9 @@ static void print_line(const char *line, int linenum, char decoration)
        if (linenum < 1)
                return;
        /* possibly print the little '--' separator */
-       if ((lines_before || lines_after) && did_print_line &&
-                       last_line_printed != linenum - 1) {
+       if ((lines_before || lines_after) && did_print_line
+        && last_line_printed != linenum - 1
+       ) {
                puts("--");
        }
        /* guard against printing "--" before first line of first file */
@@ -152,17 +165,50 @@ static void print_line(const char *line, int linenum, char decoration)
        if (PRINT_LINE_NUM)
                printf("%i%c", linenum, decoration);
        /* Emulate weird GNU grep behavior with -ov */
-       if ((option_mask32 & (OPT_v|OPT_o)) != (OPT_v|OPT_o))
+       if ((option_mask32 & (OPT_v|OPT_o)) != (OPT_v|OPT_o)) {
+#if !ENABLE_EXTRA_COMPAT
                puts(line);
+#else
+               fwrite(line, 1, line_len, stdout);
+               putchar('\n');
+#endif
+       }
 }
 
-static int grep_file(FILE *file)
+#if ENABLE_EXTRA_COMPAT
+/* Unlike getline, this one removes trailing '\n' */
+static ssize_t FAST_FUNC bb_getline(char **line_ptr, size_t *line_alloc_len, FILE *file)
 {
+       ssize_t res_sz;
        char *line;
+
+       res_sz = getline(line_ptr, line_alloc_len, file);
+       line = *line_ptr;
+
+       if (res_sz > 0) {
+               if (line[res_sz - 1] == '\n')
+                       line[--res_sz] = '\0';
+       } else {
+               free(line); /* uclibc allocates a buffer even on EOF. WTF? */
+       }
+       return res_sz;
+}
+#endif
+
+static int grep_file(FILE *file)
+{
        smalluint found;
        int linenum = 0;
        int nmatches = 0;
-       regmatch_t regmatch;
+#if !ENABLE_EXTRA_COMPAT
+       char *line;
+#else
+       char *line = NULL;
+       ssize_t line_len;
+       size_t line_alloc_len;
+#define rm_so start[0]
+#define rm_eo end[0]
+#endif
 #if ENABLE_FEATURE_GREP_CONTEXT
        int print_n_lines_after = 0;
        int curpos = 0; /* track where we are in the circular 'before' buffer */
@@ -171,7 +217,13 @@ static int grep_file(FILE *file)
        enum { print_n_lines_after = 0 };
 #endif /* ENABLE_FEATURE_GREP_CONTEXT */
 
-       while ((line = xmalloc_fgetline(file)) != NULL) {
+       while (
+#if !ENABLE_EXTRA_COMPAT
+               (line = xmalloc_fgetline(file)) != NULL
+#else
+               (line_len = bb_getline(&line, &line_alloc_len, file)) >= 0
+#endif
+       ) {
                llist_t *pattern_ptr = pattern_head;
                grep_list_data_t *gl = gl; /* for gcc */
 
@@ -184,19 +236,35 @@ static int grep_file(FILE *file)
                        } else {
                                if (!(gl->flg_mem_alocated_compiled & COMPILED)) {
                                        gl->flg_mem_alocated_compiled |= COMPILED;
-                                       xregcomp(&(gl->preg), gl->pattern, reflags);
+#if !ENABLE_EXTRA_COMPAT
+                                       xregcomp(&gl->compiled_regex, gl->pattern, reflags);
+#else
+                                       memset(&gl->compiled_regex, 0, sizeof(gl->compiled_regex));
+                                       if (re_compile_pattern(gl->pattern, strlen(gl->pattern), &gl->compiled_regex))
+                                               bb_error_msg_and_die("bad regex '%s'", gl->pattern);
+#endif
                                }
-                               regmatch.rm_so = 0;
-                               regmatch.rm_eo = 0;
-                               if (regexec(&(gl->preg), line, 1, &regmatch, 0) == 0) {
+#if !ENABLE_EXTRA_COMPAT
+                               gl->matched_range.rm_so = 0;
+                               gl->matched_range.rm_eo = 0;
+#endif
+                               if (
+#if !ENABLE_EXTRA_COMPAT
+                                       regexec(&gl->compiled_regex, line, 1, &gl->matched_range, 0) == 0
+#else
+                                       re_search(&gl->compiled_regex, line, line_len,
+                                                       /*start:*/ 0, /*range:*/ line_len,
+                                                       &gl->matched_range) >= 0
+#endif
+                               ) {
                                        if (!(option_mask32 & OPT_w))
                                                found = 1;
                                        else {
                                                char c = ' ';
-                                               if (regmatch.rm_so)
-                                                       c = line[regmatch.rm_so - 1];
+                                               if (gl->matched_range.rm_so)
+                                                       c = line[gl->matched_range.rm_so - 1];
                                                if (!isalnum(c) && c != '_') {
-                                                       c = line[regmatch.rm_eo];
+                                                       c = line[gl->matched_range.rm_eo];
                                                        if (!c || (!isalnum(c) && c != '_'))
                                                                found = 1;
                                                }
@@ -261,7 +329,7 @@ static int grep_file(FILE *file)
 
                                        /* now print each line in the buffer, clearing them as we go */
                                        while (before_buf[idx] != NULL) {
-                                               print_line(before_buf[idx], first_buf_entry_line_num, '-');
+                                               print_line(before_buf[idx], before_buf_size[idx], first_buf_entry_line_num, '-');
                                                free(before_buf[idx]);
                                                before_buf[idx] = NULL;
                                                idx = (idx + 1) % lines_before;
@@ -277,13 +345,15 @@ static int grep_file(FILE *file)
                                                /* -Fo just prints the pattern
                                                 * (unless -v: -Fov doesnt print anything at all) */
                                                if (found)
-                                                       print_line(gl->pattern, linenum, ':');
+                                                       print_line(gl->pattern, strlen(gl->pattern), linenum, ':');
                                        } else {
-                                               line[regmatch.rm_eo] = '\0';
-                                               print_line(line + regmatch.rm_so, linenum, ':');
+                                               line[gl->matched_range.rm_eo] = '\0';
+                                               print_line(line + gl->matched_range.rm_so,
+                                                               gl->matched_range.rm_eo - gl->matched_range.rm_so,
+                                                               linenum, ':');
                                        }
                                } else {
-                                       print_line(line, linenum, ':');
+                                       print_line(line, line_len, linenum, ':');
                                }
                        }
                }
@@ -291,12 +361,13 @@ static int grep_file(FILE *file)
                else { /* no match */
                        /* if we need to print some context lines after the last match, do so */
                        if (print_n_lines_after) {
-                               print_line(line, linenum, '-');
+                               print_line(line, strlen(line), linenum, '-');
                                print_n_lines_after--;
                        } else if (lines_before) {
                                /* Add the line to the circular 'before' buffer */
                                free(before_buf[curpos]);
                                before_buf[curpos] = line;
+                               USE_EXTRA_COMPAT(before_buf_size[curpos] = line_len;)
                                curpos = (curpos + 1) % lines_before;
                                /* avoid free(line) - we took the line */
                                line = NULL;
@@ -304,13 +375,14 @@ static int grep_file(FILE *file)
                }
 
 #endif /* ENABLE_FEATURE_GREP_CONTEXT */
+#if !ENABLE_EXTRA_COMPAT
                free(line);
-
+#endif
                /* Did we print all context after last requested match? */
                if ((option_mask32 & OPT_m)
                 && !print_n_lines_after && nmatches == max_matches)
                        break;
-       }
+       } /* while (read line) */
 
        /* special-case file post-processing for options where we don't print line
         * matches, just filenames and possibly match counts */
@@ -428,15 +500,16 @@ int grep_main(int argc, char **argv)
                        lines_after = Copt;
                if (!(option_mask32 & OPT_B)) /* not overridden */
                        lines_before = Copt;
-               //option_mask32 |= OPT_A|OPT_B; /* for parser */
        }
        /* sanity checks */
        if (option_mask32 & (OPT_c|OPT_q|OPT_l|OPT_L)) {
                option_mask32 &= ~OPT_n;
                lines_before = 0;
                lines_after = 0;
-       } else if (lines_before > 0)
-               before_buf = xzalloc(lines_before * sizeof(char *));
+       } else if (lines_before > 0) {
+               before_buf = xzalloc(lines_before * sizeof(before_buf[0]));
+               USE_EXTRA_COMPAT(before_buf_size = xzalloc(lines_before * sizeof(before_buf_size[0]));)
+       }
 #else
        /* with auto sanity checks */
        /* -H unsets -h; -c,-q or -l unset -n; -e,-f are lists; -m N */
@@ -537,7 +610,7 @@ int grep_main(int argc, char **argv)
                        if (gl->flg_mem_alocated_compiled & ALLOCATED)
                                free(gl->pattern);
                        if (gl->flg_mem_alocated_compiled & COMPILED)
-                               regfree(&(gl->preg));
+                               regfree(&gl->compiled_regex);
                        free(gl);
                        free(pattern_head_ptr);
                }
index 56761f9..968d757 100644 (file)
@@ -9,6 +9,10 @@
  * Licensed under GPLv2 or later, see file LICENSE in this tarball for details.
  */
 
+/* for getline() [GNUism] */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#endif
 #include "libbb.h"
 
 /* This function reads an entire line from a text file, up to a newline
@@ -55,7 +59,6 @@ char* FAST_FUNC xmalloc_fgets(FILE *file)
 
        return bb_get_chunk_from_file(file, &i);
 }
-
 /* Get line.  Remove trailing \n */
 char* FAST_FUNC xmalloc_fgetline(FILE *file)
 {
@@ -69,6 +72,44 @@ char* FAST_FUNC xmalloc_fgetline(FILE *file)
 }
 
 #if 0
+
+/* GNUism getline() should be faster (not tested) than a loop with fgetc */
+
+/* Get line, including trailing \n if any */
+char* FAST_FUNC xmalloc_fgets(FILE *file)
+{
+       char *res_buf = NULL;
+       size_t res_sz;
+
+       if (getline(&res_buf, &res_sz, file) == -1) {
+               free(res_buf); /* uclibc allocates a buffer even on EOF. WTF? */
+               res_buf = NULL;
+       }
+//TODO: trimming to res_sz?
+       return res_buf;
+}
+/* Get line.  Remove trailing \n */
+char* FAST_FUNC xmalloc_fgetline(FILE *file)
+{
+       char *res_buf = NULL;
+       size_t res_sz;
+
+       res_sz = getline(&res_buf, &res_sz, file);
+
+       if ((ssize_t)res_sz != -1) {
+               if (res_buf[res_sz - 1] == '\n')
+                       res_buf[--res_sz] = '\0';
+//TODO: trimming to res_sz?
+       } else {
+               free(res_buf); /* uclibc allocates a buffer even on EOF. WTF? */
+               res_buf = NULL;
+       }
+       return res_buf;
+}
+
+#endif
+
+#if 0
 /* Faster routines (~twice as fast). +170 bytes. Unused as of 2008-07.
  *
  * NB: they stop at NUL byte too.
index abfa35f..61efb5b 100644 (file)
@@ -27,6 +27,6 @@ void FAST_FUNC xregcomp(regex_t *preg, const char *regex, int cflags)
 {
        char *errmsg = regcomp_or_errmsg(preg, regex, cflags);
        if (errmsg) {
-               bb_error_msg_and_die("xregcomp: %s", errmsg);
+               bb_error_msg_and_die("bad regex '%s': %s", regex, errmsg);
        }
 }
index b2de2af..8cee1b9 100755 (executable)
@@ -62,12 +62,8 @@ testing "grep -s nofile - (stdin and nonexisting file, match)" \
        "grep -s domatch nonexistent - ; echo \$?" \
        "(standard input):domatch\n2\n" "" "nomatch\ndomatch\nend\n"
 
-# This doesn't match GNU behaviour (Binary file input matches)
-# acts like GNU grep -a
-testing "grep handles binary files" "grep foo input" "foo\n" "\0foo\n\n" ""
-# This doesn't match GNU behaviour (Binary file (standard input) matches)
-# acts like GNU grep -a
-testing "grep handles binary stdin" "grep foo" "foo\n" "" "\0foo\n\n"
+testing "grep handles NUL in files" "grep -a foo input" "\0foo\n" "\0foo\n\n" ""
+testing "grep handles NUL on stdin" "grep -a foo" "\0foo\n" "" "\0foo\n\n"
 
 testing "grep matches NUL" "grep . input > /dev/null 2>&1 ; echo \$?" \
        "0\n" "\0\n" ""