wc: ignore multibyte-character decoding errors

author James Youngman <jay@gnu.org>

Sat, 26 May 2007 05:08:18 +0000 (07:08 +0200)

committer Jim Meyering <jim@meyering.net>

Sat, 26 May 2007 05:13:50 +0000 (07:13 +0200)
author James Youngman <jay@gnu.org>
Sat, 26 May 2007 05:08:18 +0000 (07:08 +0200)
committer Jim Meyering <jim@meyering.net>
Sat, 26 May 2007 05:13:50 +0000 (07:13 +0200)
diff --git a/ChangeLog b/ChangeLog

index 34304f1..ab44638 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2007-05-25  James Youngman  <jay@gnu.org>
+
+       wc: ignore multibyte-character decoding errors
+       * src/wc.c (wc): Don't issue an error message when mbrtowc
+       indicates that we have seen an invalid byte sequence.  This
+       makes "wc /bin/sh" bearable (though the word and line counts
+       are likely not to be useful).
+       * NEWS: Mention the change.
+
  2007-05-22  Jim Meyering  <jim@meyering.net>
  
         Check for an up-to-date copyright year in coreutils.texi.
diff --git a/NEWS b/NEWS

index ea08e0a..715b0d6 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -10,6 +10,11 @@ GNU coreutils NEWS                                    -*- outline -*-
    option of the same name, this makes uniq consume and produce
    NUL-terminated lines rather than newline-terminated lines.
  
+  wc no longer warns about character decoding errors in multibyte locales.
+  This means for example that "wc /bin/sh" now produces normal output
+  (though the word count will have no real meaning) rather than many
+  error messages.
+
  ** Bug fixes
  
    cut now diagnoses a range starting with zero (e.g., -f 0-2) as invalid;
diff --git a/src/wc.c b/src/wc.c

index 85f7d33..b4464d2 100644 (file)
--- a/src/wc.c
+++ b/src/wc.c
@@ -1,5 +1,5 @@
  /* wc - print the number of lines, words, and bytes in files
-   Copyright (C) 85, 91, 1995-2006 Free Software Foundation, Inc.
+   Copyright (C) 85, 91, 1995-2007 Free Software Foundation, Inc.
  
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -274,8 +274,6 @@ wc (int fd, char const *file_x, struct fstatus *fstatus)
        bool in_word = false;
        uintmax_t linepos = 0;
        mbstate_t state = { 0, };
-      uintmax_t last_error_line = 0;
-      int last_error_errno = 0;
  # if SUPPORT_OLD_MBRTOWC
        /* Back-up the state before each multibyte character conversion and
          move the last incomplete character of the buffer to the front
@@ -323,17 +321,10 @@ wc (int fd, char const *file_x, struct fstatus *fstatus)
                 }
               if (n == (size_t) -1)
                 {
-                 /* Signal repeated errors only once per line.  */
-                 if (!(lines + 1 == last_error_line
-                       && errno == last_error_errno))
-                   {
-                     char line_number_buf[INT_BUFSIZE_BOUND (uintmax_t)];
-                     last_error_line = lines + 1;
-                     last_error_errno = errno;
-                     error (0, errno, "%s:%s", file,
-                            umaxtostr (last_error_line, line_number_buf));
-                     ok = false;
-                   }
+                 /* Remember that we read a byte, but don't complain
+                    about the error.  Because of the decoding error,
+                    this is considered to be a byte but not a
+                    character (that is, chars is not incremented).  */
                   p++;
                   bytes_read--;
                 }
author	James Youngman <jay@gnu.org>
	Sat, 26 May 2007 05:08:18 +0000 (07:08 +0200)
committer	Jim Meyering <jim@meyering.net>
	Sat, 26 May 2007 05:13:50 +0000 (07:13 +0200)
ChangeLog		patch | blob | history
NEWS		patch | blob | history
src/wc.c		patch | blob | history