Bump to version 1.22.1
[platform/upstream/busybox.git] / coreutils / wc.c
1 /* vi: set sw=4 ts=4: */
2 /*
3  * wc implementation for busybox
4  *
5  * Copyright (C) 2003  Manuel Novoa III  <mjn3@codepoet.org>
6  *
7  * Licensed under GPLv2 or later, see file LICENSE in this source tree.
8  */
9
10 /* BB_AUDIT SUSv3 compliant. */
11 /* http://www.opengroup.org/onlinepubs/007904975/utilities/wc.html */
12
13 /* Mar 16, 2003      Manuel Novoa III   (mjn3@codepoet.org)
14  *
15  * Rewritten to fix a number of problems and do some size optimizations.
16  * Problems in the previous busybox implementation (besides bloat) included:
17  *  1) broken 'wc -c' optimization (read note below)
18  *  2) broken handling of '-' args
19  *  3) no checking of ferror on EOF returns
20  *  4) isprint() wasn't considered when word counting.
21  *
22  * NOTES:
23  *
24  * The previous busybox wc attempted an optimization using stat for the
25  * case of counting chars only.  I omitted that because it was broken.
26  * It didn't take into account the possibility of input coming from a
27  * pipe, or input from a file with file pointer not at the beginning.
28  *
29  * To implement such a speed optimization correctly, not only do you
30  * need the size, but also the file position.  Note also that the
31  * file position may be past the end of file.  Consider the example
32  * (adapted from example in gnu wc.c)
33  *
34  *      echo hello > /tmp/testfile &&
35  *      (dd ibs=1k skip=1 count=0 &> /dev/null; wc -c) < /tmp/testfile
36  *
37  * for which 'wc -c' should output '0'.
38  */
39 #include "libbb.h"
40 #include "unicode.h"
41
#if !ENABLE_LOCALE_SUPPORT
/* Without locale support, replace the ctype classifiers with fixed
 * ASCII-only tests:
 *   isspace(c) - true for the space character only
 *   isprint(c) - true for 0x20..0x7e inclusive (the unsigned cast makes
 *                anything below 0x20, including negative values, fail)
 */
# undef isspace
# define isspace(c) ((c) == ' ')
# undef isprint
# define isprint(c) ((unsigned)((c) - 0x20) < (0x7f - 0x20))
#endif
48
/* COUNT_T is the integer type of every counter; COUNT_FMT is the matching
 * printf conversion (length modifier + specifier, without the '%').
 * ENABLE_FEATURE_WC_LARGE selects the wider unsigned long long counters.
 */
#if !ENABLE_FEATURE_WC_LARGE
# define COUNT_T unsigned
# define COUNT_FMT "u"
#else
# define COUNT_T unsigned long long
# define COUNT_FMT "llu"
#endif
56
57 /* We support -m even when UNICODE_SUPPORT is off,
58  * we just don't advertise it in help text,
59  * since it is the same as -c in this case.
60  */
61
62 //usage:#define wc_trivial_usage
63 //usage:       "[-c"IF_UNICODE_SUPPORT("m")"lwL] [FILE]..."
64 //usage:
65 //usage:#define wc_full_usage "\n\n"
66 //usage:       "Count lines, words, and bytes for each FILE (or stdin)\n"
67 //usage:     "\n        -c      Count bytes"
68 //usage:        IF_UNICODE_SUPPORT(
69 //usage:     "\n        -m      Count characters"
70 //usage:        )
71 //usage:     "\n        -l      Count newlines"
72 //usage:     "\n        -w      Count words"
73 //usage:     "\n        -L      Print longest line length"
74 //usage:
75 //usage:#define wc_example_usage
76 //usage:       "$ wc /etc/passwd\n"
77 //usage:       "     31      46    1365 /etc/passwd\n"
78
/* Indices into the per-file and total counter arrays.
 * Each value is also the bit position of the corresponding option in
 * the print_type mask, and counters are printed in ascending index
 * order, so the order here must stay compatible with the column order
 * of "wc -cmlwL" output.
 */
enum {
	WC_LINES = 0,   /* -l */
	WC_WORDS,       /* -w */
	WC_UNICHARS,    /* -m */
	WC_BYTES,       /* -c */
	WC_LENGTH,      /* -L */
	NUM_WCS,        /* number of counters, not a counter itself */
};
90
91 int wc_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
92 int wc_main(int argc UNUSED_PARAM, char **argv)
93 {
94         const char *arg;
95         const char *start_fmt = " %9"COUNT_FMT + 1;
96         const char *fname_fmt = " %s\n";
97         COUNT_T *pcounts;
98         COUNT_T counts[NUM_WCS];
99         COUNT_T totals[NUM_WCS];
100         int num_files;
101         smallint status = EXIT_SUCCESS;
102         unsigned print_type;
103
104         init_unicode();
105
106         print_type = getopt32(argv, "lwmcL");
107
108         if (print_type == 0) {
109                 print_type = (1 << WC_LINES) | (1 << WC_WORDS) | (1 << WC_BYTES);
110         }
111
112         argv += optind;
113         if (!argv[0]) {
114                 *--argv = (char *) bb_msg_standard_input;
115                 fname_fmt = "\n";
116         }
117         if (!argv[1]) { /* zero or one filename? */
118                 if (!((print_type-1) & print_type)) /* exactly one option? */
119                         start_fmt = "%"COUNT_FMT;
120         }
121
122         memset(totals, 0, sizeof(totals));
123
124         pcounts = counts;
125
126         num_files = 0;
127         while ((arg = *argv++) != NULL) {
128                 FILE *fp;
129                 const char *s;
130                 unsigned u;
131                 unsigned linepos;
132                 smallint in_word;
133
134                 ++num_files;
135                 fp = fopen_or_warn_stdin(arg);
136                 if (!fp) {
137                         status = EXIT_FAILURE;
138                         continue;
139                 }
140
141                 memset(counts, 0, sizeof(counts));
142                 linepos = 0;
143                 in_word = 0;
144
145                 while (1) {
146                         int c;
147                         /* Our -w doesn't match GNU wc exactly... oh well */
148
149                         c = getc(fp);
150                         if (c == EOF) {
151                                 if (ferror(fp)) {
152                                         bb_simple_perror_msg(arg);
153                                         status = EXIT_FAILURE;
154                                 }
155                                 goto DO_EOF;  /* Treat an EOF as '\r'. */
156                         }
157
158                         /* Cater for -c and -m */
159                         ++counts[WC_BYTES];
160                         if (unicode_status != UNICODE_ON /* every byte is a new char */
161                          || (c & 0xc0) != 0x80 /* it isn't a 2nd+ byte of a Unicode char */
162                         ) {
163                                 ++counts[WC_UNICHARS];
164                         }
165
166                         if (isprint_asciionly(c)) { /* FIXME: not unicode-aware */
167                                 ++linepos;
168                                 if (!isspace(c)) {
169                                         in_word = 1;
170                                         continue;
171                                 }
172                         } else if ((unsigned)(c - 9) <= 4) {
173                                 /* \t  9
174                                  * \n 10
175                                  * \v 11
176                                  * \f 12
177                                  * \r 13
178                                  */
179                                 if (c == '\t') {
180                                         linepos = (linepos | 7) + 1;
181                                 } else {  /* '\n', '\r', '\f', or '\v' */
182  DO_EOF:
183                                         if (linepos > counts[WC_LENGTH]) {
184                                                 counts[WC_LENGTH] = linepos;
185                                         }
186                                         if (c == '\n') {
187                                                 ++counts[WC_LINES];
188                                         }
189                                         if (c != '\v') {
190                                                 linepos = 0;
191                                         }
192                                 }
193                         } else {
194                                 continue;
195                         }
196
197                         counts[WC_WORDS] += in_word;
198                         in_word = 0;
199                         if (c == EOF) {
200                                 break;
201                         }
202                 }
203
204                 fclose_if_not_stdin(fp);
205
206                 if (totals[WC_LENGTH] < counts[WC_LENGTH]) {
207                         totals[WC_LENGTH] = counts[WC_LENGTH];
208                 }
209                 totals[WC_LENGTH] -= counts[WC_LENGTH];
210
211  OUTPUT:
212                 /* coreutils wc tries hard to print pretty columns
213                  * (saves results for all files, finds max col len etc...)
214                  * we won't try that hard, it will bloat us too much */
215                 s = start_fmt;
216                 u = 0;
217                 do {
218                         if (print_type & (1 << u)) {
219                                 printf(s, pcounts[u]);
220                                 s = " %9"COUNT_FMT; /* Ok... restore the leading space. */
221                         }
222                         totals[u] += pcounts[u];
223                 } while (++u < NUM_WCS);
224                 printf(fname_fmt, arg);
225         }
226
227         /* If more than one file was processed, we want the totals.  To save some
228          * space, we set the pcounts ptr to the totals array.  This has the side
229          * effect of trashing the totals array after outputting it, but that's
230          * irrelavent since we no longer need it. */
231         if (num_files > 1) {
232                 num_files = 0;  /* Make sure we don't get here again. */
233                 arg = "total";
234                 pcounts = totals;
235                 --argv;
236                 goto OUTPUT;
237         }
238
239         fflush_stdout_and_exit(status);
240 }