From: James Youngman Date: Tue, 19 Feb 2008 13:13:00 +0000 (+0100) Subject: join: new options: --check-order and --nocheck-order. X-Git-Tag: v6.11~131 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=a1e715698a038af7ff341011a2aeecf6729c8de9;p=platform%2Fupstream%2Fcoreutils.git join: new options: --check-order and --nocheck-order. * src/join.c: Support --check-order and --nocheck-order. New variables check_input_order, seen_unpairable and issued_disorder_warning[]. For --check-order, verify that the input files are in sorted order. For the default case, check the order only if there are unpairable lines. (join): Perform ordering checks after reaching EOF on either input. (usage): Mention --check-order and --nocheck-order. (dupline): Save a copy of the previously-read input line so that we can detect disorder on the input. (get_line): Temporarily save a copy of the previous line (by calling dupline) and check relative ordering (by calling checkorder) before returning the newly-read line. (getseq, join): Tell get_line which file we are reading from. (advance_seq): New function, factoring out some of the code commonly surrounding calls to getseq. (checkorder): New function. Verifies that a pair of consecutive input lines are in sorted order. * doc/coreutils.texi (join invocation): Document the new options --check-order and --nocheck-order. * tests/join/Test.pm (tv): Added tests for --check-order and --nocheck-order. * NEWS: Mention this new feature. --- diff --git a/NEWS b/NEWS index bc1b47d..daa5633 100644 --- a/NEWS +++ b/NEWS @@ -16,6 +16,11 @@ GNU coreutils NEWS -*- outline -*- rather than reporting the invalid string format. [bug introduced in coreutils-6.0] +** New features + + join now verifies that the inputs are in sorted order. This check can + be turned off with the --nocheck-order option. + ** Improvements ls --color no longer outputs unnecessary escape sequences diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 016673a..e8ccb4b 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -5149,10 +5149,10 @@ sort a file on its default join field, but if you select a non-default locale, join field, separator, or comparison options, then you should do so consistently between @command{join} and @command{sort}. -As a @acronym{GNU} extension, if the input has no unpairable lines the -sort order can be any order that considers two fields to be equal if and -only if the sort comparison described above considers them to be equal. -For example: +If the input has no unpairable lines, a @acronym{GNU} extension is +available; the sort order can be any order that considers two fields +to be equal if and only if the sort comparison described above +considers them to be equal. For example: @example $ cat file1 @@ -5169,6 +5169,19 @@ c c1 c2 b b1 b2 @end example +If the @option{--check-order} option is given, unsorted inputs will +cause a fatal error message. If the option @option{--nocheck-order} +is given, unsorted inputs will never cause an error message. If +neither of these options is given, wrongly sorted inputs are diagnosed +only if an input file is found to contain unpairable lines. If an +input file is diagnosed as being unsorted, the @command{join} command +will exit with a nonzero status (and the output should not be used). + +Forcing @command{join} to process wrongly sorted input files +containing unpairable lines by specifying @option{--nocheck-order} is +not guaranteed to produce any particular output. The output will +probably not correspond with whatever you hoped it would be. + The defaults are: @itemize @item the join field is the first field in each line; @@ -5188,6 +5201,12 @@ The program accepts the following options. Also see @ref{Common options}. Print a line for each unpairable line in file @var{file-number} (either @samp{1} or @samp{2}), in addition to the normal output. +@item --check-order +Fail with an error message if either input file is wrongly ordered. + +@item --nocheck-order +Do not check that both input files are in sorted order. This is the default. + @item -e @var{string} @opindex -e Replace those output fields that are missing in the input with diff --git a/src/join.c b/src/join.c index a6ca7e4..71b11da 100644 --- a/src/join.c +++ b/src/join.c @@ -1,5 +1,5 @@ /* join - join lines of two files on a common field - Copyright (C) 91, 1995-2006 Free Software Foundation, Inc. + Copyright (C) 91, 1995-2006, 2008 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -90,6 +90,12 @@ static bool print_unpairables_1, print_unpairables_2; /* If nonzero, print pairable lines. */ static bool print_pairables; +/* If nonzero, we have seen at least one unpairable line. */ +static bool seen_unpairable; + +/* If nonzero, we have warned about disorder in that file. */ +static bool issued_disorder_warning[2]; + /* Empty output field filler. */ static char const *empty_filler; @@ -108,9 +114,26 @@ static struct outlist *outlist_end = &outlist_head; tab character whose value (when cast to unsigned char) equals TAB. */ static int tab = -1; +/* If nonzero, check that the input is correctly ordered. */ +static enum + { + CHECK_ORDER_DEFAULT, + CHECK_ORDER_ENABLED, + CHECK_ORDER_DISABLED + } check_input_order; + +enum +{ + CHECK_ORDER_OPTION = CHAR_MAX + 1, + NOCHECK_ORDER_OPTION +}; + + static struct option const longopts[] = { {"ignore-case", no_argument, NULL, 'i'}, + {"check-order", no_argument, NULL, CHECK_ORDER_OPTION}, + {"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION}, {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} @@ -122,6 +145,9 @@ static struct line uni_blank; /* If nonzero, ignore case when comparing join fields. */ static bool ignore_case; + +static void checkorder (const struct line *, const struct line *, int); + void usage (int status) { @@ -153,6 +179,9 @@ by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\ -v FILENUM like -a FILENUM, but suppress joined output lines\n\ -1 FIELD join on this FIELD of file 1\n\ -2 FIELD join on this FIELD of file 2\n\ + --check-order check that the input is correctly sorted, even\n\ + if all input lines are pairable\n\ + --nocheck-order do not check that the input is correctly sorted\n\ "), stdout); fputs (HELP_OPTION_DESCRIPTION, stdout); fputs (VERSION_OPTION_DESCRIPTION, stdout); @@ -167,6 +196,8 @@ separated by CHAR.\n\ \n\ Important: FILE1 and FILE2 must be sorted on the join fields.\n\ E.g., use `sort -k 1b,1' if `join' has no options.\n\ +If the input is not sorted and some lines cannot be joined, a\n\ +warning message will be given.\n\ "), stdout); emit_bug_reporting_address (); } @@ -228,12 +259,49 @@ xfields (struct line *line) extract_field (line, ptr, lim - ptr); } +static struct line * +dup_line (const struct line *old) +{ + struct line *newline = xmalloc (sizeof *newline); + size_t i; + + /* Duplicate the buffer. */ + initbuffer (&newline->buf); + newline->buf.buffer = xmalloc (old->buf.size); + newline->buf.size = old->buf.size; + memcpy (newline->buf.buffer, old->buf.buffer, old->buf.length); + newline->buf.length = old->buf.length; + + /* Duplicate the field positions. */ + newline->fields = xnmalloc (old->nfields_allocated, sizeof *newline->fields); + newline->nfields = old->nfields; + newline->nfields_allocated = old->nfields_allocated; + + for (i = 0; i < old->nfields; i++) + { + newline->fields[i].len = old->fields[i].len; + newline->fields[i].beg = newline->buf.buffer + (old->fields[i].beg + - old->buf.buffer); + } + return newline; +} + +static void +freeline (struct line *line) +{ + free (line->fields); + free (line->buf.buffer); + line->buf.buffer = NULL; +} + /* Read a line from FP into LINE and split it into fields. Return true if successful. */ static bool -get_line (FILE *fp, struct line *line) +get_line (FILE *fp, struct line *line, int which) { + static struct line *prevline[2]; + initbuffer (&line->buf); if (! readlinebuffer (&line->buf, fp)) @@ -249,15 +317,14 @@ get_line (FILE *fp, struct line *line) line->nfields = 0; line->fields = NULL; xfields (line); - return true; -} -static void -freeline (struct line *line) -{ - free (line->fields); - free (line->buf.buffer); - line->buf.buffer = NULL; + if (prevline[which - 1]) + { + checkorder (prevline[which - 1], line, which); + freeline (prevline[which - 1]); + } + prevline[which - 1] = dup_line (line); + return true; } static void @@ -271,12 +338,12 @@ initseq (struct seq *seq) /* Read a line from FP and add it to SEQ. Return true if successful. */ static bool -getseq (FILE *fp, struct seq *seq) +getseq (FILE *fp, struct seq *seq, int whichfile) { if (seq->count == seq->alloc) seq->lines = X2NREALLOC (seq->lines, &seq->alloc); - if (get_line (fp, &seq->lines[seq->count])) + if (get_line (fp, &seq->lines[seq->count], whichfile)) { ++seq->count; return true; @@ -284,6 +351,20 @@ getseq (FILE *fp, struct seq *seq) return false; } +/* Read a line from FP and add it to SEQ, as the first item if FIRST is + * true, else as the next. + */ +static bool +advance_seq (FILE *fp, struct seq *seq, bool first, int whichfile) +{ + if (first) + { + freeline (&seq->lines[0]); + seq->count = 0; + } + return getseq (fp, seq, whichfile); +} + static void delseq (struct seq *seq) { @@ -354,6 +435,44 @@ keycmp (struct line const *line1, struct line const *line2) return len1 < len2 ? -1 : len1 != len2; } + + +/* Check that successive input lines PREV and CURRENT from input file + * WHATFILE are presented in order, unless the user may be relying on + * the GNU extension that input lines may be out of order if no input + * lines are unpairable. + * + * If the user specified --nocheck-order, the check is not made. + * If the user specified --check-order, the problem is fatal. + * Otherwise (the default), the message is simply a warning. + * + * A message is printed at most once per input file. + */ +static void +checkorder (const struct line *prev, + const struct line *current, + int whatfile) +{ + if (check_input_order != CHECK_ORDER_DISABLED + && ((check_input_order == CHECK_ORDER_ENABLED) || seen_unpairable)) + { + if (!issued_disorder_warning[whatfile-1]) + { + if (keycmp (prev, current) > 0) + { + error ((check_input_order == CHECK_ORDER_ENABLED ? 1 : 0), + 0, _("File %d is not in sorted order"), whatfile); + + /* If we get to here, the message was just a warning, but we + want only to issue it once. */ + issued_disorder_warning[whatfile-1] = true; + } + } + } +} + + + /* Print field N of LINE if it exists and is nonempty, otherwise `empty_filler' if it is nonempty. */ @@ -464,13 +583,13 @@ join (FILE *fp1, FILE *fp2) struct seq seq1, seq2; struct line line; int diff; - bool eof1, eof2; + bool eof1, eof2, checktail; /* Read the first line of each file. */ initseq (&seq1); - getseq (fp1, &seq1); + getseq (fp1, &seq1, 1); initseq (&seq2); - getseq (fp2, &seq2); + getseq (fp2, &seq2, 2); while (seq1.count && seq2.count) { @@ -480,18 +599,16 @@ join (FILE *fp1, FILE *fp2) { if (print_unpairables_1) prjoin (&seq1.lines[0], &uni_blank); - freeline (&seq1.lines[0]); - seq1.count = 0; - getseq (fp1, &seq1); + advance_seq (fp1, &seq1, true, 1); + seen_unpairable = true; continue; } if (diff > 0) { if (print_unpairables_2) prjoin (&uni_blank, &seq2.lines[0]); - freeline (&seq2.lines[0]); - seq2.count = 0; - getseq (fp2, &seq2); + advance_seq (fp2, &seq2, true, 2); + seen_unpairable = true; continue; } @@ -499,7 +616,7 @@ join (FILE *fp1, FILE *fp2) match the current line from file2. */ eof1 = false; do - if (!getseq (fp1, &seq1)) + if (!advance_seq (fp1, &seq1, false, 1)) { eof1 = true; ++seq1.count; @@ -511,7 +628,7 @@ join (FILE *fp1, FILE *fp2) match the current line from file1. */ eof2 = false; do - if (!getseq (fp2, &seq2)) + if (!advance_seq (fp2, &seq2, false, 2)) { eof2 = true; ++seq2.count; @@ -550,25 +667,46 @@ join (FILE *fp1, FILE *fp2) seq2.count = 0; } - if (print_unpairables_1 && seq1.count) + /* If the user did not specify --check-order, and the we read the + * tail ends of both inputs to verify that they are in order. We + * skip the rest of the tail once we have issued a warning for that + * file, unless we actually need to print the unpairable lines. + */ + if (check_input_order != CHECK_ORDER_DISABLED + && !(issued_disorder_warning[0] && issued_disorder_warning[1])) + checktail = true; + else + checktail = false; + + if ((print_unpairables_1 || checktail) && seq1.count) { - prjoin (&seq1.lines[0], &uni_blank); + if (print_unpairables_1) + prjoin (&seq1.lines[0], &uni_blank); freeline (&seq1.lines[0]); - while (get_line (fp1, &line)) + seen_unpairable = true; + while (get_line (fp1, &line, 1)) { - prjoin (&line, &uni_blank); + if (print_unpairables_1) + prjoin (&line, &uni_blank); freeline (&line); + if (issued_disorder_warning[0] && !print_unpairables_1) + break; } } - if (print_unpairables_2 && seq2.count) + if ((print_unpairables_2 || checktail) && seq2.count) { - prjoin (&uni_blank, &seq2.lines[0]); + if (print_unpairables_2) + prjoin (&uni_blank, &seq2.lines[0]); freeline (&seq2.lines[0]); - while (get_line (fp2, &line)) + seen_unpairable = true; + while (get_line (fp2, &line, 2)) { - prjoin (&uni_blank, &line); + if (print_unpairables_2) + prjoin (&uni_blank, &line); freeline (&line); + if (issued_disorder_warning[1] && !print_unpairables_2) + break; } } @@ -789,6 +927,9 @@ main (int argc, char **argv) atexit (close_stdout); print_pairables = true; + seen_unpairable = false; + issued_disorder_warning[0] = issued_disorder_warning[1] = false; + check_input_order = CHECK_ORDER_DEFAULT; while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:", longopts, NULL)) @@ -875,6 +1016,14 @@ main (int argc, char **argv) } break; + case NOCHECK_ORDER_OPTION: + check_input_order = CHECK_ORDER_DISABLED; + break; + + case CHECK_ORDER_OPTION: + check_input_order = CHECK_ORDER_ENABLED; + break; + case 1: /* Non-option argument. */ add_file_name (optarg, names, operand_status, joption_count, &nfiles, &prev_optc_status, &optc_status); @@ -935,5 +1084,8 @@ main (int argc, char **argv) if (fclose (fp2) != 0) error (EXIT_FAILURE, errno, "%s", names[1]); - exit (EXIT_SUCCESS); + if (issued_disorder_warning[0] || issued_disorder_warning[1]) + exit (EXIT_FAILURE); + else + exit (EXIT_SUCCESS); } diff --git a/tests/join/Test.pm b/tests/join/Test.pm index 4813604..6d91908 100644 --- a/tests/join/Test.pm +++ b/tests/join/Test.pm @@ -1,6 +1,6 @@ # Test "join". -# Copyright (C) 1996, 1999, 2000, 2003, 2004 Free Software Foundation, Inc. +# Copyright (C) 1996, 1999-2000, 2003-2004, 2008 Free Software Foundation, Inc. # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -140,7 +140,38 @@ my @tv = ( # FIXME: change this to ensure the diagnostic makes sense ['invalid-j', '-j x', {}, "", 1], -); +# With ordering check, inputs in order +['chkodr-1', '--check-order', + [" a 1\n b 2\n", " a Y\n b Z\n"], "a 1 Y\nb 2 Z\n", 0], + +# Without check, inputs in order +['chkodr-2', '--nocheck-order', + [" a 1\n b 2\n", " a Y\n b Z\n"], "a 1 Y\nb 2 Z\n", 0], + +# Without check, both inputs out of order (in fact, in reverse order) +# but all pairable. Support for this is a GNU extension. +['chkodr-3', '--nocheck-order', + [" b 1\n a 2\n", " b Y\n a Z\n"], "b 1 Y\na 2 Z\n", 0], + +# The extension should work without --nocheck-order, since that is the +# default. +['chkodr-4', '', + [" b 1\n a 2\n", " b Y\n a Z\n"], "b 1 Y\na 2 Z\n", 0], + +# With check, both inputs out of order (in fact, in reverse order) +['chkodr-5', '--check-order', + [" b 1\n a 2\n", " b Y\n a Z\n"], "", 1], + +# Without order check, both inputs out of order and some lines +# unpairable. This is NOT supported by the GNU extension. All that +# we really care about for this test is that the return status is +# zero, since that is the only way to actually verify that the +# --nocheck-order option had any effect. We don't actually want to +# guarantee that join produces this output on stdout. +['chkodr-6', '--nocheck-order', + [" b 1\n a 2\n", " b Y\n c Z\n"], "b 1 Y\n", 0] +) +; sub test_vector