1 /* join - join lines of two files on a common field
2 Copyright (C) 91, 1995-2006 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 Written by Mike Haertel, mike@gnu.ai.mit.edu. */
23 #include <sys/types.h>
28 #include "hard-locale.h"
29 #include "linebuffer.h"
30 #include "memcasecmp.h"
36 /* The official name of this program (e.g., no `g' prefix). */
37 #define PROGRAM_NAME "join"
39 #define AUTHORS "Mike Haertel"
41 #define join system_join
43 /* An element of the list identifying which fields to print for each output line. */
47 /* File number: 0, 1, or 2. 0 means use the join field.
48 1 means use the first file argument, 2 the second. */
51 /* Field index (zero-based), specified only when FILE is 1 or 2. */
57 /* A field of a line. */
60 char *beg; /* First character in field. */
61 size_t len; /* The length of the field. */
64 /* A line read from an input file. */
67 struct linebuffer buf; /* The line itself. */
68 size_t nfields; /* Number of elements in `fields'. */
69 size_t nfields_allocated; /* Number of elements allocated for `fields'. */
73 /* One or more consecutive lines read from a file that all have the
74 same join field value. */
77 size_t count; /* Elements used in `lines'. */
78 size_t alloc; /* Elements allocated in `lines'. */
82 /* The name this program was run with. */
85 /* True if the LC_COLLATE locale is hard. */
86 static bool hard_LC_COLLATE;
88 /* If true, print unpairable lines in file 1 or 2. */
89 static bool print_unpairables_1, print_unpairables_2;
91 /* If true, print pairable lines. */
92 static bool print_pairables;
94 /* Empty output field filler. */
95 static char const *empty_filler;
97 /* Fields to join on; SIZE_MAX means they haven't been determined yet. */
98 static size_t join_field_1 = SIZE_MAX;
99 static size_t join_field_2 = SIZE_MAX;
101 /* List of fields to print. */
102 static struct outlist outlist_head;
104 /* Last element in `outlist', where a new element can be added. */
105 static struct outlist *outlist_end = &outlist_head;
107 /* Tab character separating fields. If negative, fields are separated
108 by any nonempty string of blanks, otherwise by exactly one
109 tab character whose value (when cast to unsigned char) equals TAB. */
112 static struct option const longopts[] =
114 {"ignore-case", no_argument, NULL, 'i'},
115 {GETOPT_HELP_OPTION_DECL},
116 {GETOPT_VERSION_OPTION_DECL},
120 /* Blank line passed to prjoin in place of the missing partner when printing an unpairable line. */
121 static struct line uni_blank;
123 /* If true, ignore case when comparing join fields. */
124 static bool ignore_case;
/* Help output.  A non-success STATUS triggers the short hint on stderr;
   the remaining lines are pieces of the full help text, the standard
   help and version option descriptions, and the bug-report address. */
129 if (status != EXIT_SUCCESS)
130 fprintf (stderr, _("Try `%s --help' for more information.\n"),
135 Usage: %s [OPTION]... FILE1 FILE2\n\
139 For each pair of input lines with identical join fields, write a line to\n\
140 standard output. The default join field is the first, delimited\n\
141 by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\
143 -a FILENUM print unpairable lines coming from file FILENUM, where\n\
144 FILENUM is 1 or 2, corresponding to FILE1 or FILE2\n\
145 -e EMPTY replace missing input fields with EMPTY\n\
148 -i, --ignore-case ignore differences in case when comparing fields\n\
149 -j FIELD equivalent to `-1 FIELD -2 FIELD'\n\
150 -o FORMAT obey FORMAT while constructing output line\n\
151 -t CHAR use CHAR as input and output field separator\n\
154 -v FILENUM like -a FILENUM, but suppress joined output lines\n\
155 -1 FIELD join on this FIELD of file 1\n\
156 -2 FIELD join on this FIELD of file 2\n\
158 fputs (HELP_OPTION_DESCRIPTION, stdout);
159 fputs (VERSION_OPTION_DESCRIPTION, stdout);
162 Unless -t CHAR is given, leading blanks separate fields and are ignored,\n\
163 else fields are separated by CHAR. Any FIELD is a field number counted\n\
164 from 1. FORMAT is one or more comma or blank separated specifications,\n\
165 each being `FILENUM.FIELD' or `0'. Default FORMAT outputs the join field,\n\
166 the remaining fields from FILE1, the remaining fields from FILE2, all\n\
167 separated by CHAR.\n\
169 Important: FILE1 and FILE2 must be sorted on the join fields.\n\
170 E.g., use `sort -k 1b,1' if `join' has no options.\n\
172 printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
177 /* Record a field in LINE, with location FIELD and size LEN.
   Only the pointer and the length are stored; no bytes are copied. */
180 extract_field (struct line *line, char *field, size_t len)
/* The field array is full: grow it.  X2NREALLOC is handed the address of
   nfields_allocated, presumably so it can update the capacity -- confirm. */
182 if (line->nfields >= line->nfields_allocated)
184 line->fields = X2NREALLOC (line->fields, &line->nfields_allocated);
/* Fill the next unused slot (index nfields). */
186 line->fields[line->nfields].beg = field;
187 line->fields[line->nfields].len = len;
191 /* Fill in the `fields' structure in LINE.
   Each field is recorded through extract_field as a pointer into the
   line buffer plus a length. */
194 xfields (struct line *line)
196 char *ptr = line->buf.buffer;
/* LIM stops one byte short of the end of the buffer, so the final byte is
   never part of a field (presumably the line terminator -- confirm). */
197 char const *lim = ptr + line->buf.length - 1;
/* Explicit separator: every occurrence of TAB before LIM ends a field, so
   two adjacent separators produce an empty field. */
205 for (; (sep = memchr (ptr, tab, lim - ptr)) != NULL; ptr = sep + 1)
206 extract_field (line, ptr, sep - ptr);
210 /* Skip leading blanks before the first field. */
211 while (isblank (to_uchar (*ptr)))
/* Blank-separated mode: advance SEP to the first blank after PTR (or to
   LIM) and record the text in between as a field. */
218 for (sep = ptr + 1; sep != lim && ! isblank (to_uchar (*sep)); sep++)
220 extract_field (line, ptr, sep - ptr);
/* Then move PTR past the run of blanks that follows the field. */
223 for (ptr = sep + 1; ptr != lim && isblank (to_uchar (*ptr)); ptr++)
/* Record whatever remains between PTR and LIM as a field. */
229 extract_field (line, ptr, lim - ptr);
232 /* Read a line from FP into LINE and split it into fields.
233 Return true if successful.  When readlinebuffer fails, the line buffer
   is released and its pointer is left NULL. */
236 get_line (FILE *fp, struct line *line)
238 initbuffer (&line->buf);
240 if (! readlinebuffer (&line->buf, fp))
/* error is called with EXIT_FAILURE here, so this diagnostic ends the
   program; the cleanup below is for a failed read without that error. */
243 error (EXIT_FAILURE, errno, _("read error"));
244 free (line->buf.buffer);
245 line->buf.buffer = NULL;
/* No field slots are allocated yet for the new line. */
249 line->nfields_allocated = 0;
/* Release the text buffer owned by LINE and reset the pointer to NULL, so
   that a later test of buf.buffer (as in delseq) sees the line as freed. */
257 freeline (struct line *line)
260 free (line->buf.buffer);
261 line->buf.buffer = NULL;
/* Prepare SEQ for use.  NOTE(review): presumably leaves SEQ empty (count
   and alloc zero, no lines array) -- confirm against the body. */
265 initseq (struct seq *seq)
272 /* Read a line from FP and add it to SEQ. Return true if successful.
   The line is read directly into the slot at index count. */
275 getseq (FILE *fp, struct seq *seq)
/* No free slot left: enlarge the lines array first. */
277 if (seq->count == seq->alloc)
278 seq->lines = X2NREALLOC (seq->lines, &seq->alloc);
/* Read into the first unused slot. */
280 if (get_line (fp, &seq->lines[seq->count]))
/* Free every line among the first count entries of SEQ that still owns a
   text buffer. */
289 delseq (struct seq *seq)
292 for (i = 0; i < seq->count; i++)
/* A NULL buffer marks a line that freeline has already released. */
293 if (seq->lines[i].buf.buffer)
294 freeline (&seq->lines[i]);
298 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
299 >0 if it compares greater; 0 if it compares equal.
300 Report an error and exit if the comparison fails.
   LINE1 is indexed with join_field_1 and LINE2 with join_field_2. */
303 keycmp (struct line const *line1, struct line const *line2)
305 /* Start of field to compare in each file. */
310 size_t len2; /* Length of fields to compare. */
/* Use the join field of LINE1 only if the line has that many fields. */
313 if (join_field_1 < line1->nfields)
315 beg1 = line1->fields[join_field_1].beg;
316 len1 = line1->fields[join_field_1].len;
/* Same bounds check for LINE2. */
324 if (join_field_2 < line2->nfields)
326 beg2 = line2->fields[join_field_2].beg;
327 len2 = line2->fields[join_field_2].len;
/* NOTE(review): presumably reached when the first key is empty; it then
   equals an empty second key and sorts before a nonempty one -- confirm. */
336 return len2 == 0 ? 0 : -1;
342 /* FIXME: ignore_case does not work with NLS (in particular,
343 with multibyte chars). */
/* Case-insensitive comparison of the common prefix. */
344 diff = memcasecmp (beg1, beg2, MIN (len1, len2));
/* Collation-based comparison over the full lengths (presumably selected
   when hard_LC_COLLATE is set -- confirm). */
349 return xmemcoll (beg1, len1, beg2, len2);
/* Plain byte comparison of the common prefix. */
350 diff = memcmp (beg1, beg2, MIN (len1, len2));
/* Length tie-break: the shorter key sorts first, equal lengths give 0. */
355 return len1 < len2 ? -1 : len1 != len2;
358 /* Print field N of LINE if it exists and is nonempty, otherwise
359 print empty_filler if one has been set (the code tests the pointer for
   non-NULL, not the string for emptiness).  Output goes to stdout. */
362 prfield (size_t n, struct line const *line)
/* Field N exists in LINE. */
366 if (n < line->nfields)
368 len = line->fields[n].len;
/* Fields are not NUL-terminated, so write exactly LEN bytes. */
370 fwrite (line->fields[n].beg, 1, len, stdout);
/* Fallback within the existing-field case (presumably for a zero-length
   field -- confirm). */
371 else if (empty_filler)
372 fputs (empty_filler, stdout);
/* Fallback when LINE has no field N at all. */
374 else if (empty_filler)
375 fputs (empty_filler, stdout);
378 /* Print the join of LINE1 and LINE2.
   Either argument may be the address of uni_blank, which callers pass in
   place of the missing partner of an unpairable line.  Fields are written
   through prfield and separated by a single separator character. */
381 prjoin (struct line const *line1, struct line const *line2)
383 const struct outlist *outlist;
/* A negative TAB means blank-separated input; output then uses a space. */
384 char output_separator = tab < 0 ? ' ' : tab;
/* The list head itself holds no field spec; the first spec is head.next. */
386 outlist = outlist_head.next;
389 const struct outlist *o;
395 struct line const *line;
/* Choosing which join field to print: when LINE1 is the blank line the
   field index of file 2 is used, otherwise that of file 1. */
399 if (line1 == &uni_blank)
402 field = join_field_2;
407 field = join_field_1;
/* Explicit FILENUM.FIELD spec: pick the line of the named file. */
412 line = (o->file == 1 ? line1 : line2);
415 prfield (field, line);
419 putchar (output_separator);
/* Default format (no output list). */
427 if (line1 == &uni_blank)
/* NOTE(review): the temporary T suggests LINE1 and LINE2 are swapped here
   so that the real line is printed first.  If so, the code below applies
   join_field_1 to a line that came from file 2, which looks wrong when the
   two join fields differ -- confirm. */
429 struct line const *t;
/* Join field first, then the other fields of LINE1 before and after it. */
434 prfield (join_field_1, line1);
435 for (i = 0; i < join_field_1 && i < line1->nfields; ++i)
437 putchar (output_separator);
440 for (i = join_field_1 + 1; i < line1->nfields; ++i)
442 putchar (output_separator);
/* Then the fields of LINE2 other than its join field. */
446 for (i = 0; i < join_field_2 && i < line2->nfields; ++i)
448 putchar (output_separator);
451 for (i = join_field_2 + 1; i < line2->nfields; ++i)
453 putchar (output_separator);
460 /* Print the join of the files in FP1 and FP2.
   Each file is read through a seq whose entry 0 is the current line;
   keycmp decides which side advances. */
463 join (FILE *fp1, FILE *fp2)
465 struct seq seq1, seq2;
470 /* Read the first line of each file. */
/* Main loop: runs while both sequences still hold a line. */
476 while (seq1.count && seq2.count)
479 diff = keycmp (&seq1.lines[0], &seq2.lines[0]);
/* The file-1 line has no partner (presumably the diff < 0 case): print it
   against the blank line if requested, then release it. */
482 if (print_unpairables_1)
483 prjoin (&seq1.lines[0], &uni_blank);
484 freeline (&seq1.lines[0]);
/* Mirror case for an unpairable file-2 line. */
491 if (print_unpairables_2)
492 prjoin (&uni_blank, &seq2.lines[0]);
493 freeline (&seq2.lines[0]);
499 /* Keep reading lines from file1 as long as they continue to
500 match the current line from file2. */
503 if (!getseq (fp1, &seq1))
/* The loop ends on the first line whose key differs, so the last entry of
   seq1 is a lookahead line rather than part of the matching group. */
509 while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]));
511 /* Keep reading lines from file2 as long as they continue to
512 match the current line from file1. */
515 if (!getseq (fp2, &seq2))
521 while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]));
/* Print every pairing of the two matching groups; the bound count - 1
   leaves out the final entry of each sequence. */
525 for (i = 0; i < seq1.count - 1; ++i)
528 for (j = 0; j < seq2.count - 1; ++j)
529 prjoin (&seq1.lines[i], &seq2.lines[j]);
/* Release the printed group of file 1 and move the final entry to slot 0
   as the new current line. */
533 for (i = 0; i < seq1.count - 1; ++i)
534 freeline (&seq1.lines[i]);
537 seq1.lines[0] = seq1.lines[seq1.count - 1];
/* Same for file 2. */
543 for (i = 0; i < seq2.count - 1; ++i)
544 freeline (&seq2.lines[i]);
547 seq2.lines[0] = seq2.lines[seq2.count - 1];
/* After the main loop: if requested, print the current line of file 1 and
   every remaining line as unpairable. */
554 if (print_unpairables_1 && seq1.count)
556 prjoin (&seq1.lines[0], &uni_blank);
557 freeline (&seq1.lines[0]);
558 while (get_line (fp1, &line))
560 prjoin (&line, &uni_blank);
/* Likewise for the remainder of file 2. */
565 if (print_unpairables_2 && seq2.count)
567 prjoin (&uni_blank, &seq2.lines[0]);
568 freeline (&seq2.lines[0]);
569 while (get_line (fp2, &line))
571 prjoin (&uni_blank, &line);
580 /* Add a field spec for field FIELD of file FILE to `outlist'.
   FILE must be 0, 1 or 2, and FIELD must be 0 when FILE is 0; both
   conditions are enforced with assert. */
583 add_field (int file, size_t field)
587 assert (file == 0 || file == 1 || file == 2);
/* File 0 carries no field index of its own, so it must arrive as 0. */
588 assert (file != 0 || field == 0);
/* Allocate the new list element. */
590 o = xmalloc (sizeof *o);
595 /* Add to the end of the list so the fields are in the right order. */
596 outlist_end->next = o;
600 /* Convert a string of decimal digits, STR (the 1-based join field number),
601 to an integral value. Upon successful conversion, return one less
602 (the zero-based field number). If it cannot be converted, give a
603 diagnostic and exit. */
606 string_to_join_field (char const *str)
609 unsigned long int val;
/* Parse STR in base 10; the empty last argument allows no suffixes. */
611 strtol_error s_err = xstrtoul (str, NULL, 10, &val, "");
/* Too large: either the parser reported overflow or the parsed value does
   not fit in size_t. */
612 if (s_err == LONGINT_OVERFLOW || (s_err == LONGINT_OK && SIZE_MAX < val))
614 error (EXIT_FAILURE, 0,
615 _("value %s is so large that it is not representable"),
/* Any other parse failure is invalid, and so is 0 because field numbers
   are counted from 1. */
619 if (s_err != LONGINT_OK || val == 0)
620 error (EXIT_FAILURE, 0, _("invalid field number: %s"), quote (str));
627 /* Convert a single field specifier string, S, to a *FILE_INDEX, *FIELD_INDEX
628 pair. In S, the field index string is 1-based; *FIELD_INDEX is zero-based.
629 If S is valid, return true. Otherwise, give a diagnostic and exit.
   NOTE(review): the caller in add_field_list ignores any result, so the
   return-true wording may be stale -- confirm. */
632 decode_field_spec (const char *s, int *file_index, size_t *field_index)
634 /* The first character must be 0, 1, or 2. */
640 /* `0' must be all alone -- no `.FIELD'. */
641 error (EXIT_FAILURE, 0, _("invalid field specifier: %s"), quote (s));
650 error (EXIT_FAILURE, 0, _("invalid field specifier: %s"), quote (s));
/* Turn the leading digit character into the file number. */
651 *file_index = s[0] - '0';
/* The field number starts at the third character, after the file digit
   and the separator that follows it. */
652 *field_index = string_to_join_field (s + 2);
656 error (EXIT_FAILURE, 0,
657 _("invalid file number in field spec: %s"), quote (s));
659 /* Tell gcc -W -Wall that we can't get beyond this point.
660 This avoids a warning (otherwise legit) that the caller's copies
661 of *file_index and *field_index might be used uninitialized. */
668 /* Add the comma or blank separated field spec(s) in STR to `outlist'.
   Each item is decoded with decode_field_spec and appended with add_field.
   NOTE(review): STR is non-const; presumably each separator is overwritten
   with a NUL so that an item can be decoded as a string -- confirm. */
671 add_field_list (char *str)
/* Start of the current item. */
679 char const *spec_item = p;
/* Find the next comma, space or tab; strpbrk gives NULL when none is left. */
681 p = strpbrk (p, ", \t");
684 decode_field_spec (spec_item, &file_index, &field_index);
685 add_field (file_index, field_index);
690 /* Set the join field *VAR to VAL, but report an error if *VAR is set
691 more than once to incompatible values.  SIZE_MAX in *VAR means that no
   value has been stored yet. */
694 set_join_field (size_t *var, size_t val)
/* Conflict: a value is already stored and it differs from VAL. */
696 if (*var != SIZE_MAX && *var != val)
/* Add 1 so the message shows the 1-based numbers the user typed. */
698 unsigned long int var1 = *var + 1;
699 unsigned long int val1 = val + 1;
700 error (EXIT_FAILURE, 0, _("incompatible join fields %lu, %lu"),
706 /* Status of command-line arguments. */
710 /* This argument must be an operand, i.e., one of the files to be joined. */
714 /* This might be the argument of the preceding -j1 or -j2 option,
715 or it might be an operand. */
719 /* This might be the argument of the preceding -o option, or it might be an operand. */
724 /* Add NAME to the array of input file NAMES with operand statuses
725 OPERAND_STATUS; currently there are NFILES names in the list.
   When a stored name turns out to be surplus, it is reinterpreted as the
   argument of the option that preceded it, according to its status. */
728 add_file_name (char *name, char *names[2],
729 int operand_status[2], int joption_count[2], int *nfiles,
730 int *prev_optc_status, int *optc_status)
/* Pick the stored name to reinterpret: index 1 if names[0] is a definite
   operand, index 0 otherwise. */
736 bool op0 = (operand_status[0] == MUST_BE_OPERAND);
737 char *arg = names[op0];
738 switch (operand_status[op0])
/* The chosen name is a definite operand too, so NAME is one too many. */
740 case MUST_BE_OPERAND:
741 error (0, 0, _("extra operand %s"), quote (name));
742 usage (EXIT_FAILURE);
/* The stored name was really the field number of an earlier -j1. */
744 case MIGHT_BE_J1_ARG:
746 set_join_field (&join_field_1, string_to_join_field (arg));
/* The stored name was really the field number of an earlier -j2. */
749 case MIGHT_BE_J2_ARG:
751 set_join_field (&join_field_2, string_to_join_field (arg));
/* The stored name was really a field list (presumably the
   MIGHT_BE_O_ARG case -- confirm). */
755 add_field_list (arg);
/* Shift the status of the second name into the first slot. */
760 operand_status[0] = operand_status[1];
/* NAME takes the status left behind by the option seen just before it. */
766 operand_status[n] = *prev_optc_status;
/* After a -o list, the following argument may extend that list as well. */
769 if (*prev_optc_status == MIGHT_BE_O_ARG)
770 *optc_status = MIGHT_BE_O_ARG;
/* Program entry point: set up the locale, parse the options and the two
   file operands, open the inputs, and check that closing them succeeds. */
774 main (int argc, char **argv)
777 int prev_optc_status = MUST_BE_OPERAND;
778 int operand_status[2];
779 int joption_count[2] = { 0, 0 };
786 initialize_main (&argc, &argv);
787 program_name = argv[0];
788 setlocale (LC_ALL, "");
789 bindtextdomain (PACKAGE, LOCALEDIR);
790 textdomain (PACKAGE);
791 hard_LC_COLLATE = hard_locale (LC_COLLATE);
793 atexit (close_stdout);
/* Joined lines are printed unless an option turns this off below. */
795 print_pairables = true;
/* The leading - in the option string asks getopt_long to hand back
   non-option arguments in order as option code 1 (see case 1 below). */
797 while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:",
/* Default status for this option; some cases below override it. */
801 optc_status = MUST_BE_OPERAND;
806 print_pairables = false;
/* The FILENUM argument must parse as exactly 1 or 2. */
811 unsigned long int val;
812 if (xstrtoul (optarg, NULL, 10, &val, "") != LONGINT_OK
813 || (val != 1 && val != 2))
814 error (EXIT_FAILURE, 0,
815 _("invalid field number: %s"), quote (optarg));
817 print_unpairables_1 = true;
819 print_unpairables_2 = true;
/* A second empty-field replacement string must equal the first. */
824 if (empty_filler && ! STREQ (empty_filler, optarg))
825 error (EXIT_FAILURE, 0,
826 _("conflicting empty-field replacement strings"));
827 empty_filler = optarg;
835 set_join_field (&join_field_1, string_to_join_field (optarg));
839 set_join_field (&join_field_2, string_to_join_field (optarg));
/* The pointer test holds only when the single digit sits two characters
   into the previous argv element, i.e. directly after the option letter. */
843 if ((optarg[0] == '1' || optarg[0] == '2') && !optarg[1]
844 && optarg == argv[optind - 1] + 2)
846 /* The argument was either "-j1" or "-j2". */
847 bool is_j2 = (optarg[0] == '2');
848 joption_count[is_j2]++;
/* NOTE(review): relies on MIGHT_BE_J2_ARG being MIGHT_BE_J1_ARG + 1 --
   confirm in the enum. */
849 optc_status = MIGHT_BE_J1_ARG + is_j2;
/* Otherwise the argument is a field number applied to both files. */
853 set_join_field (&join_field_1, string_to_join_field (optarg));
854 set_join_field (&join_field_2, join_field_1);
/* The argument after a field list may turn out to continue the list. */
859 add_field_list (optarg);
860 optc_status = MIGHT_BE_O_ARG;
/* Separator character: taken from the first byte of the argument. */
865 unsigned char newtab = optarg[0];
867 error (EXIT_FAILURE, 0, _("empty tab"));
/* The two-character argument backslash-zero is special-cased (presumably
   to select the NUL byte -- confirm). */
870 if (STREQ (optarg, "\\0"))
873 error (EXIT_FAILURE, 0, _("multi-character tab %s"),
/* A non-negative TAB means a separator was already chosen; it must match. */
876 if (0 <= tab && tab != newtab)
877 error (EXIT_FAILURE, 0, _("incompatible tabs"));
882 case 1: /* Non-option argument. */
883 add_file_name (optarg, names, operand_status, joption_count,
884 &nfiles, &prev_optc_status, &optc_status);
887 case_GETOPT_HELP_CHAR;
889 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
892 usage (EXIT_FAILURE);
/* Remember the status of this option for the argument that follows it. */
895 prev_optc_status = optc_status;
898 /* Process any operands after "--". */
899 prev_optc_status = MUST_BE_OPERAND;
900 while (optind < argc)
901 add_file_name (argv[optind++], names, operand_status, joption_count,
902 &nfiles, &prev_optc_status, &optc_status);
/* Too few file names: say which operand is missing and show usage. */
907 error (0, 0, _("missing operand"));
909 error (0, 0, _("missing operand after %s"), quote (argv[argc - 1]));
910 usage (EXIT_FAILURE);
913 /* If "-j1" was specified and it turns out not to have had an argument,
914 treat it as "-j 1". Likewise for -j2. */
/* Index I is 0 for -j1 and 1 for -j2, which is also the zero-based field
   number stored for both files. */
915 for (i = 0; i < 2; i++)
916 if (joption_count[i] != 0)
918 set_join_field (&join_field_1, i);
919 set_join_field (&join_field_2, i);
/* Join fields that are still unset get a default here (presumably the
   first field -- confirm). */
922 if (join_field_1 == SIZE_MAX)
924 if (join_field_2 == SIZE_MAX)
/* A file name of - selects standard input instead of opening a file. */
927 fp1 = STREQ (names[0], "-") ? stdin : fopen (names[0], "r");
929 error (EXIT_FAILURE, errno, "%s", names[0]);
930 fp2 = STREQ (names[1], "-") ? stdin : fopen (names[1], "r");
932 error (EXIT_FAILURE, errno, "%s", names[1]);
/* NOTE(review): errno is passed although no visible system call sets it
   for this condition, so a stale error string may be appended to the
   message; 0 looks like the intended value -- confirm. */
934 error (EXIT_FAILURE, errno, _("both files cannot be standard input"));
/* Report a failed close together with the name of the affected file. */
937 if (fclose (fp1) != 0)
938 error (EXIT_FAILURE, errno, "%s", names[0]);
939 if (fclose (fp2) != 0)
940 error (EXIT_FAILURE, errno, "%s", names[1]);