1 /* join - join lines of two files on a common field
2 Copyright (C) 91, 1995-2006 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18 Written by Mike Haertel, mike@gnu.ai.mit.edu. */
23 #include <sys/types.h>
28 #include "hard-locale.h"
29 #include "linebuffer.h"
30 #include "memcasecmp.h"
36 /* The official name of this program (e.g., no `g' prefix). */
37 #define PROGRAM_NAME "join"
39 #define AUTHORS "Mike Haertel"
41 #define join system_join
43 /* An element of the list identifying which fields to print for each
47 /* File number: 0, 1, or 2. 0 means use the join field.
48 1 means use the first file argument, 2 the second. */
51 /* Field index (zero-based), specified only when FILE is 1 or 2. */
57 /* A field of a line. */
60 char *beg; /* First character in field. */
61 size_t len; /* The length of the field. */
64 /* A line read from an input file. */
67 struct linebuffer buf; /* The line itself. */
68 size_t nfields; /* Number of elements in `fields'. */
69 size_t nfields_allocated; /* Number of elements allocated for `fields'. */
73 /* One or more consecutive lines read from a file that all have the
74 same join field value. */
77 size_t count; /* Elements used in `lines'. */
78 size_t alloc; /* Elements allocated in `lines'. */
82 /* The name this program was run with. */
85 /* True if the LC_COLLATE locale is hard. */
86 static bool hard_LC_COLLATE;
88 /* If true, print unpairable lines in file 1 or 2. */
89 static bool print_unpairables_1, print_unpairables_2;
91 /* If true, print pairable lines. */
92 static bool print_pairables;
94 /* Empty output field filler. */
95 static char const *empty_filler;
97 /* Fields to join on; SIZE_MAX means they haven't been determined yet. */
98 static size_t join_field_1 = SIZE_MAX;
99 static size_t join_field_2 = SIZE_MAX;
101 /* List of fields to print. */
102 static struct outlist outlist_head;
104 /* Last element in `outlist', where a new element can be added. */
105 static struct outlist *outlist_end = &outlist_head;
107 /* Tab character separating fields. If negative, fields are separated
108 by any nonempty string of blanks, otherwise by exactly one
109 tab character whose value (when cast to unsigned char) equals TAB. */
112 static struct option const longopts[] =
114 {"ignore-case", no_argument, NULL, 'i'},
115 {GETOPT_HELP_OPTION_DECL},
116 {GETOPT_VERSION_OPTION_DECL},
120 /* Used to print non-joining lines */
121 static struct line uni_blank;
123 /* If true, ignore case when comparing join fields. */
124 static bool ignore_case;
/* NOTE(review): these lines look like the body of the usage routine that
   other code calls as `usage (EXIT_FAILURE)'; its signature and the calls
   that open each help-text string are not visible in this listing.
   What is visible: when STATUS is not EXIT_SUCCESS a one-line hint to try
   --help is written to stderr; otherwise the help text follows, with the
   standard --help/--version descriptions written to stdout and, last, the
   bug-report address taken from PACKAGE_BUGREPORT.  */
129 if (status != EXIT_SUCCESS)
130 fprintf (stderr, _("Try `%s --help' for more information.\n"),
135 Usage: %s [OPTION]... FILE1 FILE2\n\
139 For each pair of input lines with identical join fields, write a line to\n\
140 standard output. The default join field is the first, delimited\n\
141 by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\
143 -a FILENUM print unpairable lines coming from file FILENUM, where\n\
144 FILENUM is 1 or 2, corresponding to FILE1 or FILE2\n\
145 -e EMPTY replace missing input fields with EMPTY\n\
148 -i, --ignore-case ignore differences in case when comparing fields\n\
149 -j FIELD equivalent to `-1 FIELD -2 FIELD'\n\
150 -o FORMAT obey FORMAT while constructing output line\n\
151 -t CHAR use CHAR as input and output field separator\n\
154 -v FILENUM like -a FILENUM, but suppress joined output lines\n\
155 -1 FIELD join on this FIELD of file 1\n\
156 -2 FIELD join on this FIELD of file 2\n\
158 fputs (HELP_OPTION_DESCRIPTION, stdout);
159 fputs (VERSION_OPTION_DESCRIPTION, stdout);
162 Unless -t CHAR is given, leading blanks separate fields and are ignored,\n\
163 else fields are separated by CHAR. Any FIELD is a field number counted\n\
164 from 1. FORMAT is one or more comma or blank separated specifications,\n\
165 each being `FILENUM.FIELD' or `0'. Default FORMAT outputs the join field,\n\
166 the remaining fields from FILE1, the remaining fields from FILE2, all\n\
167 separated by CHAR.\n\
169 Important: FILE1 and FILE2 must be sorted on the join fields.\n\
170 E.g., use `sort -k 1b,1' if `join' has no options.\n\
172 printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
177 /* Return true if C is a blank (a default input field separator). */
180 is_blank (unsigned char c)
/* Comparing against 0 normalizes whatever nonzero value ISBLANK yields
   into a plain 0-or-1 result.  */
182 return ISBLANK (c) != 0;
185 /* Record a field in LINE, with location FIELD and size LEN. */
/* FIELD points into the line's own text; only the pointer and the length
   are stored, nothing is copied.  */
188 extract_field (struct line *line, char *field, size_t len)
/* Every allocated slot is in use: enlarge the `fields' array through
   X2NREALLOC, which also updates nfields_allocated.  */
190 if (line->nfields >= line->nfields_allocated)
192 line->fields = X2NREALLOC (line->fields, &line->nfields_allocated);
/* Fill the first unused slot.
   NOTE(review): no increment of line->nfields is visible in this listing;
   confirm against the full source that the count is advanced.  */
194 line->fields[line->nfields].beg = field;
195 line->fields[line->nfields].len = len;
199 /* Fill in the `fields' structure in LINE. */
202 xfields (struct line *line)
204 char *ptr = line->buf.buffer;
/* LIM stops one byte short of the buffer's recorded length; presumably
   that last byte is the line terminator -- TODO confirm against
   readlinebuffer.  */
205 char const *lim = ptr + line->buf.length - 1;
/* Separator mode: every occurrence of the `tab' byte found by memchr
   before LIM closes one field, and scanning resumes just past it.
   NOTE(review): the test that selects this mode is not visible here.  */
213 for (; (sep = memchr (ptr, tab, lim - ptr)) != NULL; ptr = sep + 1)
214 extract_field (line, ptr, sep - ptr);
218 /* Skip leading blanks before the first field. */
219 while (is_blank (*ptr))
/* Advance SEP to the next blank or to LIM; the bytes from PTR up to SEP
   form one field.  */
226 for (sep = ptr + 1; sep != lim && ! is_blank (*sep); sep++)
228 extract_field (line, ptr, sep - ptr);
/* Step over the run of blanks that follows the field just recorded.  */
231 for (ptr = sep + 1; ptr != lim && is_blank (*ptr); ptr++)
/* Record whatever remains between PTR and LIM as a field.  */
237 extract_field (line, ptr, lim - ptr);
240 /* Read a line from FP into LINE and split it into fields.
241 Return true if successful. */
244 get_line (FILE *fp, struct line *line)
/* The line buffer is initialized afresh on every call.  */
246 initbuffer (&line->buf);
/* readlinebuffer reported that no line was obtained.  */
248 if (! readlinebuffer (&line->buf, fp))
/* NOTE(review): the condition guarding this diagnostic is not visible in
   this listing; presumably it is reported only for a genuine stream
   error and not at plain end of file -- confirm.  */
251 error (EXIT_FAILURE, errno, _("read error"));
/* Release the buffer and clear the pointer; delseq only frees lines whose
   buffer pointer is non-NULL.  */
252 free (line->buf.buffer);
253 line->buf.buffer = NULL;
/* Begin with no field slots allocated; extract_field grows the array on
   demand.  */
257 line->nfields_allocated = 0;
/* Release the storage held by LINE: the line buffer is freed and its
   pointer reset to NULL so a later check can tell it is gone.
   NOTE(review): only the buffer release is visible in this listing; other
   members may be released on lines not shown.  */
265 freeline (struct line *line)
268 free (line->buf.buffer);
269 line->buf.buffer = NULL;
/* NOTE(review): the body of this function is not visible in this listing.
   By its name and its `struct seq *' parameter it presumably puts SEQ
   into an empty initial state -- confirm against the full source.  */
273 initseq (struct seq *seq)
280 /* Read a line from FP and add it to SEQ. Return true if successful. */
283 getseq (FILE *fp, struct seq *seq)
/* All allocated elements are in use: enlarge `lines' through X2NREALLOC,
   which also updates seq->alloc.  */
285 if (seq->count == seq->alloc)
286 seq->lines = X2NREALLOC (seq->lines, &seq->alloc);
/* Read directly into the first unused element.
   NOTE(review): the bookkeeping done after a successful read (and the
   return statements) are not visible in this listing.  */
288 if (get_line (fp, &seq->lines[seq->count]))
/* Free every line in SEQ that still owns a buffer.  Elements whose buffer
   pointer is already NULL are skipped.
   NOTE(review): any release of the `lines' array itself is not visible in
   this listing.  */
297 delseq (struct seq *seq)
300 for (i = 0; i < seq->count; i++)
301 if (seq->lines[i].buf.buffer)
302 freeline (&seq->lines[i]);
306 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
307 >0 if it compares greater; 0 if it compares equal.
308 Report an error and exit if the comparison fails. */
311 keycmp (struct line const *line1, struct line const *line2)
313 /* Start of field to compare in each file. */
318 size_t len2; /* Length of fields to compare. */
/* Use LINE1's join field only if the line has that many fields.
   NOTE(review): the fallback for a missing field is not visible here.  */
321 if (join_field_1 < line1->nfields)
323 beg1 = line1->fields[join_field_1].beg;
324 len1 = line1->fields[join_field_1].len;
/* Same lookup for LINE2, using the second file's join field index.  */
332 if (join_field_2 < line2->nfields)
334 beg2 = line2->fields[join_field_2].beg;
335 len2 = line2->fields[join_field_2].len;
/* Presumably the case where the first field is empty: equal only when the
   second is empty too, otherwise the first sorts earlier.  The guarding
   condition is not visible -- confirm.  */
344 return len2 == 0 ? 0 : -1;
350 /* FIXME: ignore_case does not work with NLS (in particular,
351 with multibyte chars). */
/* Case-insensitive comparison limited to the shorter field's length.  */
352 diff = memcasecmp (beg1, beg2, MIN (len1, len2));
/* xmemcoll receives both fields with their full lengths.
   NOTE(review): presumably taken only when hard_LC_COLLATE is set; the
   guard is not visible in this listing.  */
357 return xmemcoll (beg1, len1, beg2, len2);
/* Byte-wise comparison limited to the shorter field's length.  */
358 diff = memcmp (beg1, beg2, MIN (len1, len2));
/* Decide on length alone: yields -1, 0 or 1.  Presumably reached only
   when the common prefix compared equal -- confirm.  */
363 return len1 < len2 ? -1 : len1 != len2;
366 /* Print field N of LINE if it exists and is nonempty, otherwise
367 `empty_filler' if it is nonempty. */
370 prfield (size_t n, struct line const *line)
/* N indexes an existing field of LINE.  */
374 if (n < line->nfields)
376 len = line->fields[n].len;
/* Write the field's bytes verbatim to standard output.
   NOTE(review): the test on LEN that pairs with the `else' below is not
   visible in this listing.  */
378 fwrite (line->fields[n].beg, 1, len, stdout);
/* Filler is written only when one has been configured (empty_filler is
   non-NULL).  */
379 else if (empty_filler)
380 fputs (empty_filler, stdout);
/* Second fallback with the same filler; presumably this is the branch for
   N lying beyond the last field -- confirm the nesting in the full
   source.  */
382 else if (empty_filler)
383 fputs (empty_filler, stdout);
386 /* Print the join of LINE1 and LINE2. */
/* Either argument may be &uni_blank, which callers pass in place of the
   missing partner when printing an unpaired line.  */
389 prjoin (struct line const *line1, struct line const *line2)
391 const struct outlist *outlist;
/* A negative `tab' selects a single space as the output separator;
   otherwise the tab character itself is used.  */
392 char output_separator = tab < 0 ? ' ' : tab;
/* The list of requested output fields starts at the head's `next'
   member.  */
394 outlist = outlist_head.next;
397 const struct outlist *o;
403 struct line const *line;
/* LINE1 being &uni_blank means the real data is in LINE2, so the join
   field index of file 2 is used; otherwise that of file 1.
   NOTE(review): the surrounding tests on O's file number are not visible
   in this listing.  */
407 if (line1 == &uni_blank)
410 field = join_field_2;
415 field = join_field_1;
/* File number 1 selects LINE1, anything else LINE2.  */
420 line = (o->file == 1 ? line1 : line2);
423 prfield (field, line);
427 putchar (output_separator);
/* NOTE(review): T is presumably a temporary used to swap LINE1 and LINE2
   so that an unpaired file-2 line is printed through the LINE1 path; the
   swap itself is not visible here.  */
435 if (line1 == &uni_blank)
437 struct line const *t;
/* Default layout: the join field first, then the fields of each line on
   either side of its join field, as the loop bounds below show.  Only the
   separator output is visible inside the loops; the field output is
   presumably on lines not shown.  */
442 prfield (join_field_1, line1);
443 for (i = 0; i < join_field_1 && i < line1->nfields; ++i)
445 putchar (output_separator);
448 for (i = join_field_1 + 1; i < line1->nfields; ++i)
450 putchar (output_separator);
454 for (i = 0; i < join_field_2 && i < line2->nfields; ++i)
456 putchar (output_separator);
459 for (i = join_field_2 + 1; i < line2->nfields; ++i)
461 putchar (output_separator);
468 /* Print the join of the files in FP1 and FP2. */
471 join (FILE *fp1, FILE *fp2)
/* Each sequence collects the lines of one file that are currently under
   consideration.  */
473 struct seq seq1, seq2;
478 /* Read the first line of each file. */
/* Merge loop: runs while both sequences hold at least one line.  */
484 while (seq1.count && seq2.count)
/* Compare the join fields of the two head lines.  */
487 diff = keycmp (&seq1.lines[0], &seq2.lines[0]);
/* Presumably the branch where file 1's head sorts first (the test on DIFF
   is not visible): it has no partner, so it is printed against
   &uni_blank if requested and then released.  */
490 if (print_unpairables_1)
491 prjoin (&seq1.lines[0], &uni_blank);
492 freeline (&seq1.lines[0]);
/* Mirror case for an unpaired head line of file 2; here &uni_blank takes
   the first position.  */
499 if (print_unpairables_2)
500 prjoin (&uni_blank, &seq2.lines[0]);
501 freeline (&seq2.lines[0]);
507 /* Keep reading lines from file1 as long as they continue to
508 match the current line from file2. */
511 if (!getseq (fp1, &seq1))
/* The newest line of SEQ1 is compared with the head of SEQ2; the loop
   repeats while keycmp reports equality (0).  */
517 while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]));
519 /* Keep reading lines from file2 as long as they continue to
520 match the current line from file1. */
523 if (!getseq (fp2, &seq2))
529 while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]));
/* Print every combination of the collected lines.  The `count - 1' bounds
   leave out the final element of each sequence, which is the line that
   ended the reading loops above.
   NOTE(review): how end of file adjusts the counts is not visible in this
   listing.  */
533 for (i = 0; i < seq1.count - 1; ++i)
536 for (j = 0; j < seq2.count - 1; ++j)
537 prjoin (&seq1.lines[i], &seq2.lines[j]);
/* Release the file-1 lines that were just joined ...  */
541 for (i = 0; i < seq1.count - 1; ++i)
542 freeline (&seq1.lines[i]);
/* ... and carry the last-read line over as the new head of SEQ1.  */
545 seq1.lines[0] = seq1.lines[seq1.count - 1];
/* Same clean-up and carry-over for SEQ2.  */
551 for (i = 0; i < seq2.count - 1; ++i)
552 freeline (&seq2.lines[i]);
555 seq2.lines[0] = seq2.lines[seq2.count - 1];
/* Left-over input from file 1: when unpairable output was requested,
   print the pending head line and then every remaining line of FP1, each
   paired with &uni_blank.  */
562 if (print_unpairables_1 && seq1.count)
564 prjoin (&seq1.lines[0], &uni_blank);
565 freeline (&seq1.lines[0]);
566 while (get_line (fp1, &line))
568 prjoin (&line, &uni_blank);
/* Same for left-over input from file 2, with &uni_blank in the first
   position.  */
573 if (print_unpairables_2 && seq2.count)
575 prjoin (&uni_blank, &seq2.lines[0]);
576 freeline (&seq2.lines[0]);
577 while (get_line (fp2, &line))
579 prjoin (&uni_blank, &line);
588 /* Add a field spec for field FIELD of file FILE to `outlist'. */
591 add_field (int file, size_t field)
/* FILE must be 0, 1 or 2, and FILE 0 (the join field) is only valid
   together with FIELD 0.  */
595 assert (file == 0 || file == 1 || file == 2);
596 assert (file != 0 || field == 0);
/* NOTE(review): the result of xmalloc is used without a NULL check;
   presumably xmalloc does not return on allocation failure -- confirm.
   The initialization of the new element's members is not visible in this
   listing.  */
598 o = xmalloc (sizeof *o);
603 /* Add to the end of the list so the fields are in the right order. */
/* NOTE(review): the matching update of outlist_end is not visible
   here.  */
604 outlist_end->next = o;
608 /* Convert a string of decimal digits, STR (the 1-based join field number),
609 to an integral value. Upon successful conversion, return one less
610 (the zero-based field number). If it cannot be converted, give a
611 diagnostic and exit. */
614 string_to_join_field (char const *str)
617 unsigned long int val;
/* Parse STR in base 10.  The empty last argument presumably means that no
   multiplier suffixes are accepted -- TODO confirm against xstrtoul's
   documentation.  */
619 strtol_error s_err = xstrtoul (str, NULL, 10, &val, "");
/* Two ways of being too large get the same diagnostic: xstrtoul itself
   reporting overflow, or a successfully parsed value above SIZE_MAX.  */
620 if (s_err == LONGINT_OVERFLOW || (s_err == LONGINT_OK && SIZE_MAX < val))
622 error (EXIT_FAILURE, 0,
623 _("value %s is so large that it is not representable"),
/* Any other parse failure is invalid, and so is 0 because field numbers
   are 1-based.  */
627 if (s_err != LONGINT_OK || val == 0)
628 error (EXIT_FAILURE, 0, _("invalid field number: %s"), quote (str));
635 /* Convert a single field specifier string, S, to a *FILE_INDEX, *FIELD_INDEX
636 pair. In S, the field index string is 1-based; *FIELD_INDEX is zero-based.
637 If S is valid, return true. Otherwise, give a diagnostic and exit. */
640 decode_field_spec (const char *s, int *file_index, size_t *field_index)
642 /* The first character must be 0, 1, or 2. */
648 /* `0' must be all alone -- no `.FIELD'. */
649 error (EXIT_FAILURE, 0, _("invalid field specifier: %s"), quote (s));
/* NOTE(review): the test that triggers this second diagnostic is not
   visible; presumably it checks the separator between the file digit and
   the field number.  */
658 error (EXIT_FAILURE, 0, _("invalid field specifier: %s"), quote (s));
/* Turn the leading digit character into its numeric value.  */
659 *file_index = s[0] - '0';
/* The field number text starts two characters in, after the file digit
   and one more character.  */
660 *field_index = string_to_join_field (s + 2);
664 error (EXIT_FAILURE, 0,
665 _("invalid file number in field spec: %s"), quote (s));
667 /* Tell gcc -W -Wall that we can't get beyond this point.
668 This avoids a warning (otherwise legit) that the caller's copies
669 of *file_index and *field_index might be used uninitialized. */
676 /* Add the comma or blank separated field spec(s) in STR to `outlist'. */
679 add_field_list (char *str)
/* Remember where the current item begins before searching for its
   end.  */
687 char const *spec_item = p;
/* Locate the next comma, space or tab, i.e. the end of this item; strpbrk
   yields NULL when none is left.
   NOTE(review): the handling of the delimiter found (and of NULL) is not
   visible in this listing.  */
689 p = strpbrk (p, ", \t");
/* decode_field_spec exits with a diagnostic on an invalid item (see its
   header comment), so both indices are set when add_field is reached.  */
692 decode_field_spec (spec_item, &file_index, &field_index);
693 add_field (file_index, field_index);
698 /* Set the join field *VAR to VAL, but report an error if *VAR is set
699 more than once to incompatible values. */
702 set_join_field (size_t *var, size_t val)
/* SIZE_MAX is the "not yet determined" marker used for the join field
   variables; a conflict exists only if *VAR already holds a different
   real value.  */
704 if (*var != SIZE_MAX && *var != val)
/* Add 1 to turn the stored zero-based indices into the field numbers
   shown in the diagnostic.  */
706 unsigned long int var1 = *var + 1;
707 unsigned long int val1 = val + 1;
708 error (EXIT_FAILURE, 0, _("incompatible join fields %lu, %lu"),
714 /* Status of command-line arguments. */
718 /* This argument must be an operand, i.e., one of the files to be
722 /* This might be the argument of the preceding -j1 or -j2 option,
723 or it might be an operand. */
727 /* This might be the argument of the preceding -o option, or it might be
732 /* Add NAME to the array of input file NAMES with operand statuses
733 OPERAND_STATUS; currently there are NFILES names in the list. */
/* PREV_OPTC_STATUS and OPTC_STATUS point at the caller's record of how
   the preceding and the current argument were classified.  */
736 add_file_name (char *name, char *names[2],
737 int operand_status[2], int joption_count[2], int *nfiles,
738 int *prev_optc_status, int *optc_status)
/* OP0 is true when the first stored name is definitely an operand; used
   as an index it then selects the second slot, otherwise the first.
   NOTE(review): the condition under which this reinterpretation runs is
   not visible; presumably it applies when both slots are already
   filled.  */
744 bool op0 = (operand_status[0] == MUST_BE_OPERAND);
745 char *arg = names[op0];
746 switch (operand_status[op0])
/* The selected slot is a definite operand as well, so NAME is one operand
   too many.  */
748 case MUST_BE_OPERAND:
749 error (0, 0, _("extra operand %s"), quote (name));
750 usage (EXIT_FAILURE);
/* The stored name is consumed as the field number of an earlier -j1
   option.  */
752 case MIGHT_BE_J1_ARG:
754 set_join_field (&join_field_1, string_to_join_field (arg));
/* Likewise for an earlier -j2 option.  */
757 case MIGHT_BE_J2_ARG:
759 set_join_field (&join_field_2, string_to_join_field (arg));
/* The stored name is consumed as further field specs (presumably the case
   for MIGHT_BE_O_ARG; its label is not visible).  */
763 add_field_list (arg);
/* Move the second slot's status into the first.  */
768 operand_status[0] = operand_status[1];
/* The new name inherits its status from the classification of the
   preceding argument.  */
774 operand_status[n] = *prev_optc_status;
/* Carry the "might be an -o argument" status forward; presumably because
   an -o field list may span several arguments -- confirm.  */
777 if (*prev_optc_status == MIGHT_BE_O_ARG)
778 *optc_status = MIGHT_BE_O_ARG;
/* Program entry point: parse the options, work out the two input file
   names and the join fields, open both inputs and close them again with
   error checking.
   NOTE(review): several lines of this function, including the case labels
   of the option switch and the call that performs the join, are not
   visible in this listing.  */
782 main (int argc, char **argv)
785 int prev_optc_status = MUST_BE_OPERAND;
786 int operand_status[2];
787 int joption_count[2] = { 0, 0 };
/* Start-up: record the program name, set up the locale and the message
   catalog, and cache whether LC_COLLATE is a hard locale.  */
794 initialize_main (&argc, &argv);
795 program_name = argv[0];
796 setlocale (LC_ALL, "");
797 bindtextdomain (PACKAGE, LOCALEDIR);
798 textdomain (PACKAGE);
799 hard_LC_COLLATE = hard_locale (LC_COLLATE);
/* Register close_stdout to run at exit.  */
801 atexit (close_stdout);
/* Joined lines are printed by default.  */
803 print_pairables = true;
/* The leading `-' in the option string makes getopt_long hand back
   non-option arguments as option code 1, which the `case 1' branch below
   handles.  */
805 while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:",
/* Default classification for this argument; some branches below override
   it.  */
809 optc_status = MUST_BE_OPERAND;
814 print_pairables = false;
/* The file number argument must parse as exactly 1 or 2.  */
819 unsigned long int val;
820 if (xstrtoul (optarg, NULL, 10, &val, "") != LONGINT_OK
821 || (val != 1 && val != 2))
822 error (EXIT_FAILURE, 0,
823 _("invalid field number: %s"), quote (optarg));
825 print_unpairables_1 = true;
827 print_unpairables_2 = true;
/* A filler given more than once is accepted only if the strings are
   identical.  */
832 if (empty_filler && ! STREQ (empty_filler, optarg))
833 error (EXIT_FAILURE, 0,
834 _("conflicting empty-field replacement strings"));
835 empty_filler = optarg;
843 set_join_field (&join_field_1, string_to_join_field (optarg));
847 set_join_field (&join_field_2, string_to_join_field (optarg));
/* Detect an argument that is the single character 1 or 2 and sits
   directly after a two-character option prefix in the same argv word.  */
851 if ((optarg[0] == '1' || optarg[0] == '2') && !optarg[1]
852 && optarg == argv[optind - 1] + 2)
854 /* The argument was either "-j1" or "-j2". */
855 bool is_j2 = (optarg[0] == '2');
856 joption_count[is_j2]++;
/* Relies on MIGHT_BE_J2_ARG being the enumerator right after
   MIGHT_BE_J1_ARG -- TODO confirm in the enum declaration.  */
857 optc_status = MIGHT_BE_J1_ARG + is_j2;
/* Otherwise the same field number applies to both files.  */
861 set_join_field (&join_field_1, string_to_join_field (optarg));
862 set_join_field (&join_field_2, join_field_1);
867 add_field_list (optarg);
868 optc_status = MIGHT_BE_O_ARG;
/* The first byte of the argument is the candidate separator.
   NOTE(review): the conditions guarding the diagnostics below are not
   visible in this listing.  */
873 unsigned char newtab = optarg[0];
875 error (EXIT_FAILURE, 0, _("empty tab"));
878 if (STREQ (optarg, "\\0"))
881 error (EXIT_FAILURE, 0, _("multi-character tab %s"),
/* A separator chosen earlier must be the same one.  */
884 if (0 <= tab && tab != newtab)
885 error (EXIT_FAILURE, 0, _("incompatible tabs"));
890 case 1: /* Non-option argument. */
891 add_file_name (optarg, names, operand_status, joption_count,
892 &nfiles, &prev_optc_status, &optc_status);
895 case_GETOPT_HELP_CHAR;
897 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
900 usage (EXIT_FAILURE);
/* Remember this argument's classification for the next iteration.  */
903 prev_optc_status = optc_status;
906 /* Process any operands after "--". */
907 prev_optc_status = MUST_BE_OPERAND;
908 while (optind < argc)
909 add_file_name (argv[optind++], names, operand_status, joption_count,
910 &nfiles, &prev_optc_status, &optc_status);
/* NOTE(review): the tests that choose between the two "missing operand"
   diagnostics are not visible in this listing.  */
915 error (0, 0, _("missing operand"));
917 error (0, 0, _("missing operand after %s"), quote (argv[argc - 1]));
918 usage (EXIT_FAILURE);
921 /* If "-j1" was specified and it turns out not to have had an argument,
922 treat it as "-j 1". Likewise for -j2. */
923 for (i = 0; i < 2; i++)
924 if (joption_count[i] != 0)
926 set_join_field (&join_field_1, i);
927 set_join_field (&join_field_2, i);
/* Join fields still at SIZE_MAX were never set on the command line.
   NOTE(review): the default assignments are not visible here; the help
   text says the default join field is the first.  */
930 if (join_field_1 == SIZE_MAX)
932 if (join_field_2 == SIZE_MAX)
/* The name "-" selects standard input instead of opening a file.  */
935 fp1 = STREQ (names[0], "-") ? stdin : fopen (names[0], "r");
937 error (EXIT_FAILURE, errno, "%s", names[0]);
938 fp2 = STREQ (names[1], "-") ? stdin : fopen (names[1], "r");
940 error (EXIT_FAILURE, errno, "%s", names[1]);
/* NOTE(review): errno is passed to this diagnostic although no failing
   library call is visible right before it, so a stale errno text may be
   appended to the message; consider passing 0.  */
942 error (EXIT_FAILURE, errno, _("both files cannot be standard input"));
/* Close both inputs and report any failure, naming the file.  */
945 if (fclose (fp1) != 0)
946 error (EXIT_FAILURE, errno, "%s", names[0]);
947 if (fclose (fp2) != 0)
948 error (EXIT_FAILURE, errno, "%s", names[1]);