1 /* join - join lines of two files on a common field
2 Copyright (C) 91, 1995-2005 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 Written by Mike Haertel, mike@gnu.ai.mit.edu. */
24 #include <sys/types.h>
29 #include "hard-locale.h"
30 #include "linebuffer.h"
31 #include "memcasecmp.h"
33 #include "stdio-safer.h"
37 /* The official name of this program (e.g., no `g' prefix). */
38 #define PROGRAM_NAME "join"
/* Author credit; main hands it, together with PROGRAM_NAME, to
   case_GETOPT_VERSION_CHAR. */
40 #define AUTHORS "Mike Haertel"
/* From this point on the preprocessor rewrites the identifier join to
   system_join; that includes the join (FILE *, FILE *) function defined
   later in this file.
   NOTE(review): presumably this sidesteps a clash with a join symbol
   declared by some system header -- confirm. */
42 #define join system_join
44 /* An element of the list identifying which fields to print for each output line. */
48 /* File number: 0, 1, or 2. 0 means use the join field.
49 1 means use the first file argument, 2 the second. */
52 /* Field index (zero-based), specified only when FILE is 1 or 2. */
58 /* A field of a line.  A field is a pointer/length pair: xfields records
   pointers into the buffer of the owning line, and the users visible in
   this file (keycmp, prfield) always pass LEN explicitly instead of
   relying on a terminating NUL.
   NOTE(review): the enclosing struct lines are not part of this listing. */
61 char *beg; /* First character in field. */
62 size_t len; /* The length of the field. */
65 /* A line read from an input file, plus the bookkeeping for the fields
   it has been split into (see xfields and extract_field).
   NOTE(review): the declaration of the fields array itself is not part
   of this listing. */
68 struct linebuffer buf; /* The line itself. */
69 size_t nfields; /* Number of elements in `fields'. */
70 size_t nfields_allocated; /* Number of elements allocated for `fields';
   updated by x2nrealloc in extract_field. */
74 /* One or more consecutive lines read from a file that all have the
75 same join field value.  While join () is collecting a group, the last
   element can instead be the first line read whose key no longer
   matches; join () leaves it out of the pairing loops and carries it
   over as element 0 of the next group. */
78 size_t count; /* Elements used in `lines'. */
79 size_t alloc; /* Elements allocated in `lines'; updated by x2nrealloc
   in getseq. */
83 /* The name this program was run with (main stores argv[0] in
   program_name). */
86 /* True if the LC_COLLATE locale is hard; set in main from
   hard_locale (LC_COLLATE). */
87 static bool hard_LC_COLLATE;
89 /* If true, print unpairable lines in file 1 or 2.  Both start out
   false and are only ever set to true by the option handling in main. */
90 static bool print_unpairables_1, print_unpairables_2;
92 /* If true, print pairable lines.  main enables this before parsing
   options; one option branch in main clears it again. */
93 static bool print_pairables;
95 /* Empty output field filler: the string prfield writes in place of a
   missing or empty field.  While it is a null pointer (the initial
   state) prfield writes nothing for such fields. */
96 static char const *empty_filler;
98 /* Field to join on; SIZE_MAX means they haven't been determined yet.
   The values are zero-based (string_to_join_field subtracts one from
   the number the user typed). */
99 static size_t join_field_1 = SIZE_MAX;
100 static size_t join_field_2 = SIZE_MAX;
102 /* List of fields to print.  outlist_head itself is only an anchor:
   prjoin starts reading at outlist_head.next. */
103 static struct outlist outlist_head;
105 /* Last element in `outlist', where a new element can be added. */
106 static struct outlist *outlist_end = &outlist_head;
108 /* Tab character separating fields. If negative, fields are separated
109 by any nonempty string of blanks, otherwise by exactly one
110 tab character whose value (when cast to unsigned char) equals TAB.
   On output prjoin writes a single space between fields when this is
   negative, and the tab character itself otherwise.
   NOTE(review): the declaration of tab is not part of this listing. */
/* Table of long options.  The one entry spelled out here, --ignore-case,
   takes no argument and is reported by getopt_long as the short option
   i.  The help and version entries come from macros that are not
   defined in the visible part of this file.
   NOTE(review): the terminating entry of the table and the place where
   the table is passed to getopt_long are not part of this listing. */
113 static struct option const longopts[] =
115 {"ignore-case", no_argument, NULL, 'i'},
116 {GETOPT_HELP_OPTION_DECL},
117 {GETOPT_VERSION_OPTION_DECL},
121 /* Used to print non-joining lines: join passes the address of this
   object to prjoin in place of the missing partner line, and prjoin
   recognizes it by comparing addresses.  Having static storage it
   starts out zeroed, and nothing visible in this file writes to it, so
   it has no fields. */
122 static struct line uni_blank;
124 /* If true, ignore case when comparing join fields. */
125 static bool ignore_case;
/* Body of the usage routine.  Its header line is not part of this
   listing; other functions in this file call usage (EXIT_FAILURE).
   When STATUS is not EXIT_SUCCESS, a one-line hint pointing at --help
   is written to stderr.  The remaining visible lines are the help text
   (synopsis, option summary, the field and FORMAT rules, and the
   requirement that both inputs be sorted on the join fields) together
   with the calls that write the shared help/version option
   descriptions and the bug-report address to stdout.
   NOTE(review): the statements that print the help text itself and
   that end the function are not visible here -- confirm against the
   complete file. */
130 if (status != EXIT_SUCCESS)
131 fprintf (stderr, _("Try `%s --help' for more information.\n"),
136 Usage: %s [OPTION]... FILE1 FILE2\n\
140 For each pair of input lines with identical join fields, write a line to\n\
141 standard output. The default join field is the first, delimited\n\
142 by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\
144 -a FILENUM print unpairable lines coming from file FILENUM, where\n\
145 FILENUM is 1 or 2, corresponding to FILE1 or FILE2\n\
146 -e EMPTY replace missing input fields with EMPTY\n\
149 -i, --ignore-case ignore differences in case when comparing fields\n\
150 -j FIELD equivalent to `-1 FIELD -2 FIELD'\n\
151 -o FORMAT obey FORMAT while constructing output line\n\
152 -t CHAR use CHAR as input and output field separator\n\
155 -v FILENUM like -a FILENUM, but suppress joined output lines\n\
156 -1 FIELD join on this FIELD of file 1\n\
157 -2 FIELD join on this FIELD of file 2\n\
159 fputs (HELP_OPTION_DESCRIPTION, stdout);
160 fputs (VERSION_OPTION_DESCRIPTION, stdout);
163 Unless -t CHAR is given, leading blanks separate fields and are ignored,\n\
164 else fields are separated by CHAR. Any FIELD is a field number counted\n\
165 from 1. FORMAT is one or more comma or blank separated specifications,\n\
166 each being `FILENUM.FIELD' or `0'. Default FORMAT outputs the join field,\n\
167 the remaining fields from FILE1, the remaining fields from FILE2, all\n\
168 separated by CHAR.\n\
170 Important: FILE1 and FILE2 must be sorted on the join fields.\n\
172 printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
177 /* Return true if C is a blank (a default input field separator).
   The classification is delegated to the ISBLANK macro, which is not
   defined in the visible part of this file; comparing its result with
   zero reduces it to a plain true/false value. */
180 is_blank (unsigned char c)
182 return ISBLANK (c) != 0;
185 /* Record a field in LINE, with location FIELD and size LEN.
   FIELD is stored as given (the text is not copied), so the memory it
   points to has to outlive the recorded field. */
188 extract_field (struct line *line, char *field, size_t len)
/* No free slot left: let x2nrealloc enlarge the array and update the
   allocated count. */
190 if (line->nfields >= line->nfields_allocated)
192 line->fields = x2nrealloc (line->fields, &line->nfields_allocated,
193 sizeof (struct field));
/* Fill the first unused slot.
   NOTE(review): the statement that advances line->nfields afterwards is
   not part of this listing -- confirm. */
195 line->fields[line->nfields].beg = field;
196 line->fields[line->nfields].len = len;
200 /* Fill in the `fields' structure in LINE.
   The buffer of LINE is split in place: every field is handed to
   extract_field as a pointer into the buffer plus a length.
   NOTE(review): the tests that choose between the two splitting modes
   below, and the early returns, are not part of this listing. */
203 xfields (struct line *line)
205 char *ptr = line->buf.buffer;
/* LIM stops one byte short of the end of the buffer.
   NOTE(review): presumably that last byte is the newline kept by
   readlinebuffer -- confirm. */
206 char const *lim = ptr + line->buf.length - 1;
/* Splitting on the separator byte TAB: a field ends at every
   occurrence, so two adjacent separators produce an empty field. */
214 for (; (sep = memchr (ptr, tab, lim - ptr)) != NULL; ptr = sep + 1)
215 extract_field (line, ptr, sep - ptr);
219 /* Skip leading blanks before the first field. */
220 while (is_blank (*ptr))
/* Splitting on blanks: advance SEP to the first blank after PTR (or to
   LIM) and record the text in between ... */
227 for (sep = ptr + 1; sep != lim && ! is_blank (*sep); sep++)
229 extract_field (line, ptr, sep - ptr);
/* ... then step over the whole run of blanks that follows. */
232 for (ptr = sep + 1; ptr != lim && is_blank (*ptr); ptr++)
/* Record the text from PTR up to LIM as a field. */
238 extract_field (line, ptr, lim - ptr);
241 /* Read a line from FP into LINE and split it into fields.
242 Return true if successful. */
245 get_line (FILE *fp, struct line *line)
/* LINE is treated as uninitialized storage: its buffer is set up from
   scratch on every call. */
247 initbuffer (&line->buf);
249 if (! readlinebuffer (&line->buf, fp))
/* Nothing could be read.
   NOTE(review): the condition guarding the error call below is not part
   of this listing; presumably it fires only for a stream error and not
   for a plain end of file -- confirm. */
252 error (EXIT_FAILURE, errno, _("read error"));
/* Release the buffer and clear the pointer; delseq treats a null buffer
   pointer as a slot that holds no line. */
253 free (line->buf.buffer);
254 line->buf.buffer = NULL;
/* A line was read: start with no field storage allocated. */
258 line->nfields_allocated = 0;
/* Release storage owned by LINE.  Visible here: the line buffer is
   freed and its pointer cleared, which is the marker delseq checks
   before calling this function.
   NOTE(review): whether the fields array is released as well cannot be
   seen in this listing -- confirm. */
266 freeline (struct line *line)
269 free (line->buf.buffer);
270 line->buf.buffer = NULL;
/* NOTE(review): only the header line of this function is part of the
   listing.  getseq compares SEQ->count with SEQ->alloc and hands
   SEQ->lines to x2nrealloc, so presumably this function clears those
   three members before first use -- confirm. */
274 initseq (struct seq *seq)
281 /* Read a line from FP and add it to SEQ. Return true if successful.
   The line is read directly into the first unused slot,
   SEQ->lines[SEQ->count]. */
284 getseq (FILE *fp, struct seq *seq)
/* Every allocated slot is in use: have x2nrealloc enlarge the array and
   update SEQ->alloc. */
286 if (seq->count == seq->alloc)
287 seq->lines = x2nrealloc (seq->lines, &seq->alloc, sizeof *seq->lines);
/* NOTE(review): what happens to SEQ->count on success is not part of
   this listing -- confirm. */
289 if (get_line (fp, &seq->lines[seq->count]))
/* Free the lines held in SEQ.  Slots whose buffer pointer is null
   (cleared by freeline, or by get_line when it read nothing) are
   skipped.
   NOTE(review): releasing the SEQ->lines array itself is not among the
   visible lines -- confirm. */
298 delseq (struct seq *seq)
301 for (i = 0; i < seq->count; i++)
302 if (seq->lines[i].buf.buffer)
303 freeline (&seq->lines[i]);
307 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
308 >0 if it compares greater; 0 if it compares equal.
309 Report an error and exit if the comparison fails. */
312 keycmp (struct line const *line1, struct line const *line2)
314 /* Start of field to compare in each file. */
319 size_t len2; /* Length of fields to compare. */
/* Use the join field of LINE1 when the line has that many fields.
   NOTE(review): the alternative branch is not part of this listing;
   presumably a line without the field gets an empty key -- confirm. */
322 if (join_field_1 < line1->nfields)
324 beg1 = line1->fields[join_field_1].beg;
325 len1 = line1->fields[join_field_1].len;
/* Same for LINE2. */
333 if (join_field_2 < line2->nfields)
335 beg2 = line2->fields[join_field_2].beg;
336 len2 = line2->fields[join_field_2].len;
/* NOTE(review): the guard in front of this return is not visible;
   presumably it is reached when the first key is empty, making it equal
   to an empty second key and smaller than any other -- confirm. */
345 return len2 == 0 ? 0 : -1;
351 /* FIXME: ignore_case does not work with NLS (in particular,
352 with multibyte chars). */
/* Case-insensitive path: compare only the common prefix here. */
353 diff = memcasecmp (beg1, beg2, MIN (len1, len2));
/* NOTE(review): presumably taken when hard_LC_COLLATE is set; the test
   itself is not part of this listing.  xmemcoll receives both full
   lengths. */
358 return xmemcoll (beg1, len1, beg2, len2);
/* Plain byte comparison of the common prefix. */
359 diff = memcmp (beg1, beg2, MIN (len1, len2));
/* Decide by length: the shorter key sorts first; otherwise the result
   is 1 when the lengths differ and 0 when they are the same. */
364 return len1 < len2 ? -1 : len1 != len2;
367 /* Print field N of LINE if it exists and is nonempty, otherwise
368 `empty_filler' if it is nonempty.
   Output goes to stdout.  The filler test in the code is for a non-null
   pointer, so nothing is written for such fields while no filler has
   been set. */
371 prfield (size_t n, struct line const *line)
375 if (n < line->nfields)
377 len = line->fields[n].len;
/* The field text is written with an explicit length.
   NOTE(review): the test that selects between this write and the filler
   below is not part of this listing; per the header comment it is
   whether the field is nonempty. */
379 fwrite (line->fields[n].beg, 1, len, stdout);
380 else if (empty_filler)
381 fputs (empty_filler, stdout);
/* LINE has no field N at all: same substitution. */
383 else if (empty_filler)
384 fputs (empty_filler, stdout);
387 /* Print the join of LINE1 and LINE2.
   Either argument may be the address of uni_blank, which stands for a
   missing partner when an unpairable line is printed.  Fields are
   written with prfield and separated with putchar.
   NOTE(review): the loop headers, the branch that chooses between an
   explicit field list and the default layout, and the end-of-line
   output are not part of this listing. */
390 prjoin (struct line const *line1, struct line const *line2)
392 const struct outlist *outlist;
/* Between output fields: one space when no separator character was
   chosen (tab negative), else that character. */
393 char output_separator = tab < 0 ? ' ' : tab;
/* outlist_head is only an anchor; the real entries start at its next
   pointer. */
395 outlist = outlist_head.next;
398 const struct outlist *o;
404 struct line const *line;
/* Pick the join field from whichever side is a real line: from the
   second file when LINE1 is the placeholder, else from the first.
   NOTE(review): presumably this is the handling of a list entry with
   file number 0 -- the test is not visible. */
408 if (line1 == &uni_blank)
411 field = join_field_2;
416 field = join_field_1;
/* Entries naming file 1 read from LINE1, all others from LINE2. */
421 line = (o->file == 1 ? line1 : line2);
424 prfield (field, line);
428 putchar (output_separator);
/* Default layout.
   NOTE(review): the statements inside this branch are not visible
   beyond the temporary T; presumably the two lines (and join fields)
   are swapped so that the real line comes first -- confirm. */
436 if (line1 == &uni_blank)
438 struct line const *t;
/* Join field first, then the other fields of LINE1 on either side of
   its join field, then the same for LINE2. */
443 prfield (join_field_1, line1);
444 for (i = 0; i < join_field_1 && i < line1->nfields; ++i)
446 putchar (output_separator);
449 for (i = join_field_1 + 1; i < line1->nfields; ++i)
451 putchar (output_separator);
455 for (i = 0; i < join_field_2 && i < line2->nfields; ++i)
457 putchar (output_separator);
460 for (i = join_field_2 + 1; i < line2->nfields; ++i)
462 putchar (output_separator);
469 /* Print the join of the files in FP1 and FP2.
   Both inputs are consumed front to back; the current line of each is
   compared with keycmp.  Because of the join macro defined near the top
   of this file, the function is actually compiled under the name
   system_join.
   NOTE(review): many control-flow lines (the tests on DIFF, the loop
   openers, the end-of-file bookkeeping) are not part of this listing;
   the comments below describe only what the visible lines do. */
472 join (FILE *fp1, FILE *fp2)
474 struct seq seq1, seq2;
479 /* Read the first line of each file. */
/* Continue while both sequences still hold a line. */
485 while (seq1.count && seq2.count)
488 diff = keycmp (&seq1.lines[0], &seq2.lines[0]);
/* The current line of file 1 is treated as having no partner: print it
   if unpairable lines of file 1 were requested, then release it. */
491 if (print_unpairables_1)
492 prjoin (&seq1.lines[0], &uni_blank);
493 freeline (&seq1.lines[0]);
/* Mirror image for the current line of file 2. */
500 if (print_unpairables_2)
501 prjoin (&uni_blank, &seq2.lines[0]);
502 freeline (&seq2.lines[0]);
508 /* Keep reading lines from file1 as long as they continue to
509 match the current line from file2. */
512 if (!getseq (fp1, &seq1))
/* The test looks at the most recently added line, so when it ends the
   loop the last element of seq1 is the first line that did not match. */
518 while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]));
520 /* Keep reading lines from file2 as long as they continue to
521 match the current line from file1. */
524 if (!getseq (fp2, &seq2))
530 while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]));
/* Pair every line of the file-1 group with every line of the file-2
   group.  Both loops stop one short of count, which leaves the trailing
   element of each sequence out of the group. */
534 for (i = 0; i < seq1.count - 1; ++i)
537 for (j = 0; j < seq2.count - 1; ++j)
538 prjoin (&seq1.lines[i], &seq2.lines[j]);
/* The group from file 1 is done: release its lines and move the
   trailing element to the front as the start of the next round. */
542 for (i = 0; i < seq1.count - 1; ++i)
543 freeline (&seq1.lines[i]);
546 seq1.lines[0] = seq1.lines[seq1.count - 1];
/* Same for file 2. */
552 for (i = 0; i < seq2.count - 1; ++i)
553 freeline (&seq2.lines[i]);
556 seq2.lines[0] = seq2.lines[seq2.count - 1];
/* After the main loop: if unpairable lines of file 1 were requested and
   a line is still held, print it and then every line left in FP1. */
563 if (print_unpairables_1 && seq1.count)
565 prjoin (&seq1.lines[0], &uni_blank);
566 freeline (&seq1.lines[0]);
567 while (get_line (fp1, &line))
569 prjoin (&line, &uni_blank);
/* Likewise for file 2. */
574 if (print_unpairables_2 && seq2.count)
576 prjoin (&uni_blank, &seq2.lines[0]);
577 freeline (&seq2.lines[0]);
578 while (get_line (fp2, &line))
580 prjoin (&uni_blank, &line);
589 /* Add a field spec for field FIELD of file FILE to `outlist'.
   FILE is 0, 1 or 2; FIELD is zero-based.  The new element is obtained
   from xmalloc. */
592 add_field (int file, size_t field)
/* File number 0 stands for the join field and therefore has to come
   with field index 0. */
596 assert (file == 0 || file == 1 || file == 2);
597 assert (file != 0 || field == 0);
599 o = xmalloc (sizeof *o);
604 /* Add to the end of the list so the fields are in the right order. */
/* NOTE(review): filling in the new element and advancing outlist_end
   are not part of this listing -- confirm. */
605 outlist_end->next = o;
609 /* Convert a string of decimal digits, STR (the 1-based join field number),
610 to an integral value. Upon successful conversion, return one less
611 (the zero-based field number). If it cannot be converted, give a
612 diagnostic and exit. */
615 string_to_join_field (char const *str)
618 unsigned long int val;
/* Parse in base 10; the empty last argument means no suffix characters
   are accepted after the digits. */
620 strtol_error s_err = xstrtoul (str, NULL, 10, &val, "");
/* Values too large for the parser or for size_t get their own
   diagnostic. */
621 if (s_err == LONGINT_OVERFLOW || (s_err == LONGINT_OK && SIZE_MAX < val))
623 error (EXIT_FAILURE, 0,
624 _("value %s is so large that it is not representable"),
/* Any other parse failure, and the number 0 (field numbers start at 1),
   is rejected as an invalid field number. */
628 if (s_err != LONGINT_OK || val == 0)
629 error (EXIT_FAILURE, 0, _("invalid field number: %s"), quote (str));
636 /* Convert a single field specifier string, S, to a *FILE_INDEX, *FIELD_INDEX
637 pair. In S, the field index string is 1-based; *FIELD_INDEX is zero-based.
638 If S is valid, return true. Otherwise, give a diagnostic and exit.
   NOTE(review): the return type is not part of this listing and the one
   visible caller (add_field_list) ignores any result -- confirm that
   the sentence about returning true is still accurate. */
641 decode_field_spec (const char *s, int *file_index, size_t *field_index)
643 /* The first character must be 0, 1, or 2. */
649 /* `0' must be all alone -- no `.FIELD'. */
650 error (EXIT_FAILURE, 0, _("invalid field specifier: %s"), quote (s));
/* NOTE(review): the test guarding the next diagnostic is not visible;
   presumably it checks for the dot after the file digit -- confirm. */
659 error (EXIT_FAILURE, 0, _("invalid field specifier: %s"), quote (s));
/* The leading digit becomes the file number; the field number is parsed
   starting at the third character of S. */
660 *file_index = s[0] - '0';
661 *field_index = string_to_join_field (s + 2);
665 error (EXIT_FAILURE, 0,
666 _("invalid file number in field spec: %s"), quote (s));
671 /* Add the comma or blank separated field spec(s) in STR to `outlist'.
   Each item is decoded with decode_field_spec and appended with
   add_field, in the order given.
   NOTE(review): the loop around these statements is not part of this
   listing.  STR is not const, so presumably each separator found is
   overwritten to terminate the item in place -- confirm. */
674 add_field_list (char *str)
682 char const *spec_item = p;
/* Find the next separator: comma, space or tab. */
684 p = strpbrk (p, ", \t");
687 decode_field_spec (spec_item, &file_index, &field_index);
688 add_field (file_index, field_index);
693 /* Set the join field *VAR to VAL, but report an error if *VAR is set
694 more than once to incompatible values.
   SIZE_MAX in *VAR means that no value has been set yet; setting the
   same value again is accepted. */
697 set_join_field (size_t *var, size_t val)
699 if (*var != SIZE_MAX && *var != val)
/* Add one so that the message shows field numbers as the user wrote
   them (counted from 1); the stored values are zero-based. */
701 unsigned long int var1 = *var + 1;
702 unsigned long int val1 = val + 1;
703 error (EXIT_FAILURE, 0, _("incompatible join fields %lu, %lu"),
709 /* Status of command-line arguments. */
713 /* This argument must be an operand, i.e., one of the files to be joined. */
717 /* This might be the argument of the preceding -j1 or -j2 option,
718 or it might be an operand. */
722 /* This might be the argument of the preceding -o option, or it might be an operand. */
727 /* Add NAME to the array of input file NAMES with operand statuses
728 OPERAND_STATUS; currently there are NFILES names in the list.
   Only two names fit.  The visible switch deals with a name that was
   stored earlier: depending on its recorded status it is either a real
   operand (so NAME is one operand too many) or is now consumed as the
   argument of an earlier option.
   NOTE(review): the test that leads into the switch and the statements
   that store NAME and update *NFILES are not part of this listing. */
731 add_file_name (char *name, char *names[2],
732 int operand_status[2], int joption_count[2], int *nfiles,
733 int *prev_optc_status, int *optc_status)
/* OP0 is 1 when the first stored name is definitely an operand, so
   names[op0] is the second stored name in that case and the first one
   otherwise. */
739 bool op0 = (operand_status[0] == MUST_BE_OPERAND);
740 char *arg = names[op0];
741 switch (operand_status[op0])
/* Both stored names are definite operands: NAME is surplus. */
743 case MUST_BE_OPERAND:
744 error (0, 0, _("extra operand %s"), quote (name));
745 usage (EXIT_FAILURE);
/* The stored name is reinterpreted as the field number of the first
   file ... */
747 case MIGHT_BE_J1_ARG:
749 set_join_field (&join_field_1, string_to_join_field (arg));
/* ... or of the second file ... */
752 case MIGHT_BE_J2_ARG:
754 set_join_field (&join_field_2, string_to_join_field (arg));
/* ... or as a further output field list. */
758 add_field_list (arg);
763 operand_status[0] = operand_status[1];
/* The status recorded for NAME is that of the argument before it. */
769 operand_status[n] = *prev_optc_status;
/* After something that might have been a field-list argument, NAME
   might be one as well; pass that status on. */
772 if (*prev_optc_status == MIGHT_BE_O_ARG)
773 *optc_status = MIGHT_BE_O_ARG;
/* Program entry point: set up the locale and message catalog, parse
   the command line, open the two inputs and close them again, failing
   with a diagnostic on any error.
   NOTE(review): the case labels of the option switch, several
   declarations, the call that performs the join and the final return
   are not part of this listing; the comments below describe only the
   visible lines. */
777 main (int argc, char **argv)
780 int prev_optc_status = MUST_BE_OPERAND;
781 int operand_status[2];
782 int joption_count[2] = { 0, 0 };
789 initialize_main (&argc, &argv);
790 program_name = argv[0];
791 setlocale (LC_ALL, "");
792 bindtextdomain (PACKAGE, LOCALEDIR);
793 textdomain (PACKAGE);
794 hard_LC_COLLATE = hard_locale (LC_COLLATE);
/* Register close_stdout to run at exit. */
796 atexit (close_stdout);
/* Paired lines are printed unless an option turns that off below. */
798 print_pairables = true;
/* The leading dash in the option string asks GNU getopt to hand back
   each non-option argument in order, as option code 1. */
800 while ((optc = getopt_long (argc, argv, "-a:e:i1:2:j:o:t:v:",
804 optc_status = MUST_BE_OPERAND;
809 print_pairables = false;
/* The option argument has to be the file number 1 or 2. */
814 unsigned long int val;
815 if (xstrtoul (optarg, NULL, 10, &val, "") != LONGINT_OK
816 || (val != 1 && val != 2))
817 error (EXIT_FAILURE, 0,
818 _("invalid field number: %s"), quote (optarg));
820 print_unpairables_1 = true;
822 print_unpairables_2 = true;
/* A filler may be given more than once only with the same text. */
827 if (empty_filler && ! STREQ (empty_filler, optarg))
828 error (EXIT_FAILURE, 0,
829 _("conflicting empty-field replacement strings"));
830 empty_filler = optarg;
838 set_join_field (&join_field_1, string_to_join_field (optarg));
842 set_join_field (&join_field_2, string_to_join_field (optarg));
/* The last test checks that the single digit sits in the same argv
   element, directly after the two option characters. */
846 if ((optarg[0] == '1' || optarg[0] == '2') && !optarg[1]
847 && optarg == argv[optind - 1] + 2)
849 /* The argument was either "-j1" or "-j2". */
850 bool is_j2 = (optarg[0] == '2');
851 joption_count[is_j2]++;
/* NOTE(review): relies on MIGHT_BE_J2_ARG being exactly one more than
   MIGHT_BE_J1_ARG; the enum is not part of this listing -- confirm. */
852 optc_status = MIGHT_BE_J1_ARG + is_j2;
/* Otherwise the same field number is used for both files. */
856 set_join_field (&join_field_1, string_to_join_field (optarg));
857 set_join_field (&join_field_2, join_field_1);
862 add_field_list (optarg);
863 optc_status = MIGHT_BE_O_ARG;
/* Separator option: the candidate is the first byte of the argument.
   NOTE(review): the tests in front of the diagnostics below and the
   assignment to tab are not part of this listing. */
868 unsigned char newtab = optarg[0];
870 error (EXIT_FAILURE, 0, _("empty tab"));
873 if (STREQ (optarg, "\\0"))
876 error (EXIT_FAILURE, 0, _("multi-character tab `%s'"),
/* A separator chosen earlier cannot be replaced by a different one. */
879 if (0 <= tab && tab != newtab)
880 error (EXIT_FAILURE, 0, _("incompatible tabs"));
885 case 1: /* Non-option argument. */
886 add_file_name (optarg, names, operand_status, joption_count,
887 &nfiles, &prev_optc_status, &optc_status);
890 case_GETOPT_HELP_CHAR;
892 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
895 usage (EXIT_FAILURE);
/* Remember what this argument was, for add_file_name on the next one. */
898 prev_optc_status = optc_status;
901 /* Process any operands after "--". */
902 prev_optc_status = MUST_BE_OPERAND;
903 while (optind < argc)
904 add_file_name (argv[optind++], names, operand_status, joption_count,
905 &nfiles, &prev_optc_status, &optc_status);
910 error (0, 0, _("missing operand"));
912 error (0, 0, _("missing operand after %s"), quote (argv[argc - 1]));
913 usage (EXIT_FAILURE);
916 /* If "-j1" was specified and it turns out not to have had an argument,
917 treat it as "-j 1". Likewise for -j2. */
/* Index 0 counts the first spelling and index 1 the second; I is also
   the zero-based number of the field the spelling names. */
918 for (i = 0; i < 2; i++)
919 if (joption_count[i] != 0)
921 set_join_field (&join_field_1, i);
922 set_join_field (&join_field_2, i);
/* NOTE(review): the statements that give a still-unset join field its
   default are not part of this listing. */
925 if (join_field_1 == SIZE_MAX)
927 if (join_field_2 == SIZE_MAX)
/* A single dash as file name selects standard input. */
930 fp1 = STREQ (names[0], "-") ? stdin : fopen_safer (names[0], "r");
932 error (EXIT_FAILURE, errno, "%s", names[0]);
933 fp2 = STREQ (names[1], "-") ? stdin : fopen_safer (names[1], "r");
935 error (EXIT_FAILURE, errno, "%s", names[1]);
/* NOTE(review): errno is passed here although no failing call precedes
   this diagnostic in the visible lines, so error may append the text of
   an unrelated, stale errno value; 0 looks like the intended argument
   -- confirm. */
937 error (EXIT_FAILURE, errno, _("both files cannot be standard input"));
/* A failing fclose on either input is fatal as well. */
940 if (fclose (fp1) != 0)
941 error (EXIT_FAILURE, errno, "%s", names[0]);
942 if (fclose (fp2) != 0)
943 error (EXIT_FAILURE, errno, "%s", names[1]);