1 /* join - join lines of two files on a common field
2 Copyright (C) 91, 1995-2001 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 Written by Mike Haertel, mike@gnu.ai.mit.edu. */
24 #include <sys/types.h>
30 #include "hard-locale.h"
31 #include "linebuffer.h"
32 #include "memcasecmp.h"
36 /* The official name of this program (e.g., no `g' prefix). */
37 #define PROGRAM_NAME "join"
39 #define AUTHORS "Mike Haertel"
41 #define join system_join
43 /* Undefine, to avoid warning about redefinition on some systems. */
46 #define min(A, B) ((A) < (B) ? (A) : (B))
47 #define max(A, B) ((A) > (B) ? (A) : (B))
49 /* An element of the list identifying which fields to print for each
53 /* File number: 0, 1, or 2. 0 means use the join field.
54 1 means use the first file argument, 2 the second. */
57 /* Field index (zero-based), specified only when FILE is 1 or 2. */
63 /* A field of a line. */
66 const unsigned char *beg; /* First character in field. */
67 size_t len; /* The length of the field. */
70 /* A line read from an input file. */
73 struct linebuffer buf; /* The line itself. */
74 int nfields; /* Number of elements in `fields'. */
75 int nfields_allocated; /* Number of elements allocated for `fields'. */
79 /* One or more consecutive lines read from a file that all have the
80 same join field value. */
83 int count; /* Elements used in `lines'. */
84 int alloc; /* Elements allocated in `lines'. */
88 /* The name this program was run with. */
92 /* Nonzero if the LC_COLLATE locale is hard. */
93 static int hard_LC_COLLATE;
96 /* If nonzero, print unpairable lines in file 1 or 2. */
97 static int print_unpairables_1, print_unpairables_2;
99 /* If nonzero, print pairable lines. */
100 static int print_pairables;
102 /* Empty output field filler. */
103 static char *empty_filler;
105 /* Field to join on. */
106 static int join_field_1, join_field_2;
108 /* List of fields to print. */
109 static struct outlist outlist_head;
111 /* Last element in `outlist', where a new element can be added. */
112 static struct outlist *outlist_end = &outlist_head;
114 /* Tab character separating fields; if this is NUL fields are separated
115 by any nonempty string of white space, otherwise by exactly one
117 static unsigned char tab;
119 /* When using getopt_long_only, no long option can start with
120 a character that is a short option. */
121 static struct option const longopts[] =
123 {"ignore-case", no_argument, NULL, 'i'},
124 {"j", required_argument, NULL, 'j'},
125 {"j1", required_argument, NULL, '1'},
126 {"j2", required_argument, NULL, '2'},
127 {GETOPT_HELP_OPTION_DECL},
128 {GETOPT_VERSION_OPTION_DECL},
132 /* Used to print non-joining lines. */
133 static struct line uni_blank;
135 /* If nonzero, ignore case when comparing join fields. */
136 static int ignore_case;
142 fprintf (stderr, _("Try `%s --help' for more information.\n"),
147 Usage: %s [OPTION]... FILE1 FILE2\n\
151 For each pair of input lines with identical join fields, write a line to\n\
152 standard output. The default join field is the first, delimited\n\
153 by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\
155 -a SIDE print unpairable lines coming from file SIDE\n\
156 -e EMPTY replace missing input fields with EMPTY\n\
159 -i, --ignore-case ignore differences in case when comparing fields\n\
160 -j FIELD (obsolescent) equivalent to `-1 FIELD -2 FIELD'\n\
161 -j1 FIELD (obsolescent) equivalent to `-1 FIELD'\n\
162 -j2 FIELD (obsolescent) equivalent to `-2 FIELD'\n\
163 -o FORMAT obey FORMAT while constructing output line\n\
164 -t CHAR use CHAR as input and output field separator\n\
167 -v SIDE like -a SIDE, but suppress joined output lines\n\
168 -1 FIELD join on this FIELD of file 1\n\
169 -2 FIELD join on this FIELD of file 2\n\
172 --help display this help and exit\n\
173 --version output version information and exit\n\
179 Unless -t CHAR is given, leading blanks separate fields and are ignored,\n\
180 else fields are separated by CHAR. Any FIELD is a field number counted\n\
181 from 1. FORMAT is one or more comma or blank separated specifications,\n\
182 each being `SIDE.FIELD' or `0'. Default FORMAT outputs the join field,\n\
183 the remaining fields from FILE1, the remaining fields from FILE2, all\n\
184 separated by CHAR.\n\
186 puts (_("\nReport bugs to <bug-textutils@gnu.org>."));
188 exit (status == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
192 ADD_FIELD (struct line *line, const unsigned char *field, size_t len)
194 if (line->nfields >= line->nfields_allocated)
196 line->nfields_allocated = (3 * line->nfields_allocated) / 2 + 1;
197 line->fields = (struct field *) xrealloc ((char *) line->fields,
198 (line->nfields_allocated
199 * sizeof (struct field)));
201 line->fields[line->nfields].beg = field;
202 line->fields[line->nfields].len = len;
206 /* Fill in the `fields' structure in LINE. */
209 xfields (struct line *line)
212 unsigned char *ptr0 = (unsigned char *) line->buf.buffer;
217 lim = ptr0 + line->buf.length - 1;
221 /* Skip leading blanks before the first field. */
222 while (ptr < lim && ISBLANK (*ptr))
226 for (i = 0; ptr < lim; ++i)
233 while (ptr < lim && *ptr != tab)
235 ADD_FIELD (line, beg, ptr - beg);
244 while (ptr < lim && !ISBLANK (*ptr))
246 ADD_FIELD (line, beg, ptr - beg);
247 while (ptr < lim && ISBLANK (*ptr))
252 if (ptr != ptr0 && ((!tab && ISBLANK (ptr[-1])) || ptr[-1] == tab))
254 /* Add one more (empty) field because the last character of the
255 line was a delimiter. */
256 ADD_FIELD (line, NULL, 0);
260 /* Read a line from FP into LINE and split it into fields.
261 Return 0 if EOF, 1 otherwise. */
264 get_line (FILE *fp, struct line *line)
266 initbuffer (&line->buf);
268 if (! readline (&line->buf, fp))
270 free (line->buf.buffer);
271 line->buf.buffer = NULL;
275 line->nfields_allocated = 0;
283 freeline (struct line *line)
285 free ((char *) line->fields);
286 free (line->buf.buffer);
287 line->buf.buffer = NULL;
291 initseq (struct seq *seq)
295 seq->lines = (struct line *) xmalloc (seq->alloc * sizeof (struct line));
298 /* Read a line from FP and add it to SEQ. Return 0 if EOF, 1 otherwise. */
301 getseq (FILE *fp, struct seq *seq)
303 if (seq->count == seq->alloc)
306 seq->lines = (struct line *)
307 xrealloc ((char *) seq->lines, seq->alloc * sizeof (struct line));
310 if (get_line (fp, &seq->lines[seq->count]))
319 delseq (struct seq *seq)
322 for (i = 0; i < seq->count; i++)
323 if (seq->lines[i].buf.buffer)
324 freeline (&seq->lines[i]);
325 free ((char *) seq->lines);
328 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
329 >0 if it compares greater; 0 if it compares equal. */
332 keycmp (struct line *line1, struct line *line2)
334 /* Start of field to compare in each file. */
335 const unsigned char *beg1, *beg2;
337 int len1, len2; /* Length of fields to compare. */
340 if (join_field_1 < line1->nfields)
342 beg1 = line1->fields[join_field_1].beg;
343 len1 = line1->fields[join_field_1].len;
351 if (join_field_2 < line2->nfields)
353 beg2 = line2->fields[join_field_2].beg;
354 len2 = line2->fields[join_field_2].len;
363 return len2 == 0 ? 0 : -1;
367 /* Use an if-statement here rather than a function variable to
368 avoid portability hassles of getting a non-conflicting declaration
372 /* FIXME: ignore_case does not work with NLS (in particular,
373 with multibyte chars). */
374 diff = memcasecmp (beg1, beg2, min (len1, len2));
380 return memcoll ((char *) beg1, len1, (char *) beg2, len2);
382 diff = memcmp (beg1, beg2, min (len1, len2));
390 /* Print field N of LINE if it exists and is nonempty, otherwise
391 `empty_filler' if it is nonempty. */
394 prfield (int n, struct line *line)
398 if (n < line->nfields)
400 len = line->fields[n].len;
402 fwrite (line->fields[n].beg, 1, len, stdout);
403 else if (empty_filler)
404 fputs (empty_filler, stdout);
406 else if (empty_filler)
407 fputs (empty_filler, stdout);
410 /* Print the join of LINE1 and LINE2. */
413 prjoin (struct line *line1, struct line *line2)
415 const struct outlist *outlist;
417 outlist = outlist_head.next;
420 const struct outlist *o;
430 if (line1 == &uni_blank)
433 field = join_field_2;
438 field = join_field_1;
443 line = (o->file == 1 ? line1 : line2);
444 assert (o->field >= 0);
447 prfield (field, line);
451 putchar (tab ? tab : ' ');
459 if (line1 == &uni_blank)
466 prfield (join_field_1, line1);
467 for (i = 0; i < join_field_1 && i < line1->nfields; ++i)
469 putchar (tab ? tab : ' ');
472 for (i = join_field_1 + 1; i < line1->nfields; ++i)
474 putchar (tab ? tab : ' ');
478 for (i = 0; i < join_field_2 && i < line2->nfields; ++i)
480 putchar (tab ? tab : ' ');
483 for (i = join_field_2 + 1; i < line2->nfields; ++i)
485 putchar (tab ? tab : ' ');
492 /* Print the join of the files in FP1 and FP2. */
495 join (FILE *fp1, FILE *fp2)
497 struct seq seq1, seq2;
499 int diff, i, j, eof1, eof2;
501 /* Read the first line of each file. */
507 while (seq1.count && seq2.count)
509 diff = keycmp (&seq1.lines[0], &seq2.lines[0]);
512 if (print_unpairables_1)
513 prjoin (&seq1.lines[0], &uni_blank);
514 freeline (&seq1.lines[0]);
521 if (print_unpairables_2)
522 prjoin (&uni_blank, &seq2.lines[0]);
523 freeline (&seq2.lines[0]);
529 /* Keep reading lines from file1 as long as they continue to
530 match the current line from file2. */
533 if (!getseq (fp1, &seq1))
539 while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]));
541 /* Keep reading lines from file2 as long as they continue to
542 match the current line from file1. */
545 if (!getseq (fp2, &seq2))
551 while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]));
555 for (i = 0; i < seq1.count - 1; ++i)
556 for (j = 0; j < seq2.count - 1; ++j)
557 prjoin (&seq1.lines[i], &seq2.lines[j]);
560 for (i = 0; i < seq1.count - 1; ++i)
561 freeline (&seq1.lines[i]);
564 seq1.lines[0] = seq1.lines[seq1.count - 1];
570 for (i = 0; i < seq2.count - 1; ++i)
571 freeline (&seq2.lines[i]);
574 seq2.lines[0] = seq2.lines[seq2.count - 1];
581 if (print_unpairables_1 && seq1.count)
583 prjoin (&seq1.lines[0], &uni_blank);
584 freeline (&seq1.lines[0]);
585 while (get_line (fp1, &line))
587 prjoin (&line, &uni_blank);
592 if (print_unpairables_2 && seq2.count)
594 prjoin (&uni_blank, &seq2.lines[0]);
595 freeline (&seq2.lines[0]);
596 while (get_line (fp2, &line))
598 prjoin (&uni_blank, &line);
607 /* Add a field spec for field FIELD of file FILE to `outlist'. */
610 add_field (int file, int field)
614 assert (file == 0 || file == 1 || file == 2);
615 assert (file == 0 ? field < 0 : field >= 0);
617 o = (struct outlist *) xmalloc (sizeof (struct outlist));
622 /* Add to the end of the list so the fields are in the right order. */
623 outlist_end->next = o;
627 /* Convert a single field specifier string, S, to a *FILE_INDEX, *FIELD_INDEX
628 pair. In S, the field index string is 1-based; *FIELD_INDEX is zero-based.
629 If S is valid, return zero. Otherwise, give a diagnostic, don't update
630 *FILE_INDEX or *FIELD_INDEX, and return nonzero. */
633 decode_field_spec (const char *s, int *file_index, int *field_index)
637 /* The first character must be 0, 1, or 2. */
644 /* Give *field_index an invalid value. */
650 /* `0' must be all alone -- no `.FIELD'. */
651 error (0, 0, _("invalid field specifier: `%s'"), s);
657 if (s[1] == '.' && s[2] != '\0')
662 s_err = xstrtol (s + 2, NULL, 10, &tmp_long, "");
663 if (s_err != LONGINT_OK || tmp_long <= 0 || tmp_long > INT_MAX)
665 error (0, 0, _("invalid field number: `%s'"), s + 2);
669 *file_index = s[0] - '0';
670 /* Convert to a zero-based index. */
671 *field_index = (int) tmp_long - 1;
678 error (0, 0, _("invalid file number in field spec: `%s'"), s);
684 /* Add the comma- or blank-separated field spec(s) in C_STR to `outlist'.
685 Return nonzero to indicate failure. */
688 add_field_list (const char *c_str)
692 /* Make a writable copy of c_str. */
693 str = (char *) alloca (strlen (c_str) + 1);
700 int file_index, field_index;
703 p = strpbrk (p, ", \t");
706 invalid = decode_field_spec (spec_item, &file_index, &field_index);
709 add_field (file_index, field_index);
710 uni_blank.nfields = max (uni_blank.nfields, field_index);
716 /* Create a blank line with COUNT fields separated by tabs. */
719 make_blank (struct line *blank, int count)
722 unsigned char *buffer;
723 struct field *fields;
724 blank->nfields = count;
725 blank->buf.size = blank->buf.length = count + 1;
726 blank->buf.buffer = xmalloc (blank->buf.size);
727 buffer = (unsigned char *) blank->buf.buffer;
728 blank->fields = fields =
729 (struct field *) xmalloc (sizeof (struct field) * count);
730 for (i = 0; i < count; i++)
733 fields[i].beg = &buffer[i];
740 main (int argc, char **argv)
744 int optc, prev_optc = 0, nfiles;
746 program_name = argv[0];
747 setlocale (LC_ALL, "");
748 bindtextdomain (PACKAGE, LOCALEDIR);
749 textdomain (PACKAGE);
751 atexit (close_stdout);
754 hard_LC_COLLATE = hard_locale (LC_COLLATE);
757 /* Initialize this before parsing options. In parsing options,
758 it may be increased. */
759 uni_blank.nfields = 1;
764 while ((optc = getopt_long_only (argc, argv, "-a:e:i1:2:o:t:v:", longopts,
779 if (xstrtol (optarg, NULL, 10, &val, "") != LONGINT_OK
780 || (val != 1 && val != 2))
781 error (EXIT_FAILURE, 0, _("invalid field number: `%s'"), optarg);
783 print_unpairables_1 = 1;
785 print_unpairables_2 = 1;
789 empty_filler = optarg;
797 if (xstrtol (optarg, NULL, 10, &val, "") != LONGINT_OK
798 || val <= 0 || val > INT_MAX)
800 error (EXIT_FAILURE, 0,
801 _("invalid field number for file 1: `%s'"), optarg);
803 join_field_1 = (int) val - 1;
807 if (xstrtol (optarg, NULL, 10, &val, "") != LONGINT_OK
808 || val <= 0 || val > INT_MAX)
809 error (EXIT_FAILURE, 0,
810 _("invalid field number for file 2: `%s'"), optarg);
811 join_field_2 = (int) val - 1;
815 if (xstrtol (optarg, NULL, 10, &val, "") != LONGINT_OK
816 || val <= 0 || val > INT_MAX)
817 error (EXIT_FAILURE, 0, _("invalid field number: `%s'"), optarg);
818 join_field_1 = join_field_2 = (int) val - 1;
822 if (add_field_list (optarg))
830 case 1: /* Non-option argument. */
831 if (prev_optc == 'o' && optind <= argc - 2)
833 if (add_field_list (optarg))
836 /* Might be continuation of args to -o. */
837 continue; /* Don't change `prev_optc'. */
842 error (0, 0, _("too many non-option arguments"));
845 names[nfiles++] = optarg;
848 case_GETOPT_HELP_CHAR;
850 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
858 /* Now that we've seen the options, we can construct the blank line
860 make_blank (&uni_blank, uni_blank.nfields);
864 error (0, 0, _("too few non-option arguments"));
868 fp1 = STREQ (names[0], "-") ? stdin : fopen (names[0], "r");
870 error (EXIT_FAILURE, errno, "%s", names[0]);
871 fp2 = STREQ (names[1], "-") ? stdin : fopen (names[1], "r");
873 error (EXIT_FAILURE, errno, "%s", names[1]);
875 error (EXIT_FAILURE, errno, _("both files cannot be standard input"));
878 if (fp1 != stdin && fclose (fp1) == EOF)
879 error (EXIT_FAILURE, errno, "%s", names[0]);
880 if (fp2 != stdin && fclose (fp2) == EOF)
881 error (EXIT_FAILURE, errno, "%s", names[1]);
882 if ((fp1 == stdin || fp2 == stdin) && fclose (stdin) == EOF)
883 error (EXIT_FAILURE, errno, "-");