1 /* join - join lines of two files on a common field
2 Copyright (C) 1991, 1995 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2, or (at your option)
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
18 Written by Mike Haertel, mike@gnu.ai.mit.edu. */
23 #define alloca __builtin_alloca
24 #else /* not __GNUC__ */
27 #else /* not HAVE_ALLOCA_H */
33 #endif /* not HAVE_ALLOCA_H */
34 #endif /* not __GNUC__ */
36 /* Get isblank from GNU libc. */
42 #include <sys/types.h>
50 # define UINT_MAX ((unsigned int) ~(unsigned int) 0)
54 # define INT_MAX ((int) (UINT_MAX >> 1))
59 #include "long-options.h"
63 #define join system_join
68 /* Undefine, to avoid warning about redefinition on some systems. */
71 #define min(A, B) ((A) < (B) ? (A) : (B))
72 #define max(A, B) ((A) > (B) ? (A) : (B))
74 /* An element of the list identifying which fields to print for each
78 /* File number: 0, 1, or 2. 0 means use the join field.
79 1 means use the first file argument, 2 the second. */
82 /* Field index (zero-based), specified only when FILE is 1 or 2. */
88 /* A field of a line. */
91 const char *beg; /* First character in field. */
92 size_t len; /* The length of the field. */
95 /* A line read from an input file. Newlines are not stored. */
98 char *beg; /* First character in line. */
99 char *lim; /* Character after last character in line. */
100 int nfields; /* Number of elements in `fields'. */
101 int nfields_allocated; /* Number of elements allocated for `fields'. */
102 struct field *fields;
105 /* One or more consecutive lines read from a file that all have the
106 same join field value. */
109 int count; /* Elements used in `lines'. */
110 int alloc; /* Elements allocated in `lines'. */
114 /* The name this program was run with. */
117 /* If nonzero, print unpairable lines in file 1 or 2. */
118 static int print_unpairables_1, print_unpairables_2;
120 /* If nonzero, print pairable lines. */
121 static int print_pairables;
123 /* Empty output field filler. */
124 static char *empty_filler;
126 /* Zero-based index of the field to join on in file 1 and in file 2. */
127 static int join_field_1, join_field_2;
129 /* List of fields to print. */
130 static struct outlist outlist_head;
132 /* Last element in `outlist', where a new element can be added. */
133 static struct outlist *outlist_end = &outlist_head;
135 /* Tab character separating fields; if this is NUL fields are separated
136 by any nonempty string of white space, otherwise by exactly one
140 /* When using getopt_long_only, no long option can start with
141 a character that is a short option. */
142 static struct option const longopts[] =
144 {"j", required_argument, NULL, 'j'},
145 {"j1", required_argument, NULL, '1'},
146 {"j2", required_argument, NULL, '2'},
150 /* Blank line passed to prjoin in place of the missing side when printing an unpairable line. */
151 static struct line uni_blank;
157 fprintf (stderr, _("Try `%s --help' for more information.\n"),
162 Usage: %s [OPTION]... FILE1 FILE2\n\
166 For each pair of input lines with identical join fields, write a line to\n\
167 standard output. The default join field is the first, delimited\n\
168 by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\
170 -a SIDE print unpairable lines coming from file SIDE\n\
171 -e EMPTY replace missing input fields with EMPTY\n\
172 -j FIELD (Obsolescent) equivalent to `-1 FIELD -2 FIELD'\n\
173 -j1 FIELD (Obsolescent) equivalent to `-1 FIELD'\n\
174 -j2 FIELD (Obsolescent) equivalent to `-2 FIELD'\n\
175 -1 FIELD join on this FIELD of file 1\n\
176 -2 FIELD join on this FIELD of file 2\n\
177 -o FORMAT obey FORMAT while constructing output line\n\
178 -t CHAR use CHAR as input and output field separator\n\
179 -v SIDE like -a SIDE, but suppress joined output lines\n\
180 --help display this help and exit\n\
181 --version output version information and exit\n\
183 Unless -t CHAR is given, leading blanks separate fields and are ignored,\n\
184 else fields are separated by CHAR. Any FIELD is a field number counted\n\
185 from 1. FORMAT is one or more comma or blank separated specifications,\n\
186 each being `SIDE.FIELD' or `0'. Default FORMAT outputs the join field,\n\
187 the remaining fields from FILE1, the remaining fields from FILE2, all\n\
188 separated by CHAR.\n\
195 ADD_FIELD (struct line *line, const char *field, size_t len)
197 if (line->nfields >= line->nfields_allocated)
199 line->nfields_allocated = (3 * line->nfields_allocated) / 2 + 1;
200 line->fields = (struct field *) xrealloc ((char *) line->fields,
201 (line->nfields_allocated
202 * sizeof (struct field)));
204 line->fields[line->nfields].beg = field;
205 line->fields[line->nfields].len = len;
209 /* Fill in the `fields' structure in LINE. */
212 xfields (struct line *line)
215 register char *ptr, *lim;
220 for (i = 0; ptr < lim; ++i)
227 while (ptr < lim && *ptr != tab)
229 ADD_FIELD (line, beg, ptr - beg);
238 while (ptr < lim && !ISSPACE (*ptr))
240 ADD_FIELD (line, beg, ptr - beg);
241 while (ptr < lim && ISSPACE (*ptr))
246 if (ptr > line->beg && ((tab && ISSPACE (ptr[-1])) || ptr[-1] == tab))
248 /* Add one more (empty) field because the last character of the
249 line was a delimiter. */
250 ADD_FIELD (line, NULL, 0);
254 /* Read a line from FP into LINE and split it into fields.
255 Return 0 if EOF, 1 otherwise. */
258 get_line (FILE *fp, struct line *line)
260 static int linesize = 80;
267 ptr = xmalloc (linesize);
269 for (i = 0; (c = getc (fp)) != EOF && c != '\n'; ++i)
274 ptr = xrealloc (ptr, linesize);
279 if (c == EOF && i == 0)
286 line->lim = line->beg + i;
287 line->nfields_allocated = 0;
295 freeline (struct line *line)
297 free ((char *) line->fields);
303 initseq (struct seq *seq)
307 seq->lines = (struct line *) xmalloc (seq->alloc * sizeof (struct line));
310 /* Read a line from FP and add it to SEQ. Return 0 if EOF, 1 otherwise. */
313 getseq (FILE *fp, struct seq *seq)
315 if (seq->count == seq->alloc)
318 seq->lines = (struct line *)
319 xrealloc ((char *) seq->lines, seq->alloc * sizeof (struct line));
322 if (get_line (fp, &seq->lines[seq->count]))
331 delseq (struct seq *seq)
334 for (i = 0; i < seq->count; i++)
335 if (seq->lines[i].beg)
336 freeline (&seq->lines[i]);
337 free ((char *) seq->lines);
340 /* Return <0 if the join field in LINE1 compares less than the one in LINE2;
341 >0 if it compares greater; 0 if it compares equal. */
344 keycmp (struct line *line1, struct line *line2)
346 const char *beg1, *beg2; /* Start of field to compare in each file. */
347 int len1, len2; /* Length of fields to compare. */
350 if (join_field_1 < line1->nfields)
352 beg1 = line1->fields[join_field_1].beg;
353 len1 = line1->fields[join_field_1].len;
361 if (join_field_2 < line2->nfields)
363 beg2 = line2->fields[join_field_2].beg;
364 len2 = line2->fields[join_field_2].len;
373 return len2 == 0 ? 0 : -1;
376 diff = memcmp (beg1, beg2, min (len1, len2));
382 /* Print field N of LINE if it exists and is nonempty, otherwise
383 print `empty_filler' if it is non-NULL. */
386 prfield (int n, struct line *line)
390 if (n < line->nfields)
392 len = line->fields[n].len;
394 fwrite (line->fields[n].beg, 1, len, stdout);
395 else if (empty_filler)
396 fputs (empty_filler, stdout);
398 else if (empty_filler)
399 fputs (empty_filler, stdout);
402 /* Print the join of LINE1 and LINE2. */
405 prjoin (struct line *line1, struct line *line2)
407 const struct outlist *outlist;
409 outlist = outlist_head.next;
412 const struct outlist *o;
422 if (line1 == &uni_blank)
425 field = join_field_2;
430 field = join_field_1;
435 line = (o->file == 1 ? line1 : line2);
438 prfield (field, line);
442 putchar (tab ? tab : ' ');
450 if (line1 == &uni_blank)
457 prfield (join_field_1, line1);
458 for (i = 0; i < join_field_1 && i < line1->nfields; ++i)
460 putchar (tab ? tab : ' ');
463 for (i = join_field_1 + 1; i < line1->nfields; ++i)
465 putchar (tab ? tab : ' ');
469 for (i = 0; i < join_field_2 && i < line2->nfields; ++i)
471 putchar (tab ? tab : ' ');
474 for (i = join_field_2 + 1; i < line2->nfields; ++i)
476 putchar (tab ? tab : ' ');
483 /* Print the join of the files in FP1 and FP2. */
486 join (FILE *fp1, FILE *fp2)
488 struct seq seq1, seq2;
490 int diff, i, j, eof1, eof2;
492 /* Read the first line of each file. */
498 while (seq1.count && seq2.count)
500 diff = keycmp (&seq1.lines[0], &seq2.lines[0]);
503 if (print_unpairables_1)
504 prjoin (&seq1.lines[0], &uni_blank);
505 freeline (&seq1.lines[0]);
512 if (print_unpairables_2)
513 prjoin (&uni_blank, &seq2.lines[0]);
514 freeline (&seq2.lines[0]);
520 /* Keep reading lines from file1 as long as they continue to
521 match the current line from file2. */
524 if (!getseq (fp1, &seq1))
530 while (!keycmp (&seq1.lines[seq1.count - 1], &seq2.lines[0]));
532 /* Keep reading lines from file2 as long as they continue to
533 match the current line from file1. */
536 if (!getseq (fp2, &seq2))
542 while (!keycmp (&seq1.lines[0], &seq2.lines[seq2.count - 1]));
546 for (i = 0; i < seq1.count - 1; ++i)
547 for (j = 0; j < seq2.count - 1; ++j)
548 prjoin (&seq1.lines[i], &seq2.lines[j]);
551 for (i = 0; i < seq1.count - 1; ++i)
552 freeline (&seq1.lines[i]);
555 seq1.lines[0] = seq1.lines[seq1.count - 1];
561 for (i = 0; i < seq2.count - 1; ++i)
562 freeline (&seq2.lines[i]);
565 seq2.lines[0] = seq2.lines[seq2.count - 1];
572 if (print_unpairables_1 && seq1.count)
574 prjoin (&seq1.lines[0], &uni_blank);
575 freeline (&seq1.lines[0]);
576 while (get_line (fp1, &line))
578 prjoin (&line, &uni_blank);
583 if (print_unpairables_2 && seq2.count)
585 prjoin (&uni_blank, &seq2.lines[0]);
586 freeline (&seq2.lines[0]);
587 while (get_line (fp2, &line))
589 prjoin (&uni_blank, &line);
598 /* Add a field spec for field FIELD of file FILE to `outlist'. */
601 add_field (int file, int field)
605 assert (file == 0 || file == 1 || file == 2);
608 o = (struct outlist *) xmalloc (sizeof (struct outlist));
613 /* Add to the end of the list so the fields are in the right order. */
614 outlist_end->next = o;
618 /* Convert a single field specifier string, S, to a *FILE_INDEX, *FIELD_INDEX
619 pair. In S, the field index string is 1-based; *FIELD_INDEX is zero-based.
620 If S is valid, return zero. Otherwise, give a diagnostic, don't update
621 *FILE_INDEX or *FIELD_INDEX, and return non-zero. */
624 decode_field_spec (const char *s, int *file_index, int *field_index)
628 /* The first character must be 0, 1, or 2. */
635 /* Leave *field_index undefined. */
640 /* `0' must be all alone -- no `.FIELD'. */
641 error (0, 0, _("invalid field specifier: `%s'"), s);
647 if (s[1] == '.' && s[2] != '\0')
652 s_err = xstrtol (s + 2, NULL, 10, &tmp_long, NULL);
653 if (s_err != LONGINT_OK || tmp_long <= 0 || tmp_long > INT_MAX)
655 error (0, 0, _("invalid field number: `%s'"), s + 2);
659 *file_index = s[0] - '0';
660 /* Convert to a zero-based index. */
661 *field_index = (int) tmp_long - 1;
668 error (0, 0, _("invalid file number in field spec: `%s'"), s);
674 /* Add the comma or blank separated field spec(s) in C_STR to `outlist'.
675 Return non-zero to indicate failure. */
678 add_field_list (const char *c_str)
682 /* Make a writable copy of c_str. */
683 str = (char *) alloca (strlen (c_str) + 1);
690 int file_index, field_index;
693 p = strpbrk (p, ", \t");
696 invalid = decode_field_spec (spec_item, &file_index, &field_index);
699 add_field (file_index, field_index);
700 uni_blank.nfields = max (uni_blank.nfields, field_index);
706 /* Create a blank line with COUNT fields separated by tabs. */
709 make_blank (struct line *blank, int count)
712 blank->nfields = count;
713 blank->beg = xmalloc (blank->nfields + 1);
714 blank->fields = (struct field *) xmalloc (sizeof (struct field) * count);
715 for (i = 0; i < blank->nfields; i++)
717 blank->beg[i] = '\t';
718 blank->fields[i].beg = &blank->beg[i];
719 blank->fields[i].len = 0;
721 blank->beg[i] = '\0';
722 blank->lim = &blank->beg[i];
726 main (int argc, char **argv)
730 int optc, prev_optc = 0, nfiles;
732 program_name = argv[0];
734 /* Initialize this before parsing options. In parsing options,
735 it may be increased. */
736 uni_blank.nfields = 1;
738 parse_long_options (argc, argv, "join", version_string, usage);
743 while ((optc = getopt_long_only (argc, argv, "-a:e:1:2:o:t:v:", longopts,
758 if (xstrtol (optarg, NULL, 10, &val, NULL) != LONGINT_OK
759 || (val != 1 && val != 2))
760 error (2, 0, _("invalid field number: `%s'"), optarg);
762 print_unpairables_1 = 1;
764 print_unpairables_2 = 1;
768 empty_filler = optarg;
772 if (xstrtol (optarg, NULL, 10, &val, NULL) != LONGINT_OK
773 || val <= 0 || val > INT_MAX)
775 error (2, 0, _("invalid field number for file 1: `%s'"), optarg);
777 join_field_1 = (int) val - 1;
781 if (xstrtol (optarg, NULL, 10, &val, NULL) != LONGINT_OK
782 || val <= 0 || val > INT_MAX)
783 error (2, 0, _("invalid field number for file 2: `%s'"), optarg);
784 join_field_2 = (int) val - 1;
788 if (xstrtol (optarg, NULL, 10, &val, NULL) != LONGINT_OK
789 || val <= 0 || val > INT_MAX)
790 error (2, 0, _("invalid field number: `%s'"), optarg);
791 join_field_1 = join_field_2 = (int) val - 1;
795 if (add_field_list (optarg))
803 case 1: /* Non-option argument. */
804 if (prev_optc == 'o' && optind <= argc - 2)
806 if (add_field_list (optarg))
809 /* Might be continuation of args to -o. */
810 continue; /* Don't change `prev_optc'. */
815 error (0, 0, "too many non-option arguments");
818 names[nfiles++] = optarg;
827 /* Now that we've seen the options, we can construct the blank line
829 make_blank (&uni_blank, uni_blank.nfields);
833 error (0, 0, "too few non-option arguments");
837 fp1 = strcmp (names[0], "-") ? fopen (names[0], "r") : stdin;
839 error (1, errno, "%s", names[0]);
840 fp2 = strcmp (names[1], "-") ? fopen (names[1], "r") : stdin;
842 error (1, errno, "%s", names[1]);
844 error (1, errno, _("both files cannot be standard input"));
847 if (fp1 != stdin && fclose (fp1) == EOF)
848 error (1, errno, "%s", names[0]);
849 if (fp2 != stdin && fclose (fp2) == EOF)
850 error (1, errno, "%s", names[1]);
851 if ((fp1 == stdin || fp2 == stdin) && fclose (stdin) == EOF)
852 error (1, errno, "-");
853 if (ferror (stdout) || fclose (stdout) == EOF)
854 error (1, errno, _("write error"));